├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── check_reference_index.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── report.py │ ├── report_utils │ ├── read_data.py │ └── sections.py │ ├── tests │ ├── __init__.py │ └── test_test.py │ ├── util.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data ├── .gitkeep └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── main.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows └── process_references.nf └── test_data ├── .gitkeep ├── bam └── test.bam ├── counts └── ERCC_mix1.csv ├── fastq ├── barcode01 │ └── reads.fq └── barcode02 │ └── reads.fq ├── other_references ├── case01 │ └── reference.fasta └── case02 │ └── reference.fasta ├── references ├── ERCC.fasta ├── SIRV_isoforms_multi-fasta_170612a.fasta └── combined_references.mmi └── ubam └── test.ubam /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | test_data 3 | bin 4 | CHANGELOG.md 5 | data 6 | lib 7 | LICENSE 8 | main.nf 9 | nextflow.config 10 | README.md 11 | test_data 12 | # we typically run tests with outputs to these: 13 | output 14 | work -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 
47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 
136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Include shared CI 2 | include: 3 | - project: "epi2melabs/ci-templates" 4 | file: "wf-containers.yaml" 5 | 6 | variables: 7 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references \ 8 | --counts test_data/counts/ERCC_mix1.csv" 9 | CI_FLAVOUR: "new" 10 | 11 | docker-run: 12 | tags: [] 13 | 14 | # Define a 1D job matrix to inject a variable named MATRIX_NAME into 15 | # the CI environment, we can use the value of MATRIX_NAME to determine 16 | # which options to apply as part of the rules block below 17 | # NOTE There is a slightly cleaner way to define this matrix to include 18 | # the variables, but it is broken when using long strings! See CW-756 19 | parallel: 20 | matrix: 21 | - MATRIX_NAME: [ 22 | "fastq", "bam", "ubam", "compress-one-ref", "compress-all-refs", "mmi", 23 | "numeric_chrom_id_ref-01", "numeric_chrom_id_ref-02", 24 | "igv", 25 | ] 26 | rules: 27 | # NOTE As we're overriding the rules block for the included docker-run 28 | # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run 29 | # being incorrectly scheduled for "detached merge request pipelines" etc. 30 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 31 | when: never 32 | - if: $MATRIX_NAME == "fastq" 33 | variables: 34 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references \ 35 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB" 36 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 37 | - if: $MATRIX_NAME == "bam" 38 | variables: 39 | NF_WORKFLOW_OPTS: "--bam test_data/bam --references test_data/references \ 40 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB" 41 | NF_IGNORE_PROCESSES: alignReads,checkReferences 42 | - if: $MATRIX_NAME == "ubam" 43 | variables: 44 | NF_WORKFLOW_OPTS: "--bam test_data/ubam --references test_data/references \ 45 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB" 46 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 47 | # run on FASTQ again after compressing one / all references 48 | - if: $MATRIX_NAME == "compress-one-ref" 49 | variables: 50 | NF_BEFORE_SCRIPT: | 51 | mkdir -p $CI_PROJECT_NAME && 52 | cp -r test_data/references $CI_PROJECT_NAME/refs && 53 | gzip $CI_PROJECT_NAME/refs/ERCC.fasta 54 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references $CI_PROJECT_NAME/refs \ 55 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB" 56 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 57 | - if: $MATRIX_NAME == "compress-all-refs" 58 | variables: 59 | NF_BEFORE_SCRIPT: | 60 | mkdir -p $CI_PROJECT_NAME && 61 | cp -r test_data/references $CI_PROJECT_NAME/refs && 62 | gzip $CI_PROJECT_NAME/refs/* 63 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references $CI_PROJECT_NAME/refs \ 64 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB" 65 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 66 | - if: $MATRIX_NAME == "mmi" 67 | variables: 68 | NF_WORKFLOW_OPTS: "--fastq 
test_data/fastq --references test_data/references \ 69 | --reference_mmi_file test_data/references/combined_references.mmi -executor.\\$$local.memory 12GB" 70 | NF_IGNORE_PROCESSES: makeMMIndex,renameBamFiles 71 | - if: $MATRIX_NAME == "igv" 72 | variables: 73 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references \ 74 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB \ 75 | --igv" 76 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 77 | - if: $MATRIX_NAME == "numeric_chrom_id_ref-01" 78 | variables: 79 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/other_references/case01 \ 80 | -executor.\\$$local.memory 12GB" 81 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 82 | - if: $MATRIX_NAME == "numeric_chrom_id_ref-02" 83 | variables: 84 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/other_references/case02 \ 85 | -executor.\\$$local.memory 12GB" 86 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 87 | - if: $MATRIX_NAME == "igv" 88 | variables: 89 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references \ 90 | --counts test_data/counts/ERCC_mix1.csv -executor.\\$$local.memory 12GB \ 91 | --igv" 92 | NF_IGNORE_PROCESSES: checkReferences,renameBamFiles 93 | 94 | aws-run: 95 | parallel: 96 | matrix: 97 | - MATRIX_NAME: [ "counts", "no-counts" ] 98 | rules: 99 | # NOTE As we're overriding the rules block for the included docker-run 100 | # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run 101 | # being incorrectly scheduled for "detached merge request pipelines" etc. 102 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 103 | when: never 104 | - if: $MATRIX_NAME == "counts" 105 | variables: 106 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references \ 107 | --counts test_data/counts/ERCC_mix1.csv" 108 | NF_IGNORE_PROCESSES: sortInputBam 109 | - if: $MATRIX_NAME == "no-counts" 110 | variables: 111 | NF_WORKFLOW_OPTS: "--fastq test_data/fastq --references test_data/references" 112 | NF_IGNORE_PROCESSES: sortInputBam 113 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.58 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | args: [ 28 | "bin", 29 | "--import-order-style=google", 30 | "--statistics", 31 | "--max-line-length=88", 32 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 33 | ] 34 | -------------------------------------------------------------------------------- /CHANGELOG.md: 
-------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | 8 | ## [v1.2.3] 9 | ### Changed 10 | - Updated container image to move conda environment out of `/home`. This fixes an issue some users experienced when running the workflow with singularity: nextflow would report not being able to find programs. 11 | - Updated to wf-template v5.6.0, changing: 12 | - Reduce verbosity of debug logging from fastcat which can occasionally occlude errors found in FASTQ files during ingress. 13 | - Log banner art to say "EPI2ME" instead of "EPI2ME Labs" to match current branding. This has no effect on the workflow outputs. 14 | - Unexpected workflow parameters now cause the workflow to fail. 15 | ### Fixed 16 | - Updated to wf-template v5.6.0, fixing: 17 | - dacite.exceptions.WrongTypeError during report generation when barcode is null. 18 | - Sequence summary read length N50 incorrectly displayed minimum read length, it now correctly shows the N50. 19 | - Sequence summary component alignment and coverage plots failed to plot under some conditions. 20 | 21 | ## [v1.2.2] 22 | ### Changed 23 | - Reconcile template with v5.3.0 and v5.3.1 24 | - IGV output files are only output if `--igv` is used 25 | ### Fixed 26 | - Error caused by numeric reference chromosome identifiers. 27 | - Combined_refs.mmi is only published to the output directory when `reference_mmi_file` is not supplied. 28 | - Combined reference MMI is now output as `combined_refs.mmi` to match the declared output_definition.json. 29 | 30 | 31 | ## [v1.2.1] 32 | ### Changed 33 | - Update to ezcharts 0.11.2 34 | 35 | ## [v1.2.0] 36 | ### Changed 37 | - Streamlined and simplified the report. 38 | - The per-read stats TSV file is no longer created by default. Instead, several histogram TSV files of read / alignment statistics (read length and mean quality; alignment accuracy and coverage) are output. The original per-read TSV can still be created with the `--per_read_stats` parameter. It is now gzip-compressed. 39 | 40 | ### Added 41 | - IGV config JSON file to the outputs (in order to visualise the alignments and called variants). 42 | 43 | ## [v1.1.2] 44 | ### Changed 45 | - The Summary section of the report now only lists the first 7 sample names and reference files, instead of listing them all. 46 | 47 | ## [v1.1.1] 48 | ### Changed 49 | - Reduced the memory requested by some processes to avoid failing in WSL (since there is slightly less memory available in WSL than specified in `.wslconfig`). 50 | 51 | ## [v1.1.0] 52 | ### Changed 53 | - Some formatting changes to github issue template. 54 | 55 | ### Added 56 | - Produce a MMI index file. 57 | - `--reference_mmi_file` option to use a pre-generated MMI index file as reference. 58 | 59 | ### Removed 60 | - The limit of `20` for the `--threads` parameter. 61 | 62 | ## [v1.0.3] 63 | ### Fixed 64 | - Fix regression in depth plots that concatenated the curves of the different samples, rather than displaying them as a multi-line plot 65 | 66 | ### Changed 67 | - BAM tags from uBAM inputs are now carried over to the resulting BAM files. 68 | 69 | ## [v1.0.2] 70 | ### Fixed 71 | - Regression that caused failing on compressed references. 
72 | - The `alignReads` process requesting too little memory in some cases. 73 | 74 | ### Changed 75 | - Reduced the minimum memory requirement from 16 to 12 GB. 76 | 77 | ## [v1.0.1] 78 | ### Fixed 79 | - The workflow failing due to commas in reference sequence names. 80 | 81 | ### Changed 82 | - How samples, reference files, and reference sequence names are listed in the summary section at the beginning of the report. 83 | 84 | ## [v1.0.0] 85 | ### Added 86 | - Memory requirements for each process. 87 | 88 | ### Changed 89 | - Reworked docs to follow new layout. 90 | 91 | ## [v0.6.3] 92 | ### Fixed 93 | - Mangled depth plots when there are multiple reference sequences. 94 | - Report generation failing when there is only a single read or a small number of reads with near-identical mean quality for a sample or reference file. 95 | 96 | ## [v0.6.2] 97 | ### Fixed 98 | - Report generation failing when a sample name begins with the name of another sample (e.g. 'sample_A' and 'sample_A_2'). 99 | 100 | ### Removed 101 | - Default local executor CPU and RAM limits. 102 | 103 | ### Changed 104 | - Names of barcoded directories in the sample sheet now need to be of format `barcodeXY`. 105 | 106 | ## [v0.6.1] 107 | ### Fixed 108 | - Workflow failing when using a large number of reference sequences. 109 | 110 | ## [v0.6.0] 111 | ### Removed 112 | - `--ubam` option. `--bam` can now be used for both BAM and uBAM files. The workflow will determine if files are aligned or not (and align them against the provided reference in that case). 113 | 114 | ## [v0.5.3] 115 | ### Fixed 116 | - Read length histogram only displaying a small number of bins when there are a few outlier reads a lot longer than the other reads. 117 | - configure-jbrowse breaking on unescaped spaces 118 | 119 | ### Changed 120 | - x-axis limits for accuracy, mean read quality, and read alignment coverage histograms to be more dynamic. 121 | ### Fixed 122 | - Workflow will no longer crash when running with `--bam` on an input directory containing more than one `.bam` file. 123 | 124 | ## [v0.5.2] 125 | ### Changed 126 | - Removed no longer used `--concat_fastq` parameter. 127 | 128 | ## [v0.5.1] 129 | ### Changed 130 | - Updated GitHub issue templates to force capture of more information. 131 | - Example command to use demo data. 132 | 133 | ### Fixed 134 | - Tooltips in depth plots not showing. 135 | 136 | ## [v0.5.0] 137 | ### Changed 138 | - Bumped minimum required Nextflow version to 22.10.8. 139 | - Enum choices are enumerated in the `--help` output. 140 | - Enum choices are enumerated as part of the error message when a user has selected an invalid choice. 141 | 142 | ## [v0.4.1] 143 | ### Fixed 144 | - Workflow aborting on `fastcat_or_mv` process. 145 | 146 | ## [v0.4.0] 147 | ### Changed 148 | - Replaced `--threads` option with `--mapping_threads` and `--sorting_threads`, which control the number of threads used during the alignment process. 149 | - `--mapping_threads` controls the number of threads used by `minimap2`. 150 | - `--sorting_threads` controls the number of threads used to sort the aligned reads. 151 | - The total number of threads used by the alignment process is the sum of the two values. 152 | - Other processes use a hard-coded number of threads ranging between 1 and 3. 153 | 154 | ### Added 155 | - Parameters `--minimap_args` and `--minimap_preset` to expose additional `minimap2` options to the user. 
156 | - For RNA data sets, `--minimap_preset` can be set to `'rna'` to automatically configure the workflow accordingly (`'dna'` is the default preset). 157 | - Advanced users can provide `--minimap_args` to pass additional overriding arguments to `minimap2` 158 | 159 | ## [v0.3.6] 160 | ### Added 161 | - Configuration for running demo data in AWS 162 | 163 | ## [v0.3.5] 164 | ### Fixed 165 | - Bug crashing the report when running on AWS without a `--counts` file. 166 | 167 | ## [v0.3.4] 168 | ### Changed 169 | - Now uses ONT Public License. 170 | - Report now uses dropdown menus instead of tabs. 171 | 172 | ### Fixed 173 | - Missing `seqkit` in `getVersions`process. 174 | 175 | ## [v0.3.3] 176 | ### Removed 177 | - `-y` flag from `minimap2` command 178 | 179 | ## [v0.3.2] 180 | ### Changed 181 | - format to 'directory-path' for parameters fastq, bam, ubam, references 182 | 183 | ## [v0.3.1] 184 | ### Fixed 185 | - missing header for 'Useful links' in docs 186 | - description about references in schema (now only mentions an input directory) 187 | 188 | ## [v0.3.0] 189 | ### Changed 190 | - uses bamstats instead of mapula 191 | - uses ezcharts for report 192 | 193 | ### Removed 194 | - legacy option 'demultiplex' 195 | 196 | ## [v0.2.4] 197 | ### Fixed 198 | - sample_sheet format in schema to expect a file 199 | 200 | ## [v0.2.3] 201 | ### Changed 202 | - Updated description in manifest 203 | 204 | ## [v0.2.2] 205 | ### Changed 206 | - Harmonized line plot colours in report. 207 | - Expanded explanation for coverage plots. 208 | 209 | ## [v0.2.1] 210 | ### Changed 211 | - Changed plot layout and margins to avoid overflowing plots 212 | 213 | ## [v0.2.0] 214 | ### Added 215 | - Workflow will now output a JBrowse2 `jbrowse.json` configuration 216 | 217 | ### Changed 218 | - Output combined reference file to `out_dir` 219 | - `-profile conda` is no longer supported, users should use `-profile standard` (Docker) or `-profile singularity` instead 220 | - Removed option for specifying report suffix 221 | - Restructured workflow parameter schema 222 | 223 | ## [v0.1.9] 224 | ### Added 225 | - Input params and handling for bam and ubam formats 226 | 227 | ### Updated 228 | - Bumped base container to v0.2.0 229 | 230 | ## [v0.1.8] 231 | ### Changed 232 | - Fastqingress metadata map 233 | 234 | ### Fixed 235 | - Set out_dir option type to ensure output is written to correct directory on Windows. 236 | 237 | ### Added 238 | - Argument Parser for fastqingress. 239 | 240 | ## [v0.1.7] 241 | ### Fixed 242 | - Coloring with less than 3 samples 243 | 244 | ## [v0.1.6] 245 | ### Fixed 246 | - run id and barcode output correctly 247 | 248 | ## [v0.1.5] 249 | ### Added 250 | - concat_fastq boolean parameter 251 | 252 | ### Changed 253 | - Better help text on cli 254 | 255 | ## [v0.1.4] 256 | ### Fixed 257 | - Mosdepth 0 step 258 | 259 | ### Added 260 | - Depth coverage steps parameters 261 | 262 | ## [v0.1.3] 263 | ### Fixed 264 | - Cumulative coverage plotting incorrect numbers 265 | 266 | ## [v0.1.2] 267 | ### Added 268 | - Cumulative coverage plot 269 | 270 | ## [v0.1.1] 271 | ### Changed. 272 | - reference can be either a directory or single file. 273 | - output one merged CSV vs one for each barcode. 274 | - speed up a few steps including mosdepth and report creation. 275 | 276 | ## [v0.1.0] 277 | ### Fixed. 278 | - run_id in mapula output json. 279 | - Only accept certain format files as references. 280 | - reduce storage required for workspace. 281 | 282 | ### Added. 283 | - Handling for no alignments. 
284 | - Integration with EPI2ME Labs notebook environment. 285 | 286 | ## [v0.0.9] 287 | ### Added 288 | - Error message if no references in directory provided. 289 | - Singularity profile. 290 | - Ping telemetry file. 291 | 292 | ### Fixed 293 | - Calculate depth coverage graph steps based on length of reference. 294 | 295 | ### Changed 296 | - Sample name to sample id 297 | 298 | ## [v0.0.8] 299 | ### Added 300 | - Option to add suffix to HTML report name. 301 | - Unmapped QC statistics 302 | - Depth coverage graph per reference 303 | 304 | ### Changed 305 | - Help message now uses JSON schema 306 | - Updated fastqingress 307 | 308 | ## [v0.0.7] 309 | ### Fixed 310 | - Correct conda profile environment file path 311 | 312 | ## [v0.0.6] 313 | ### Fixed 314 | - Remove erroneous --prefix messages 315 | - Increase default batch_size to 1000 316 | - Increase default max local executor cpus to 8 317 | 318 | ## [v0.0.5] 319 | ### Changed 320 | - Retag of v0.0.4, updated sample reports 321 | 322 | ## [v0.0.4] 323 | ### Changed 324 | - Make prefix optional 325 | 326 | ## [v0.0.3] 327 | ### Added 328 | - Barcode awarenesss support with --demultiplex flag (requires guppy_barcoder to be installed) 329 | - Output naming via new required --prefix argument 330 | 331 | ## [v0.0.2] 332 | ### Changed 333 | - Standardised report name. 334 | - Make docker executor default. 335 | 336 | ## [v0.0.1] 337 | * Initial release 338 | 339 | ### Added 340 | - Basic running of alignment workflow and reporting 341 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. 
"Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. 
Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. 
Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. 
Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 
309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alignment workflow 2 | 3 | Align nanopore sequencing data and visualize mapping statistics. 4 | 5 | 6 | 7 | ## Introduction 8 | 9 | This workflow provides an easy way to align Oxford Nanopore reads and gather mapping 10 | stats either locally for small amounts of data or at scale in a distributed 11 | environment such as a cluster or the cloud. 12 | 13 | > This workflow contains minimal functionality that is duplicated in many of our more specialised workflows. 14 | > Please consider using one of these alternative workflows before using this one: you very likely do not need 15 | > to use this workflow. 16 | 17 | In brief, it will perform the following: 18 | * Combine all reference files in the directory passed to `--references`. 19 | * Align input reads (passed as FASTQ or unaligned BAM files) against the reference (Note that BAM files with aligned reads can be used as well; these will skip the alignment step and only stats and the report will be produced). 20 | * Create alignment stats. 21 | * Calculate depth of coverage along the reference sequences (this step can be skipped if requested). 22 | * Create an HTML report to illustrate the results. 23 | 24 | 25 | 26 | 27 | ## Compute requirements 28 | 29 | Recommended requirements: 30 | 31 | + CPUs = 12 32 | + Memory = 32GB 33 | 34 | Minimum requirements: 35 | 36 | + CPUs = 6 37 | + Memory = 12GB 38 | 39 | Approximate run time: 0.5-5 minutes per sample (depending on number of reads, length of reference, and available compute). 40 | 41 | ARM processor support: True 42 | 43 | 44 | 45 | 46 | ## Install and run 47 | 48 | 49 | These are instructions to install and run the workflow on command line. 50 | You can also access the workflow via the 51 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 52 | 53 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 54 | compute and software resources, 55 | therefore Nextflow will need to be 56 | installed before attempting to run the workflow. 57 | 58 | The workflow can currently be run using either 59 | [Docker](https://docs.docker.com/get-started/) 60 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 61 | to provide isolation of the required software. 62 | Both methods are automated out-of-the-box provided 63 | either Docker or Singularity is installed. 64 | This is controlled by the 65 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 66 | parameter as exemplified below. 67 | 68 | It is not required to clone or download the git repository 69 | in order to run the workflow. 70 | More information on running EPI2ME workflows can 71 | be found on our [website](https://labs.epi2me.io/wfindex). 
72 | 73 | The following command can be used to obtain the workflow. 74 | This will pull the repository in to the assets folder of 75 | Nextflow and provide a list of all parameters 76 | available for the workflow as well as an example command: 77 | 78 | ``` 79 | nextflow run epi2me-labs/wf-alignment --help 80 | ``` 81 | To update a workflow to the latest version on the command line use 82 | the following command: 83 | ``` 84 | nextflow pull epi2me-labs/wf-alignment 85 | ``` 86 | 87 | A demo dataset is provided for testing of the workflow. 88 | It can be downloaded and unpacked using the following commands: 89 | ``` 90 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-alignment/wf-alignment-demo.tar.gz 91 | tar -xzvf wf-alignment-demo.tar.gz 92 | ``` 93 | The workflow can then be run with the downloaded demo data using: 94 | ``` 95 | nextflow run epi2me-labs/wf-alignment \ 96 | --fastq 'wf-alignment-demo/fastq' \ 97 | --references 'wf-alignment-demo/references' \ 98 | -profile standard 99 | ``` 100 | 101 | For further information about running a workflow on 102 | the command line see https://labs.epi2me.io/wfquickstart/ 103 | 104 | 105 | 106 | 107 | ## Related protocols 108 | 109 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 110 | 111 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 112 | 113 | 114 | 115 | 116 | ## Input example 117 | 118 | 119 | This workflow accepts either FASTQ or BAM files as input. 120 | 121 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 122 | 123 | ``` 124 | (i) (ii) (iii) 125 | input_reads.fastq ─── input_directory ─── input_directory 126 | ├── reads0.fastq ├── barcode01 127 | └── reads1.fastq │ ├── reads0.fastq 128 | │ └── reads1.fastq 129 | ├── barcode02 130 | │ ├── reads0.fastq 131 | │ ├── reads1.fastq 132 | │ └── reads2.fastq 133 | └── barcode03 134 | └── reads0.fastq 135 | ``` 136 | 137 | 138 | 139 | ## Input parameters 140 | 141 | ### Input Options 142 | 143 | | Nextflow parameter name | Type | Description | Help | Default | 144 | |--------------------------|------|-------------|------|---------| 145 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 146 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. 
| This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 147 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 148 | | references | string | Path to a directory containing FASTA reference files. | Accepted file extensions are '.fasta', '.fna', '.ffn', '.faa', '.frn', '.fa', '.txt', '.fa.gz', '.fna.gz', '.frn.gz', '.ffn.gz', '.fasta.gz'. In addition, an MMI index file can be provided to make the workflow run faster using the option `--reference_mmi_file`. | | 149 | | reference_mmi_file | string | Path to an MMI index file to be used as reference. | Accepted file extension is '.mmi'. The references parameter is still required if this is provided. Note that some minimap2 alignment options are set by the reference MMI and cannot be overridden. | | 150 | | counts | string | Path to a CSV file containing expected counts as a control. | The expected counts CSV file must contain columns named 'reference' and 'expected_counts' in order to be valid. the 'reference' column should contain names matching the names of reference sequences within the fasta files provided using --references. | | 151 | 152 | 153 | ### Sample Options 154 | 155 | | Nextflow parameter name | Type | Description | Help | Default | 156 | |--------------------------|------|-------------|------|---------| 157 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | 158 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 159 | 160 | 161 | ### Output Options 162 | 163 | | Nextflow parameter name | Type | Description | Help | Default | 164 | |--------------------------|------|-------------|------|---------| 165 | | out_dir | string | Directory for output of all workflow results. | | output | 166 | | prefix | string | Optional prefix attached to each of the output filenames. | Output filename format will be `-filename.ext`. | | 167 | | per_read_stats | boolean | Generate Bamstats per-read stats. | With this option, the workflow will produce detailed per-read alignment stats emitted as gzipped TSV file. As these files can get quite large, it is recommended to only request them when necessary. | False | 168 | | igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. 
| | False | 169 | 170 | 171 | ### Advanced options 172 | 173 | | Nextflow parameter name | Type | Description | Help | Default | 174 | |--------------------------|------|-------------|------|---------| 175 | | depth_coverage | boolean | Calculate depth coverage statistics and include them in the report. | This step can be a computational bottleneck. Set this to false if your reference sequences are >50mb to speed things up. | True | 176 | | minimap_preset | string | Pre-defined parameter sets for `minimap2`, covering most common use cases. | Available parameter sets are: 'dna' (`-ax map-ont`), 'rna' (`-ax splice -uf`). | dna | 177 | | minimap_args | string | String of command line arguments to be passed on to `minimap2`. | This overrides the options defined by `--minimap_preset` and allows for running the alignment step in a more customized way. | | 178 | 179 | 180 | ### Miscellaneous Options 181 | 182 | | Nextflow parameter name | Type | Description | Help | Default | 183 | |--------------------------|------|-------------|------|---------| 184 | | threads | integer | Number of CPU threads to use for the alignment step. | The alignment process will run with this many threads (note that the memory used by minimap2 scales with the number of threads). The total CPU resources used by the workflow are constrained by the Nextflow executor configuration. | 4 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | ## Outputs 192 | 193 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 194 | 195 | | Title | File path | Description | Per sample or aggregated | 196 | |-------|-----------|-------------|--------------------------| 197 | | workflow report | wf-alignment-report.html | Report for all samples | aggregated | 198 | | Combined references | combined_refs.fasta | FASTA file containing all input references. | aggregated | 199 | | Combined references index | combined_refs.fasta.fai | Index file for combined references FASTA. | aggregated | 200 | | Combined references MMI index | combined_refs.mmi | Minimap2 index file for combined references FASTA. | aggregated | 201 | | Per-read alignment stats | {{ alias }}.readstats.tsv.gz | Bamstats per-read output TSV file (compressed with gzip). | per-sample | 202 | | Per-reference alignment stats | {{ alias }}.flagstat.tsv | Bamstats flagstat output TSV file. | per-sample | 203 | | Alignment accuracy histogram | {{ alias }}-histograms/accuracy.hist | Bamstats alignment accuracy histogram TSV file. | per-sample | 204 | | Alignment coverage histogram | {{ alias }}-histograms/coverage.hist | Bamstats alignment coverage histogram TSV file. | per-sample | 205 | | Read length histogram (mapped) | {{ alias }}-histograms/length.hist | Bamstats read length histogram TSV file (for mapped reads). | per-sample | 206 | | Read length histogram (unmapped) | {{ alias }}-histograms/length.unmap.hist | Bamstats read length histogram TSV file (for unmapped reads). | per-sample | 207 | | Read quality histogram (mapped) | {{ alias }}-histograms/quality.hist | Bamstats read quality histogram TSV file (for mapped reads). | per-sample | 208 | | Read quality histogram (unmapped) | {{ alias }}-histograms/quality.unmap.hist | Bamstats read quality histogram TSV file (for unmapped reads). | per-sample | 209 | | Alignments BAM file | {{ alias }}.sorted.aligned.bam | BAM file with alignments of filtered input reads against the combined references. 
| per-sample | 210 | | Alignments index file | {{ alias }}.sorted.aligned.bam.bai | Index for alignments BAM file. | per-sample | 211 | | IGV config JSON file | igv.json | JSON file with IGV config options to be used by the EPI2ME Desktop Application. | aggregated | 212 | 213 | 214 | 215 | 216 | ## Pipeline overview 217 | 218 | ### 1. Combine reference files 219 | 220 | All reference files in the directory passed to `--references` are concatenated. 221 | 222 | ### 2. Align reads 223 | 224 | Input reads are aligned against the combined reference with [Minimap2](https://github.com/lh3/minimap2). If BAM files are used as input (with `--bam`), only reads from files without reference information in the SAM header are aligned; for the remaining BAM files, this step is skipped. 225 | 226 | ### 3. Create alignment stats 227 | 228 | [Bamstats](https://github.com/epi2me-labs/fastcat#bamstats) is used to create per-read and per-reference alignment stats from the BAM files. 229 | 230 | ### 4. Calculate depth of coverage 231 | 232 | Depth of coverage along the reference sequences is determined with [Mosdepth](https://github.com/brentp/mosdepth) (using 200 windows per reference sequence). To speed up the workflow, this step can be skipped by adding `--depth_coverage false`. 233 | 234 | 235 | 236 | 237 | ## Troubleshooting 238 | 239 | + If the workflow fails, please run it with the demo data set to ensure the workflow itself is working. This will help us determine whether the issue is related to the environment, the input parameters, or a bug. 240 | + Please see [here](https://labs.epi2me.io/trouble-shooting/) for how to resolve some common Nextflow issues and [here](https://labs.epi2me.io/how-to-exits/) for how to interpret command exit codes. 241 | 242 | 243 | 244 | 245 | ## FAQs 246 | 247 | *I cannot select a single reference file in the EPI2ME desktop app.* - When running the workflow via the desktop app, you need to provide a directory with reference files. If you only have a single file, you can create a directory, place your reference file inside it, and select this directory with the reference input option. 248 | 249 | *How are the values in the `acc` column (and other metrics) in the per-read output stats calculated?* - 250 | For details on the per-read stats output files, please refer to the [fastcat/bamstats documentation](https://github.com/epi2me-labs/fastcat#output-format). A brief example of loading these files is sketched below. 251 | 252 | If your question is not answered here, please report any issues or suggestions on the [GitHub issues](https://github.com/epi2me-labs/wf-alignment/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 253 | 254 | 255 | 256 | 257 | ## Related blog posts 258 | 259 | - [How to align your data](https://labs.epi2me.io/how-to-align/) 260 | 261 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts.
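The per-read stats file mentioned in the FAQ above is a plain gzipped TSV, so it can be inspected with standard tools. The snippet below is a minimal sketch using pandas; the file name is an example, and the column names (`name`, `ref`, `acc`) are taken from the bamstats documentation linked above, so check them against your own output before relying on them.

```python
# Minimal sketch: summarise a per-read stats file produced with --per_read_stats.
# The file name and column names here are assumptions based on the bamstats docs.
import pandas as pd

stats = pd.read_csv("sample01.readstats.tsv.gz", sep="\t")  # pandas decompresses .gz automatically
mapped = stats[stats["ref"] != "*"]  # "*" denotes unmapped reads
print(f"{len(mapped)} of {len(stats)} reads mapped")
print(f"median alignment accuracy: {mapped['acc'].median():.2f}")
```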
262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint of pseudo-package for all the code used in the workflow.""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components = dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | except ModuleNotFoundError as e: 44 | # if imports cannot be satisifed, refuse to add the component 45 | # rather than exploding 46 | logger.warn(f"Could not load {name} due to missing module {e.name}") 47 | continue 48 | 49 | # if theres a main() and and argparser() thats good enough for us. 
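        # (getattr() raises AttributeError for modules missing either attribute;
        # the try/except below treats that as "not a component" and skips it.)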
50 | try: 51 | req = "main", "argparser" 52 | if all(callable(getattr(mod, x)) for x in req): 53 | components[name] = mod 54 | except Exception: 55 | pass 56 | return components 57 | 58 | 59 | def cli(): 60 | """Run workflow entry points.""" 61 | logger = get_main_logger(_package_name) 62 | logger.info("Bootstrapping CLI.") 63 | parser = argparse.ArgumentParser( 64 | 'wf-glue', 65 | parents=[_log_level()], 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | 68 | parser.add_argument( 69 | '-v', '--version', action='version', 70 | version='%(prog)s {}'.format(__version__)) 71 | 72 | subparsers = parser.add_subparsers( 73 | title='subcommands', description='valid commands', 74 | help='additional help', dest='command') 75 | subparsers.required = True 76 | 77 | # importing everything can take time, try to shortcut 78 | if len(sys.argv) > 1: 79 | components = get_components(allowed_components=[sys.argv[1]]) 80 | if not sys.argv[1] in components: 81 | logger.warn("Importing all modules, this may take some time.") 82 | components = get_components() 83 | else: 84 | components = get_components() 85 | 86 | # add all module parsers to main CLI 87 | for name, module in components.items(): 88 | p = subparsers.add_parser( 89 | name.split(".")[-1], parents=[module.argparser()]) 90 | p.set_defaults(func=module.main) 91 | 92 | args = parser.parse_args() 93 | 94 | logger.info("Starting entrypoint.") 95 | args.func(args) 96 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_reference_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Parse references from MMI index file.""" 3 | 4 | 5 | import struct 6 | from .util import get_named_logger, wf_parser # noqa: ABS101 7 | 8 | MM_IDX_MAGIC = b"MMI\2" 9 | 10 | 11 | def parse_mmi(mmi_file): 12 | """Get references from the MMI file.""" 13 | # Minimap2 Index Description: .mmi -> binary file. 14 | # Created in: 15 | # https://github.com/lh3/minimap2/blob/fc2e1607d73ea4870e6f396697c79051aff23eed/index.c 16 | # see this Github Issue: https://github.com/lh3/minimap2/issues/820 17 | # It contains 4 parts: 18 | # 1) Magic string: ("MMI\x02", MM_IDX_MAGIC in the source) -> first four bytes. 19 | # 2) Followed by 5 integers of constants: #L273 20 | # mi = mm_idx_init(int w, int k, int bucket_bits, int B, int flag); 21 | # - w-> minimizer width w 22 | # - k-> window length k, 23 | # - bucket_bits (b)-> hardcoded value 14: #L403, 421 24 | # - B-> number of sequences ?? -> L418 25 | # - flags: is_hpc, name... 26 | # 3) REFERENCES IDS: The reference information is stored: #L427C2-L427C2 27 | # - name: the name in ascii 28 | # - offset: sum_len 29 | # - len: the length as integer. 30 | # - is_alt: ALT contigs 31 | # This is repeated for each sequence. 
32 | # 4) SEQUENCES 33 | sequences = dict() 34 | # open file in read binary mode 35 | with open(mmi_file, "rb") as fd: 36 | # type I: convert C values to python 37 | # https://docs.python.org/3/library/struct.html#format-characters 38 | magic = fd.read(4) 39 | if magic != MM_IDX_MAGIC: 40 | raise ValueError(f"{mmi_file} does not appear to be a minimap2 index.") 41 | # w, k, bucket bits, n_seqs, flags 42 | _, _, _, n_seq, _ = struct.unpack("5I", fd.read(20)) 43 | # contents is then (len name, name, len seq) 44 | for _ in range(n_seq): 45 | name_length = struct.unpack("B", fd.read(1))[0] 46 | name = fd.read(name_length).decode("ascii") 47 | length = struct.unpack("I", fd.read(4))[0] 48 | sequences[name] = length 49 | return sequences 50 | 51 | 52 | def get_fai_references(fai_file): 53 | """Get references from the FASTA fai file.""" 54 | with open(fai_file, 'r') as fasta_ids: 55 | # check references 56 | return {i.split('\t')[0] for i in fasta_ids.readlines()} 57 | 58 | 59 | def main(args): 60 | """Run entry point.""" 61 | logger = get_named_logger("check_reference_index") 62 | fasta_names = get_fai_references(args.fasta_fai) 63 | sequences = set(parse_mmi(args.mmi_file).keys()) 64 | if len(sequences) != len(fasta_names): 65 | raise Exception( 66 | f"Number of sequences in the MMI file ({len(sequences)}) doesn't match " 67 | f"the number of sequences provided references ({len(fasta_names)}).") 68 | if not sequences - fasta_names: 69 | logger.info("All the MMI references are in the FASTA reference") 70 | else: 71 | raise Exception( 72 | "The next references found in the MMI file are not in the FASTA file: " 73 | f"{sequences - fasta_names}") 74 | 75 | 76 | def argparser(): 77 | """Argument parser for entrypoint.""" 78 | parser = wf_parser("report") 79 | parser.add_argument( 80 | "--mmi_file", 81 | help="MMI index file" 82 | ) 83 | parser.add_argument( 84 | "--fasta_fai", 85 | help="Fai index file." 
86 | ) 87 | return parser 88 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of scripts for results models.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/common.py: -------------------------------------------------------------------------------- 1 | """Common model classes used across all workflows.""" 2 | from dataclasses import asdict, dataclass, field 3 | from enum import Enum 4 | import json 5 | from pathlib import Path 6 | from typing import Any, Dict, List, Optional 7 | 8 | from ..util import get_named_logger # noqa: ABS101 9 | 10 | logger = get_named_logger("Models") 11 | 12 | 13 | @dataclass 14 | class WorkflowBaseModel: 15 | """Common things for stuff in the model.""" 16 | 17 | def get( 18 | self, 19 | field_name: str, 20 | title: bool = True, 21 | **kwargs 22 | ): 23 | """Get reportable field tuple.""" 24 | field_info = self.__dataclass_fields__.get(field_name) 25 | # provide an empty string default title to minimise drama 26 | field_title = field_info.metadata.get("title", "") 27 | value = self.get_reportable_value(field_name=field_name, **kwargs) 28 | if title: 29 | return (field_title, value) 30 | return value 31 | 32 | def get_reportable_value( 33 | self, 34 | field_name: str, 35 | *, 36 | decimal_places: int = None, 37 | default_value: str = "N/A") -> Optional[str]: 38 | """Get the value of a value and make it reportable.""" 39 | # Get the field info using the field name 40 | field_info = self.__dataclass_fields__.get(field_name) 41 | if field_info is None: 42 | raise AttributeError( 43 | f"{field_name!r} is not a field on {self.__class__.__name__}" 44 | ) 45 | 46 | value = getattr(self, field_name) 47 | 48 | if value is None: 49 | return default_value 50 | 51 | if isinstance(value, (int, float)): 52 | if decimal_places: 53 | value = round(value, decimal_places) 54 | if value < 0.0001 or value > 99999999: 55 | value = f"{value:.2E}" 56 | else: 57 | if decimal_places: 58 | raise TypeError( 59 | "decimal_places is not a supported argument for a non-numeric.") 60 | 61 | unit = field_info.metadata.get('unit') 62 | 63 | if unit: 64 | return f"{value} {unit}" 65 | 66 | return str(value) 67 | 68 | 69 | class SampleType(str, Enum): 70 | """The type of the sample.""" 71 | 72 | no_template_control = "no_template_control" 73 | positive_control = "positive_control" 74 | negative_control = "negative_control" 75 | test_sample = "test_sample" 76 | 77 | def friendly_name(self): 78 | """Convert sample type to string.""" 79 | return self.name.replace("_", " ").capitalize() 80 | 81 | 82 | @dataclass 83 | class SampleIdentifier: 84 | """Additional identifiers for a sample.""" 85 | 86 | name: str = field( 87 | metadata={ 88 | "title": "Identifier name", 89 | "Description": "The name of the sample identifier"}) 90 | value: str = field( 91 | metadata={ 92 | "title": "Identifier value", 93 | "Description": "The value of the sample identifier"}) 94 | 95 | 96 | @dataclass 97 | class CheckResult: 98 | """ 99 | A result of some check the workflow has performed. 100 | 101 | This can be at sample or workflow level. 
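
    A minimal illustrative construction (values are hypothetical):

        CheckResult(
            check_category="sample",
            check_name="reads_detected",
            check_pass=True,
        )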
102 | """ 103 | 104 | check_category: str = field( 105 | metadata={ 106 | "title": "Check category", 107 | "description": "The category of the check"}) 108 | check_name: str = field( 109 | metadata={ 110 | "title": "Check name", 111 | "description": "The name of the check"}) 112 | check_pass: bool = field( 113 | metadata={ 114 | "title": "Check pass", 115 | "description": "If true the check has passed"}) 116 | check_threshold: str | None = field( 117 | default=None, metadata={ 118 | "title": "Check threshold", 119 | "description": "The threshold for the check, useful for reporting later"}) 120 | 121 | categories = {} 122 | 123 | def friendly_check_category(self): 124 | """Convert category to string.""" 125 | if self.check_category not in self.categories: 126 | raise ValueError(f"{self.check_category} has no friendly name") 127 | return self.categories[self.check_category] 128 | 129 | def friendly_check_name(self): 130 | """Convert check name to string.""" 131 | return self.check_name.replace("_", " ").capitalize() 132 | 133 | 134 | @dataclass 135 | class ResultsContents: 136 | """Placeholder class for results contents.""" 137 | 138 | pass 139 | 140 | 141 | @dataclass 142 | class Sample: 143 | """A sample sheet entry and its corresponding checks and related results.""" 144 | 145 | alias: str = field( 146 | metadata={ 147 | "title": "Sample alias", 148 | "description": "The alias for the sample given by the user"}) 149 | sample_type: SampleType = field( 150 | metadata={ 151 | "title": "Sample type", 152 | "description": "The type of the sample"}) 153 | sample_pass: bool = field( 154 | metadata={ 155 | "title": "Sample pass", 156 | "description": "If true the sample has passed workflow checks"}) 157 | barcode: str | None = field( 158 | default=None, 159 | metadata={ 160 | "title": "Sample barcode", 161 | "description": "The physical barcode assigned to the sample"}) 162 | additional_identifiers: List[SampleIdentifier] = field( 163 | default_factory=list, metadata={ 164 | "title": "Additional sample identifiers", 165 | "description": "Additional identifiers for the sample"}) 166 | sample_checks: list[CheckResult] = field( 167 | default_factory=list, metadata={ 168 | "title": "Sample checks", 169 | "description": "An array of checks performed on the sample"}) 170 | results: ResultsContents | None = field( 171 | default=None, metadata={ 172 | "title": "Sample results", 173 | "description": "Further specific workflow results for this sample"}) 174 | config: Dict[str, Any] | None = field( 175 | default=None, metadata={ 176 | "title": "Sample configuration", 177 | "description": """Sample specific config parameters 178 | used for running analysis"""}) 179 | 180 | def __post_init__(self): 181 | """Determine overall status for a sample given the individual check results.""" 182 | self.sample_pass = all( 183 | check.check_pass for check in self.sample_checks) 184 | 185 | def get_sample_identifier(self, sample_identifier): 186 | """Get a sample identifier given the identifier name.""" 187 | for identifier in self.additional_identifiers: 188 | if identifier.name == sample_identifier: 189 | return identifier.value 190 | raise KeyError("Sample identifier not found") 191 | 192 | def set_sample_identifier(self, name, value): 193 | """Set a sample identifier.""" 194 | sample_identifier = SampleIdentifier( 195 | name=name, 196 | value=value) 197 | self.additional_identifiers.append(sample_identifier) 198 | return self.additional_identifiers 199 | 200 | def to_json(self, filename): 201 | """Save class as 
JSON.""" 202 | with open(filename, 'w') as f: 203 | json.dump(asdict(self), f, default=str, indent=2) 204 | 205 | def get_reportable_qc_status(self, max_criteria=4): 206 | """Store global status of the sample and list of QC criteria to show. 207 | 208 | :params max_criteria: Maximum number of criteria to be reported. 209 | """ 210 | # Store global status: pass/ failed 211 | qc_global_status = {"status": self.sample_pass, "scope": "QC status"} 212 | qc_criteria = [] 213 | if self.sample_pass: 214 | qc_criteria.append( 215 | {"status": self.sample_pass, "scope": "All acceptance criteria met"} 216 | ) 217 | else: 218 | # Report failed criteria until a maximum value 219 | for qc in self.sample_checks: 220 | if not qc.check_pass: # append criteria if failed 221 | qc_criteria.append( 222 | { 223 | "status": qc.check_pass, 224 | "category": qc.friendly_check_category(), 225 | "scope": qc.friendly_check_name(), 226 | } 227 | ) 228 | if len(qc_criteria) > max_criteria: 229 | # Replace all the failed criteria, with a sentence with the number 230 | # instead of listing all of them. 231 | # Set status to False as more than max_criteria are failed. 232 | qc_criteria = [ 233 | { 234 | "status": False, 235 | "scope": f"{len(qc_criteria)} acceptance criteria", 236 | }, 237 | ] 238 | return qc_global_status, qc_criteria 239 | 240 | 241 | @dataclass 242 | class RunStats: 243 | """Basic run statistics for the entire run.""" 244 | 245 | total_reads: int | None = field( 246 | default=None, metadata={ 247 | "title": "Total reads", 248 | "description": "Total number of reads on run"}) 249 | total_ambiguous_reads: int | None = field( 250 | default=None, metadata={ 251 | "title": "Total ambiguous reads", 252 | "description": "Number of reads of unknown provenance"}) 253 | total_unaligned_reads: int | None = field( 254 | default=None, metadata={ 255 | "title": "Total unaligned reads", 256 | "description": "Number of unaligned reads"}) 257 | 258 | 259 | @dataclass 260 | class WorkflowResult(WorkflowBaseModel): 261 | """ 262 | Definition for results that will be returned by this workflow. 263 | 264 | This structure will be passed through by Gizmo speaking clients 265 | as WorkflowInstance.results. 
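
    Sketch of intended use (given a list of Sample objects; file names and
    kept parameter keys are illustrative):

        result = WorkflowResult(samples=samples, workflow_pass=True)
        result.load_params("params.json", keep=["references", "counts"])
        result.to_json("workflow-results.json")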
266 | """ 267 | 268 | samples: list[Sample] = field( 269 | metadata={ 270 | "title": "Samples", 271 | "description": "Samples in this workflow instance"}) 272 | workflow_pass: bool | None = field( 273 | default=None, metadata={ 274 | "title": "Workflow pass", 275 | "description": "True if this workflow instance passes all checks"}) 276 | workflow_checks: list[CheckResult] = field( 277 | default_factory=list, metadata={ 278 | "title": "Workflow checks", 279 | "description": "An array of checks performed on the workflow instance"}) 280 | run_stats: RunStats | None = field( 281 | default=None, metadata={ 282 | "title": "Samples", 283 | "description": "Basic run statistics"}) 284 | client_fields: dict[str, Any] | None = field( 285 | default_factory=dict, metadata={ 286 | "title": "Client fields", 287 | "description": "Arbitrary key-value pairs provided by the client"}) 288 | versions: dict[str, Any] | None = field( 289 | default_factory=dict, metadata={ 290 | "title": "Analysis tool versions", 291 | "description": """Key-value pairs collecting the 292 | software used and the corresponding versions"""}) 293 | params: dict[str, Any] | None = field( 294 | default_factory=dict, metadata={ 295 | "title": "Pertinent parameters", 296 | "description": """Key-value pairs with the 297 | options chosen by the user"""}) 298 | 299 | def load_client_fields(self, filename): 300 | """Load client fields.""" 301 | with open(filename) as f: 302 | try: 303 | client_fields = json.loads(f.read()) 304 | # convert any lists into strings for display 305 | for key, value in client_fields.items(): 306 | if isinstance(value, list): 307 | client_fields[key] = ', '.join(value) 308 | except json.decoder.JSONDecodeError: 309 | client_fields = {"error": "Error parsing client fields file."} 310 | 311 | self.client_fields = client_fields 312 | return self.client_fields 313 | 314 | def load_params(self, params_json, keep=None): 315 | """Create a workflow params dict.""" 316 | params_json = Path(params_json) 317 | if keep is None: 318 | keep = [] 319 | if not params_json.is_file(): 320 | raise FileNotFoundError(f"No such file: {params_json}") 321 | with open(params_json, "r") as f: 322 | try: 323 | params_dict = json.loads(f.read()) 324 | self.params = { 325 | k: v for k, v in params_dict.items() if k in set(keep) 326 | } 327 | return self.params 328 | except ValueError: 329 | raise ValueError(f"Invalid JSON file: {params_json}") 330 | 331 | def load_versions(self, versions_path): 332 | """Create a version list of dict.""" 333 | versions_path = Path(versions_path) 334 | if not versions_path.exists(): 335 | raise FileNotFoundError(f"No such file: {versions_path}") 336 | 337 | if versions_path.is_dir(): 338 | version_files = [ 339 | vp for vp in versions_path.iterdir() if vp.is_file() 340 | ] 341 | elif versions_path.is_file(): 342 | version_files = [versions_path] 343 | else: 344 | raise IOError(f"{versions_path} should be either a directory or a file") 345 | for fname in version_files: 346 | versions = {} 347 | with open(fname, "r", encoding="utf-8") as fh: 348 | for line in fh.readlines(): 349 | name, version = line.strip().split(",") 350 | versions[name] = version 351 | self.versions = versions 352 | return self.versions 353 | 354 | def to_json(self, filename): 355 | """Save class as JSON.""" 356 | with open(filename, 'w') as f: 357 | json.dump(asdict(self), f, default=str, indent=2) 358 | -------------------------------------------------------------------------------- /bin/workflow_glue/report.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Report using ezcharts.""" 3 | 4 | from pathlib import Path 5 | 6 | from ezcharts.components.reports import labs 7 | import pandas as pd 8 | 9 | from .report_utils import read_data, sections # noqa: ABS101 10 | 11 | from .util import get_named_logger, wf_parser # noqa: ABS101 12 | 13 | 14 | def main(args): 15 | """Run entry point.""" 16 | logger = get_named_logger("report") 17 | 18 | per_sample_dirs = sorted(args.data.glob("*")) 19 | 20 | flagstat_df = pd.concat( 21 | ( 22 | read_data.flagstat(sample_dir).assign(sample_name=sample_dir.name) 23 | for sample_dir in per_sample_dirs 24 | ) 25 | ).astype({"sample_name": "category"}) 26 | 27 | # read the ref names (i.e. get a dict mapping ref names to the ref file) 28 | refname2reffile = read_data.refnames(args.refnames_dir) 29 | ref_files = sorted(set(refname2reffile.values()) - set(["unmapped"])) 30 | ref_seqs = sorted(set(refname2reffile.keys()) - set(["*"])) 31 | # read depth info if available 32 | try: 33 | depth_df = pd.concat( 34 | (read_data.depths(d).assign(sample_name=d.name) for d in per_sample_dirs) 35 | ).astype({"sample_name": "category"}) 36 | except AttributeError: 37 | depth_df = None 38 | # read counts if available 39 | counts = read_data.counts(args.counts) if args.counts is not None else None 40 | 41 | # add a column with the respective ref. files to the stats dataframes 42 | for df in (flagstat_df, depth_df): 43 | if df is None: 44 | continue 45 | try: 46 | df["ref_file"] = ( 47 | df["ref"].apply(lambda ref: refname2reffile[ref]).astype("category") 48 | ) 49 | except KeyError as e: 50 | (missing_ref,) = e.args 51 | raise ValueError( 52 | f"Reference '{missing_ref}' not found in the provided " 53 | f"reference files {ref_files}." 
54 | ) 55 | 56 | # create the report 57 | report = labs.LabsReport( 58 | "wf-alignment report", 59 | "wf-alignment", 60 | args.params, 61 | args.versions, 62 | workflow_version=args.wf_version, 63 | ) 64 | # add sections 65 | sections.summary(report, per_sample_dirs, ref_files, ref_seqs, flagstat_df) 66 | sections.seqsum(report, per_sample_dirs) 67 | if depth_df is not None: 68 | sections.depths(report, depth_df) 69 | if counts is not None: 70 | sections.counts(report, flagstat_df, counts) 71 | report_fname = "wf-alignment-report.html" 72 | report.write(report_fname) 73 | 74 | logger.info(f"Written report to '{report_fname}'.") 75 | 76 | 77 | def argparser(): 78 | """Argument parser for entrypoint.""" 79 | parser = wf_parser("report") 80 | parser.add_argument( 81 | "--data", 82 | type=Path, 83 | help="directory with per-sample data (with a sub-directory for each sample)", 84 | ) 85 | parser.add_argument( 86 | "--refnames_dir", 87 | help="directory with files containing reference names", 88 | ) 89 | parser.add_argument( 90 | "--counts", 91 | required=False, 92 | help=( 93 | "CSV file with expected counts " 94 | "(columns: Reference, expected_count, expected_length)" 95 | ), 96 | ) 97 | parser.add_argument( 98 | "--params", 99 | default=None, 100 | help="CSV file with workflow parameters", 101 | ) 102 | parser.add_argument( 103 | "--versions", 104 | help="CSV file with software versions", 105 | ) 106 | parser.add_argument( 107 | "--wf-version", 108 | default="unknown version", 109 | help="Workflow version", 110 | ) 111 | return parser 112 | -------------------------------------------------------------------------------- /bin/workflow_glue/report_utils/read_data.py: -------------------------------------------------------------------------------- 1 | """Read data for report.""" 2 | 3 | import os 4 | 5 | from ezcharts.components import fastcat 6 | import pandas as pd 7 | 8 | 9 | def length_hist(data_dir): 10 | """Load bamstats length histogram data.""" 11 | lengths_mapped = fastcat.load_histogram(data_dir, "length") 12 | try: 13 | lengths_unmapped = fastcat.load_histogram(data_dir, "length.unmap") 14 | except FileNotFoundError: 15 | lengths_unmapped = None 16 | return fastcat.sum_hists((lengths_mapped, lengths_unmapped)) 17 | 18 | 19 | def flagstat(sample_dir): 20 | """Load bamstats flagstat.""" 21 | return fastcat.load_bamstats_flagstat(sample_dir / "bamstats.flagstat.tsv") 22 | 23 | 24 | def depths(data_dir): 25 | """Parse mosdepth results file if there is one.""" 26 | depths_file = data_dir / "depth.all_regions.bed.gz" 27 | if not depths_file.exists(): 28 | return None 29 | depths = pd.read_csv( 30 | depths_file, 31 | sep="\t", 32 | header=None, 33 | names=["ref", "start", "end", "depth"], 34 | dtype={"ref": str, "start": int, "end": int, "depth": float}, 35 | ) 36 | return depths 37 | 38 | 39 | def refnames(refnames_dir): 40 | """Read files mapping reference sequence IDs to reference file names. 41 | 42 | :param refnames_dir: directory containing files with ref. names 43 | :return: `dict` mapping reference sequence IDs to reference files 44 | """ 45 | refname2reffile = {"*": "unmapped"} 46 | for ref_name_file in os.listdir(refnames_dir): 47 | with open(f"{refnames_dir}/{ref_name_file}", "r") as f: 48 | for ref_name in f: 49 | refname2reffile[ref_name.strip()] = os.path.basename( 50 | ref_name_file.split(".names")[0] 51 | ) 52 | return refname2reffile 53 | 54 | 55 | def counts(counts_file): 56 | """Read expected counts data. 
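
    Illustrative file contents (reference names are hypothetical and must match
    sequence names in the FASTA references; header capitalisation is ignored):

        reference,expected_count
        ERCC-00002,150000
        ERCC-00003,95000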
57 | 58 | :param counts_file: CSV file with expected counts (needs columns `["reference", 59 | "expected_count"]`) 60 | :raises ValueError: throw error if one of the required columns is missing 61 | :return: `pd.Series` with the expected counts 62 | """ 63 | counts = pd.read_csv( 64 | counts_file, 65 | dtype={"Reference": str, "expected_count": float}) 66 | counts.columns = [col.lower() for col in counts.columns] 67 | if not ("reference" in counts.columns and "expected_count" in counts.columns): 68 | raise ValueError( 69 | ( 70 | "Counts CSV must (at least) contain the columns 'reference' and " 71 | "'expected_count' (capitalisation is ignored)." 72 | ) 73 | ) 74 | counts["reference"] = counts["reference"].astype(str) 75 | return counts.set_index("reference")["expected_count"].squeeze() 76 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_test.py: -------------------------------------------------------------------------------- 1 | """A dummy test.""" 2 | 3 | import argparse 4 | 5 | from workflow_glue import report 6 | 7 | 8 | def test(): 9 | """Just showing that we can import using the workflow-glue.""" 10 | assert isinstance(report.argparser(), argparse.ArgumentParser) 11 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 
26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 
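            # (A header line such as "@HD VN:1.6 SO:coordinate" marks a
            # coordinate-sorted file, per the SAM specification.)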
43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
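# (For reference: the UTF-8 BOM is the byte sequence EF BB BF, while UTF-16 files
# begin with FF FE or FE FF; these are the marks `determine_codec` below peeks for.)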
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
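    # variables by the calling process (as in check_bam_headers_in_dir.py).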
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | from ..util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | # Common variables 11 | REF_EXTENSIONS = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 12 | DATA_TYPES_LISTS = { 13 | "bam": ["bam"], 14 | "bam_idx": ["bam.bai"], 15 | "cram": ["cram"], 16 | "cram_idx": ["cram.crai"], 17 | "vcf": ["vcf", "vcf.gz"], 18 | "vcf_idx": ["vcf.gz.tbi", "vcf.gz.csi"], 19 | "bcf": ["bcf"], 20 | "bcf_idx": ["bcf.csi"], 21 | "gtf": ["gtf", "gtf.gz"], 22 | "gtf_idx": ["gtf.gz.tbi"], 23 | "gff": ["gff", "gff.gz", "gff3", "gff3.gz"], 24 | "gff_idx": ["gff.gz.tbi", "gff3.gz.tbi"], 25 | "bed": ["bed", "bed.gz"], 26 | "bed_idx": ["bed.gz.tbi"], 27 | "bedmethyl": ["bedmethyl", "bedmethyl.gz"], 28 | "bedmethyl_idx": ["bedmethyl.gz.tbi"], 29 | "ref": REF_EXTENSIONS, 30 | } 31 | DATA_TYPES = { 32 | ext: ftype for ftype, extlist in DATA_TYPES_LISTS.items() for ext in extlist 33 | } 34 | 35 | # Data by idx 36 | DATA_INDEXES_FMT = { 37 | fmt: f"{fmt}_idx" for fmt, dtype in DATA_TYPES.items() if "_idx" not in dtype 38 | } 39 | 40 | # Assign each format to its index 41 | INDEX_PAIRS = { 42 | "bam": ("bai",), 43 | "cram": ("crai",), 44 | "vcf": ("tbi", "csi"), 45 | "bcf": ("csi",), 46 | "bed": ("tbi",), 47 | "bedmethyl": ("tbi",), 48 | "gff": ("tbi",), 49 | "gtf": ("tbi",), 50 | } 51 | 52 | 53 | class TrackBuilder: 54 | """Class that builds an IGV track.""" 55 | 56 | def __init__(self): 57 | """Initialize properties for interval track.""" 58 | # Reference properties 59 | self.ref = None 60 | self.fai = None 61 | self.gzi = None 62 | # Samples info 63 | self.samples = {} 64 | # Track properties 65 | self.igv_json = {"reference": {}, "tracks": []} 66 | self.track_type = { 67 | "bam": "alignment", 68 | "cram": "alignment", 69 | "bcf": "variant", 70 | "vcf": "variant", 71 | "bedmethyl": "annotation", 72 | "bed": "annotation", 73 | "gtf": "annotation", 74 | "gff": "annotation", 75 | } 76 | # Here we save aliases of file formats that IGV.js 77 | # wants and that do not match the input file extension. 78 | self.igv_fmt_alias = {"gff": "gff3"} 79 | # lookup of extra options for each data type 80 | self.extra_opts_lookups = { 81 | "bam": {}, 82 | "cram": {}, 83 | "bcf": {}, 84 | "vcf": {}, 85 | "bed": {}, 86 | "bedmethyl": {}, 87 | "gtf": {}, 88 | "gff": {}, 89 | } 90 | 91 | def add_ref(self, ref=None): 92 | """Add reference file, unless already defined.""" 93 | if self.ref: 94 | raise Exception( 95 | f"Reference genome has already been set to {self.ref}.\n" 96 | "Only one reference FASTA file is expected." 
97 | ) 98 | else: 99 | self.ref = ref 100 | 101 | def add_ref_index(self, ref_index=None): 102 | """Add reference index if valid.""" 103 | basename = Path(self.ref).name 104 | idx_basename = Path(ref_index).name 105 | if idx_basename == f"{basename}.fai": 106 | self.fai = ref_index 107 | if idx_basename == f"{basename}.gzi" and basename.endswith(".gz"): 108 | self.gzi = ref_index 109 | 110 | def parse_fnames(self, fofn): 111 | """Parse list with filenames and return them grouped. 112 | 113 | :param fofn: File with list of file names (one per line) 114 | """ 115 | tmp_samples = {} 116 | with open(fofn, "r") as f: 117 | for line in f: 118 | # If the line contains the sample name, prepare the data structure 119 | if "," in line: 120 | sample, fname = line.strip().split(",") 121 | if sample not in tmp_samples: 122 | tmp_samples[sample] = SampleBundle(sample=sample) 123 | tmp_samples[sample].append(fname) 124 | else: 125 | # Otherwise, assign everything to NO_SAMPLE 126 | # Files will still be displayed, but in no specific order. 127 | fname = line.strip() 128 | if any(fname.endswith(ext) for ext in REF_EXTENSIONS): 129 | self.add_ref(ref=fname) 130 | elif fname.endswith(".fai") or fname.endswith(".gzi"): 131 | self.add_ref_index(ref_index=fname) 132 | else: 133 | if "NO_SAMPLE" not in tmp_samples.keys(): 134 | tmp_samples["NO_SAMPLE"] = SampleBundle(sample="NO_SAMPLE") 135 | tmp_samples["NO_SAMPLE"].append(fname) 136 | # Re-order samples in dict and add them to the list, leaving 137 | # NO_SAMPLE as last 138 | sorted_samples = ( 139 | sorted([sample for sample in tmp_samples.keys() if sample != 'NO_SAMPLE']) 140 | ) 141 | if 'NO_SAMPLE' in tmp_samples.keys(): 142 | sorted_samples += ['NO_SAMPLE'] 143 | for sample in sorted_samples: 144 | self.samples[sample] = tmp_samples[sample] 145 | 146 | def build_igv_json(self): 147 | """Ensure there is a reference genome.""" 148 | if not self.ref: 149 | raise ValueError( 150 | "No reference file (i.e. file ending in one of " 151 | f"{REF_EXTENSIONS} was found)." 152 | ) 153 | # Evaluate that a bgzipped reference has the appropriate index. 154 | if self.ref.endswith(".gz") and not self.gzi: 155 | raise ValueError(f"GZI reference index for {self.ref} not found.") 156 | 157 | # Create the base track if there is a reference genome. 158 | self.igv_json["reference"] = { 159 | "id": "ref", 160 | "name": "ref", 161 | "wholeGenomeView": False, 162 | "fastaURL": self.ref, 163 | } 164 | if self.fai: 165 | self.igv_json["reference"]["indexURL"] = self.fai 166 | if self.gzi: 167 | self.igv_json["reference"]["compressedIndexURL"] = self.gzi 168 | 169 | # Add samples data now 170 | for sample, bundle in self.samples.items(): 171 | bundle.process_data() 172 | # Add the bundled data to the tracks 173 | for fname, index, file_fmt in bundle.data_bundles: 174 | self.add_track( 175 | fname, 176 | file_fmt, 177 | sample_name=sample if sample != "NO_SAMPLE" else None, 178 | index=index, 179 | extra_opts=self.extra_opts_lookups[file_fmt], 180 | ) 181 | 182 | def add_track(self, infile, file_fmt, sample_name=None, index=None, extra_opts={}): 183 | """Add a track to an IGV json. 184 | 185 | This function takes an input file, an optional index file, its 186 | file format and additional extra options for the track. 
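
        For a BAM file with an index, the resulting entry looks roughly like
        this (file and sample names are illustrative):

            {
                "name": "sample01: sample01.bam",
                "type": "alignment",
                "format": "bam",
                "url": "sample01.bam",
                "indexURL": "sample01.bam.bai",
            }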
187 | 188 | :param infile: input file to create a track for 189 | :param file_fmt: input file track type 190 | :param sample_name: Name of the sample to display in the track name 191 | :param index: index for the input file 192 | :param extra_opts: dict of extra options for the track 193 | :return: dict with track options 194 | """ 195 | # Define track name depending on whether the sample ID is provided 196 | track_name = Path(infile).name 197 | if sample_name: 198 | track_name = f"{sample_name}: {Path(infile).name}" 199 | track_dict = { 200 | "name": track_name, 201 | "type": self.track_type[file_fmt], 202 | "format": self.igv_fmt_alias.get(file_fmt, file_fmt), 203 | "url": infile, 204 | } 205 | # add the index, if present 206 | if index: 207 | track_dict["indexURL"] = index 208 | track_dict.update(extra_opts) 209 | self.igv_json["tracks"] += [track_dict] 210 | 211 | def add_locus(self, locus): 212 | """Add target locus to the json.""" 213 | self.igv_json["locus"] = locus 214 | 215 | def add_extra_opts( 216 | self, 217 | extra_alignment_opts=None, 218 | extra_variant_opts=None, 219 | extra_interval_opts=None, 220 | ): 221 | """Import extra options from json files.""" 222 | if extra_alignment_opts is not None: 223 | with open(extra_alignment_opts, "r") as f: 224 | extra_alignment_opts_json = json.load(f) 225 | for ftype in ["bam", "cram"]: 226 | self.extra_opts_lookups[ftype] = extra_alignment_opts_json 227 | if extra_variant_opts is not None: 228 | with open(extra_variant_opts, "r") as f: 229 | extra_variant_opts_json = json.load(f) 230 | for ftype in ["vcf", "bcf"]: 231 | self.extra_opts_lookups[ftype] = extra_variant_opts_json 232 | if extra_interval_opts is not None: 233 | with open(extra_interval_opts, "r") as f: 234 | extra_interval_opts_json = json.load(f) 235 | for ftype in ["bed", "bedmethyl", "gff", "gtf"]: 236 | self.extra_opts_lookups[ftype] = extra_interval_opts_json 237 | 238 | 239 | class SampleBundle: 240 | """Sample data class. 241 | 242 | This class stores the data for multiple tracks for a 243 | single sample, then is used to generate a collection of 244 | IGV.js tracks. 
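
    Sketch of intended use (file names are illustrative):

        bundle = SampleBundle("sample01")
        bundle.append("sample01.bam")
        bundle.append("sample01.bam.bai")
        bundle.process_data()
        # bundle.data_bundles is now [["sample01.bam", "sample01.bam.bai", "bam"]]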
245 | """ 246 | 247 | def __init__(self, sample): 248 | """Initialize properties for a sample.""" 249 | self.sample = sample 250 | self.infiles = [] 251 | self.data_bundles = [] 252 | 253 | def append(self, fname): 254 | """Add a new raw file to the bundle.""" 255 | self.infiles.append(fname) 256 | 257 | def process_data(self): 258 | """Process input files.""" 259 | fbasenames = [Path(fname).name for fname in self.infiles] 260 | ftypes = [self.classify_files(bname) for bname in fbasenames] 261 | self.data_bundles = self.pair_file_with_index(self.infiles, fbasenames, ftypes) 262 | 263 | @staticmethod 264 | def classify_files(fname): 265 | """Classify inputs.""" 266 | for extension, ftype in DATA_TYPES.items(): 267 | if fname.endswith(f".{extension}"): 268 | return ftype 269 | 270 | @staticmethod 271 | def pair_file_with_index(infiles, fbasenames, ftypes): 272 | """Clump files with their indexes.""" 273 | # Collect data by group type 274 | groups = {ftype: {"basenames": [], "paths": []} for ftype in set(ftypes)} 275 | # Group each file by its type and base name 276 | for ftype, fbasename, fname in zip(ftypes, fbasenames, infiles): 277 | groups[ftype]["basenames"] += [fbasename] 278 | groups[ftype]["paths"] += [fname] 279 | 280 | # Output bundles 281 | outputs = [] 282 | # Start matching the variant files 283 | for ftype, itype in DATA_INDEXES_FMT.items(): 284 | # Ignore file formats that are not present in the bundle. 285 | if ftype not in groups: 286 | continue 287 | # Make pairs of files. 288 | for fbasename, fpath in zip( 289 | groups[ftype]["basenames"], groups[ftype]["paths"] 290 | ): 291 | # Construct potential index file names based on basename of input files 292 | idx_basenames = set( 293 | [f"{fbasename}.{idx}" for idx in INDEX_PAIRS[ftype]] 294 | ) 295 | # Find which indexes are available 296 | if itype in groups.keys(): 297 | idx_basenames = list( 298 | idx_basenames.intersection(set(groups[itype]["basenames"])) 299 | ) 300 | # Get the first index (if there are more than one, 301 | # it doesn't matter) 302 | bname = idx_basenames[0] 303 | idx_fn = groups[itype]["paths"][ 304 | groups[itype]["basenames"].index(bname) 305 | ] 306 | outputs.append([fpath, idx_fn, ftype]) 307 | # Otherwise, return only the simple file. 
308 | else: 309 | outputs.append([fpath, None, ftype]) 310 | return outputs 311 | 312 | 313 | def main(args): 314 | """Run the entry point.""" 315 | logger = get_named_logger("configIGV") 316 | 317 | # parse the FOFN 318 | igv_builder = TrackBuilder() 319 | 320 | # Add the additional track configurations 321 | igv_builder.add_extra_opts( 322 | extra_alignment_opts=args.extra_alignment_opts, 323 | extra_variant_opts=args.extra_variant_opts, 324 | extra_interval_opts=args.extra_interval_opts 325 | ) 326 | 327 | # Import files 328 | igv_builder.parse_fnames(args.fofn) 329 | 330 | # initialise the IGV options dict with the reference options 331 | igv_builder.build_igv_json() 332 | 333 | # Add locus information 334 | if args.locus is not None: 335 | igv_builder.add_locus(args.locus) 336 | 337 | json.dump(igv_builder.igv_json, sys.stdout, indent=4) 338 | 339 | logger.info("Printed IGV config JSON to STDOUT.") 340 | 341 | 342 | def argparser(): 343 | """Argument parser for entrypoint.""" 344 | parser = wf_parser("configure_igv") 345 | parser.add_argument( 346 | "--fofn", 347 | required=True, 348 | help=( 349 | "File with list of names of reference / XAM / VCF files and indices " 350 | "(one filename per line)" 351 | ), 352 | ) 353 | parser.add_argument( 354 | "--locus", 355 | help="Locus string to set initial genomic coordinates to display in IGV", 356 | ) 357 | parser.add_argument( 358 | "--extra-alignment-opts", 359 | help="JSON file with extra options for alignment tracks", 360 | ) 361 | parser.add_argument( 362 | "--extra-variant-opts", 363 | help="JSON file with extra options for variant tracks", 364 | ) 365 | parser.add_argument( 366 | "--extra_interval_opts", 367 | help="JSON file with extra options for interval tracks", 368 | ) 369 | return parser 370 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for 
entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. '2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/reheader_samstream.py: -------------------------------------------------------------------------------- 1 | """Reheader a SAM in a stream. 2 | 3 | When using the bam2fq -> minimap2 pattern for (re)aligning BAM data, we 4 | lose any existing RG and PG headers. This is particularly egregious when 5 | handling basecalled data as lines related to dorado basecalling settings 6 | as well as dorado RG headers are lost; orphaning RG tags in the reads. 7 | This is problematic for downstream anaylses that would like to read the 8 | XAM header to intelligently determine how to handle the reads based on 9 | the basecaller model and basecaller configuration. 10 | 11 | This script handles: 12 | - Inserting RG, PG and CO lines from an existing XAM header into the 13 | header of the SAM emitted from minimap2's alignment stream 14 | - Inserting a PG header to indicate that a call to bam2fq was made 15 | - Updating the first streamed PG.PP parent tag with the last PG.ID 16 | of the existing XAM header to maintain a chain of custody 17 | - Updating any streamed PG.ID (and PG.PP) tags to avoid collisions 18 | with inserted PG.ID 19 | 20 | Handling collisions may seem like overkill but it is anticipated that 21 | this script will be called immediately after minimap2, any previous 22 | attempt to use minimap2 will lead to ambiguity. This would be the 23 | expected case where users have used wf-basecalling or wf-alignment to 24 | align a set of reads, only to realign them to another reference (eg. 25 | via wf-human-variation). Arguably, we should remove older references to 26 | minimap2 as they will have been invalidated by the call to bam2fq but 27 | removing PG records and sticking the PG chain back together seems more 28 | fraught with annoying future bugs than simply resolving conflicts. 29 | 30 | This script will explode on a stream that contains: 31 | - PG lines in the original header where the last PG in the chain is 32 | ambiguous, or where the parent PP IDs are not injective 33 | - PG lines in the stream that do not appear in the order of their 34 | chain (that is if a PG.PP refers to a PG.ID that has not been 35 | encountered yet) 36 | 37 | SQ lines are retained after an HD line. That is to say, the most recent 38 | set of SQ lines observed after an HD will appear in the final output. 39 | SQ, RG, PG and CO lines are emitted as a group together, with elements 40 | written out in the order observed. 41 | 42 | PG lines are naively appended to the last PG element in the chain. No 43 | attempt is made to keep multiple program chains intact as this can lead 44 | to bloated headers. Broken PG metadata is a known problem (see 45 | samtools/hts-specs#275) but one that is preferable to headers that 46 | become unwieldly large to process: there IS an upper limit to a SAM 47 | header's size after all. 48 | 49 | This script takes advantage of minimap2's SAM output to immediately 50 | reheader the stream before any downstream calls to other programs pollute 51 | the PG header. 
This script is a little overkill but attempts to be robust 52 | with handling PG collisions and more obviously encapsulates reheadering 53 | behaviour, and leaves some room to do more clever things as necessary. 54 | """ 55 | from shutil import copyfileobj 56 | import sys 57 | 58 | from ..util import wf_parser # noqa: ABS101 59 | 60 | 61 | class SamHeader: 62 | """An overkill container to manage merging PG lines in SAM headers. 63 | 64 | Collision handling is simple. If a PG.ID is duplicated by the stream 65 | then we add a suffix to its name and keep an eye out for the 66 | corresponding PG.PP later. We assume that headers emitted by the 67 | stream are chronological because this script should not be called as 68 | part of any complicated pipework other than immediately following 69 | minimap2. 70 | """ 71 | 72 | def __init__(self): 73 | """Initialise a collision aware PG container.""" 74 | self.remapped_pgids = {} 75 | self.collision_suffix = 0 76 | 77 | # Default HD, in case the new stream does not provide one 78 | self.hd = "@HD\tVN:1.6\tSO:unknown" 79 | 80 | # We'll merge RG, CO and PG 81 | self.rg_records = [] 82 | self.co_records = [] 83 | self.pg_records = [] 84 | 85 | # We keep the most recently observed block of SQ records by 86 | # resetting SQ on the first SQ seen after non-SQ. We cannot 87 | # rely on HD being emitted (as minimap2 does not do this!) 88 | self.sq_records = [] 89 | self.reset_sq = False 90 | 91 | self.observed_rgids = set() 92 | self.observed_pgids = set() 93 | self.last_pgid = None 94 | 95 | @staticmethod 96 | def str_to_record(line): 97 | """Return an appropriate struct for a given string record.""" 98 | try: 99 | record_type, record_data = line.strip().split('\t', 1) 100 | except ValueError: 101 | raise Exception(f"Record type could not be determined: {line}") 102 | 103 | if len(record_type) > 3: 104 | raise Exception(f"Record type malformed: {record_type}") 105 | 106 | record = {} 107 | if record_type in ["@HD", "@CO", "@SQ"]: 108 | return record_type, record_data 109 | elif record_type in ["@RG", "@PG"]: 110 | for field in record_data.strip().split('\t'): 111 | k, v = field.split(':', 1) 112 | if len(k) == 2 and k[0].isalpha() and k[1].isalnum(): 113 | record[k] = v 114 | else: 115 | raise Exception(f"{record_type} with invalid tag: '{k}'") 116 | if "ID" not in record: 117 | raise Exception(f"{record_type} with no ID: {record_data}") 118 | return record_type, record 119 | else: 120 | raise Exception(f"Unknown record type: {line}") 121 | 122 | @staticmethod 123 | def record_to_str(record_type, record_data): 124 | """Form a string from a header record.""" 125 | if record_type in ["@PG", "@RG"]: 126 | tags = [f"{k}:{v}" for k, v in record_data.items()] 127 | return f"{record_type}\t" + '\t'.join(tags) 128 | elif record_type in ["@SQ", "@CO"]: 129 | return f"{record_type}\t{record_data}" 130 | 131 | @staticmethod 132 | def resolve_pg_chain(pg_dicts): 133 | """Check links between PG.ID and PP.ID, exploding if inconsistent.""" 134 | links = {} 135 | # Document links between all ID and their PP parent 136 | pgids_without_ppid = 0 137 | for pgd in pg_dicts: 138 | pgid = pgd["ID"] 139 | pgpp = pgd.get("PP") 140 | links[pgid] = pgpp 141 | if pgpp is None: 142 | pgids_without_ppid += 1 143 | if len(links) > 0: 144 | # If there are links, exactly one should have a None parent 145 | # to indicate the first PG in the chain. Explode if we see 146 | # no head or multiple heads. 
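            # For example (illustrative values only), a well-formed chain such as
            #     links = {"basecaller": None, "samtools": "basecaller", "minimap2": "samtools"}
            # has exactly one ID whose parent is None (the head, "basecaller");
            # zero or several such IDs mean the chain is broken.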
147 | if pgids_without_ppid == 0: 148 | raise Exception("PG chain does not have a head.") 149 | elif pgids_without_ppid > 1: 150 | raise Exception("PG chain has multiple heads.") 151 | for source in links: 152 | head = source 153 | path = [head] 154 | while True: 155 | head = links[head] 156 | if head is None: 157 | break 158 | if head in path: 159 | path.append(head) 160 | raise Exception(f"PG chain appears to contain cycle: {path}") 161 | path.append(head) 162 | # This function is only really called to catch any explosions 163 | # but we'll return the links here as it is useful for testing 164 | return links 165 | 166 | def _bump_pg_collider(self): 167 | """Alter the collision suffix after determining a collision.""" 168 | self.collision_suffix += 1 169 | 170 | def _uncollide_pgid(self, pgid): 171 | """Return an uncollided string for a given PG ID.""" 172 | new_pgid = f"{pgid}-{self.collision_suffix}" 173 | self.remapped_pgids[pgid] = new_pgid 174 | self._bump_pg_collider() 175 | return new_pgid 176 | 177 | def add_line(self, line): 178 | """Add a header line to the header.""" 179 | record_type, record = self.str_to_record(line) 180 | 181 | if record_type == "@HD": 182 | self.hd = f"@HD\t{record}" 183 | elif record_type == "@CO": 184 | self.co_records.append(record) 185 | elif record_type == "@SQ": 186 | if self.reset_sq: 187 | self.sq_records = [] 188 | self.reset_sq = False 189 | self.sq_records.append(record) 190 | elif record_type == "@RG": 191 | rgid = record["ID"] 192 | if rgid not in self.observed_rgids: 193 | self.observed_rgids.add(rgid) 194 | self.rg_records.append(record) 195 | elif record not in self.rg_records: 196 | # if rgid has been seen before, abort if this record is different 197 | raise Exception( 198 | f"Duplicate RG with ID '{rgid}' conflicts with previously seen RG with same ID." 
# noqa:E501 199 | ) 200 | elif record_type == "@PG": 201 | pgid = record["ID"] 202 | if pgid in self.observed_pgids: 203 | # collision, rewrite the pgid 204 | pgid = self._uncollide_pgid(pgid) 205 | record["ID"] = pgid 206 | else: 207 | self.observed_pgids.add(pgid) 208 | 209 | # maintain chain 210 | ppid = record.get("PP") 211 | if not ppid: 212 | # record has no parent, this is either 213 | # - the first record (last_pgid is None) so is the tail 214 | # - an inserted record that needs its parent to be the current tail 215 | if not self.last_pgid: 216 | self.last_pgid = pgid 217 | else: 218 | record["PP"] = self.last_pgid 219 | self.last_pgid = pgid 220 | else: 221 | if ppid not in self.observed_pgids: 222 | raise Exception( 223 | f"Encountered PG.PP '{ppid}' before observing corresponding PG.ID" # noqa:E501 224 | ) 225 | # remap parent id (if needed) 226 | record["PP"] = self.remapped_pgids.get(ppid, ppid) 227 | # set tail to this record 228 | self.last_pgid = pgid 229 | 230 | self.pg_records.append(record) 231 | 232 | if len(self.sq_records) > 0 and record_type != '@SQ': 233 | self.reset_sq = True 234 | 235 | return record 236 | 237 | def write_header(self, fh): 238 | """Write this header to a file handle.""" 239 | self.resolve_pg_chain(self.pg_records) # check PG header 240 | fh.write(f"{self.hd}\n") 241 | for sq in self.sq_records: 242 | fh.write(self.record_to_str("@SQ", sq) + '\n') 243 | for rg in self.rg_records: 244 | fh.write(self.record_to_str("@RG", rg) + '\n') 245 | for pg in self.pg_records: 246 | fh.write(self.record_to_str("@PG", pg) + '\n') 247 | for co in self.co_records: 248 | fh.write(self.record_to_str("@CO", co) + '\n') 249 | 250 | 251 | def reheader_samstream(header_in, stream_in, stream_out, args): 252 | """Run reheader_samstream.""" 253 | # read original header into container 254 | sh = SamHeader() 255 | for line in header_in: 256 | sh.add_line(line) 257 | 258 | # append user provided lines to container 259 | for line in args.insert: 260 | sh.add_line(line) 261 | 262 | # read the header portion of the minimap2 stream 263 | wrote_header = False 264 | for line in stream_in: 265 | if line[0] != '@': 266 | # write out header on first alignment 267 | sh.write_header(stream_out) 268 | wrote_header = True 269 | # and actually write the first alignment 270 | stream_out.write(line) 271 | break 272 | sh.add_line(line) 273 | 274 | # Pass through the rest of the alignments. 275 | # I toyed with a few ways of doing this: 276 | # - A trivial iter over the input file was slow. presumably as we incurred some 277 | # overhead calling read() and write() and decoding more than other methods. 278 | # - os.read/write avoids dealing with higher level python read/write but requires 279 | # file descriptors which rules out non-file-like objects. this made testing more 280 | # annoying as StringIO does not have a file descriptor. we could have mocked fds 281 | # but i was not happy with the discrepancy between real and test execution. 282 | # - copyfileobj with the stream_in.buffer would also avoid some of the higher 283 | # level text handling but would require all tests to provide inputs that have 284 | # an underlying binary buffer. 
it was also not possible to seek the buffer to 285 | # the position of the text stream as we've used next() to iterate over the 286 | # header lines, fixing this would have required rewriting of the header 287 | # handling or keeping track of the position in the stream ourselves which 288 | # just seemed unncessary overkill given how we expect this program to be used. 289 | # copyfileobj on the text streams is more efficient than merely iterating the file 290 | # and dumping the lines out and seems to do the job. this keeps the code and tests 291 | # simple with minimal additional cost to performance. i anticipate any overhead of 292 | # this program will be dwarfed by that of minimap2/samtools sort anyway. 293 | # increasing the buffer size gave worse performance in my limited testing so we 294 | # leave it as the default here. 295 | copyfileobj(stream_in, stream_out) 296 | 297 | # If there were no alignments, we won't have hit the != @ case in the first stdin, 298 | # and we won't have written the header out. Write a header if we haven't already. 299 | if not wrote_header: 300 | sh.write_header(stream_out) 301 | 302 | 303 | def argparser(): 304 | """Argument parser for entrypoint.""" 305 | parser = wf_parser("reheader_samstream") 306 | parser.add_argument("header_in") 307 | parser.add_argument("--insert", action="append", default=[]) 308 | return parser 309 | 310 | 311 | def main(args): 312 | """reheader_samstream default entry point.""" 313 | with open(args.header_in) as header_in: 314 | reheader_samstream(header_in, sys.stdin, sys.stdout, args) 315 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/data/.gitkeep -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # input: 5 | # file some_data 6 | # file extra_data 7 | # script: 8 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 9 | # """ 10 | # command ${some_data} ${extra} 11 | # """ 12 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Align nanopore sequencing data and visualize mapping statistics. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | This workflow provides an easy way to align Oxford Nanopore reads and gather mapping 2 | stats either locally for small amounts of data or at scale in a distributed 3 | environment such as a cluster or the cloud. 4 | 5 | > This workflow contains minimal functionality that is duplicated in many of our more specialised workflows. 6 | > Please consider using one of these alternative workflows before using this one: you very likely do not need 7 | > to use this workflow. 8 | 9 | In brief, it will perform the following: 10 | * Combine all reference files in the directory passed to `--references`. 
11 | * Align input reads (passed as FASTQ or unaligned BAM files) against the reference (Note that BAM files with aligned reads can be used as well; these will skip the alignment step and only stats and the report will be produced). 12 | * Create alignment stats. 13 | * Calculate depth of coverage along the reference sequences (this step can be skipped if requested). 14 | * Create an HTML report to illustrate the results. 15 | -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 12 4 | + Memory = 32GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 6 9 | + Memory = 12GB 10 | 11 | Approximate run time: 0.5-5 minutes per sample (depending on number of reads, length of reference, and available compute). 12 | 13 | ARM processor support: True 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://docs.docker.com/get-started/) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository in to the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-alignment --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-alignment 38 | ``` 39 | 40 | A demo dataset is provided for testing of the workflow. 
41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-alignment/wf-alignment-demo.tar.gz 44 | tar -xzvf wf-alignment-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-alignment \ 49 | --fastq 'wf-alignment-demo/fastq' \ 50 | --references 'wf-alignment-demo/references' \ 51 | -profile standard 52 | ``` 53 | 54 | For further information about running a workflow on 55 | the command line see https://labs.epi2me.io/wfquickstart/ 56 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 2 | 3 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 4 | -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | This workflow accepts either FASTQ or BAM files as input. 3 | 4 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 5 | 6 | ``` 7 | (i) (ii) (iii) 8 | input_reads.fastq ─── input_directory ─── input_directory 9 | ├── reads0.fastq ├── barcode01 10 | └── reads1.fastq │ ├── reads0.fastq 11 | │ └── reads1.fastq 12 | ├── barcode02 13 | │ ├── reads0.fastq 14 | │ ├── reads1.fastq 15 | │ └── reads2.fastq 16 | └── barcode03 17 | └── reads0.fastq 18 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. 
In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 8 | | references | string | Path to a directory containing FASTA reference files. | Accepted file extensions are '.fasta', '.fna', '.ffn', '.faa', '.frn', '.fa', '.txt', '.fa.gz', '.fna.gz', '.frn.gz', '.ffn.gz', '.fasta.gz'. In addition, an MMI index file can be provided to make the workflow run faster using the option `--reference_mmi_file`. | | 9 | | reference_mmi_file | string | Path to an MMI index file to be used as reference. | Accepted file extension is '.mmi'. The references parameter is still required if this is provided. Note that some minimap2 alignment options are set by the reference MMI and cannot be overridden. | | 10 | | counts | string | Path to a CSV file containing expected counts as a control. | The expected counts CSV file must contain columns named 'reference' and 'expected_counts' in order to be valid. the 'reference' column should contain names matching the names of reference sequences within the fasta files provided using --references. | | 11 | 12 | 13 | ### Sample Options 14 | 15 | | Nextflow parameter name | Type | Description | Help | Default | 16 | |--------------------------|------|-------------|------|---------| 17 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | 18 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 19 | 20 | 21 | ### Output Options 22 | 23 | | Nextflow parameter name | Type | Description | Help | Default | 24 | |--------------------------|------|-------------|------|---------| 25 | | out_dir | string | Directory for output of all workflow results. | | output | 26 | | prefix | string | Optional prefix attached to each of the output filenames. | Output filename format will be `-filename.ext`. | | 27 | | per_read_stats | boolean | Generate Bamstats per-read stats. | With this option, the workflow will produce detailed per-read alignment stats emitted as gzipped TSV file. As these files can get quite large, it is recommended to only request them when necessary. | False | 28 | | igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. | | False | 29 | 30 | 31 | ### Advanced options 32 | 33 | | Nextflow parameter name | Type | Description | Help | Default | 34 | |--------------------------|------|-------------|------|---------| 35 | | depth_coverage | boolean | Calculate depth coverage statistics and include them in the report. | This step can be a computational bottleneck. Set this to false if your reference sequences are >50mb to speed things up. 
| True | 36 | | minimap_preset | string | Pre-defined parameter sets for `minimap2`, covering most common use cases. | Available parameter sets are: 'dna' (`-ax map-ont`), 'rna' (`-ax splice -uf`). | dna | 37 | | minimap_args | string | String of command line arguments to be passed on to `minimap2`. | This overrides the options defined by `--minimap_preset` and allows for running the alignment step in a more customized way. | | 38 | 39 | 40 | ### Miscellaneous Options 41 | 42 | | Nextflow parameter name | Type | Description | Help | Default | 43 | |--------------------------|------|-------------|------|---------| 44 | | threads | integer | Number of CPU threads to use for the alignment step. | The alignment process will run with this many threads (note that the memory used by minimap2 scales with the number of threads). The total CPU resources used by the workflow are constrained by the Nextflow executor configuration. | 4 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-alignment-report.html | Report for all samples | aggregated | 6 | | Combined references | combined_refs.fasta | FASTA file containing all input references. | aggregated | 7 | | Combined references index | combined_refs.fasta.fai | Index file for combined references FASTA. | aggregated | 8 | | Combined references MMI index | combined_refs.mmi | Minimap2 index file for combined references FASTA. | aggregated | 9 | | Per-read alignment stats | {{ alias }}.readstats.tsv.gz | Bamstats per-read output TSV file (compressed with gzip). | per-sample | 10 | | Per-reference alignment stats | {{ alias }}.flagstat.tsv | Bamstats flagstat output TSV file. | per-sample | 11 | | Alignment accuracy histogram | {{ alias }}-histograms/accuracy.hist | Bamstats alignment accuracy histogram TSV file. | per-sample | 12 | | Alignment coverage histogram | {{ alias }}-histograms/coverage.hist | Bamstats alignment coverage histogram TSV file. | per-sample | 13 | | Read length histogram (mapped) | {{ alias }}-histograms/length.hist | Bamstats read length histogram TSV file (for mapped reads). | per-sample | 14 | | Read length histogram (unmapped) | {{ alias }}-histograms/length.unmap.hist | Bamstats read length histogram TSV file (for unmapped reads). | per-sample | 15 | | Read quality histogram (mapped) | {{ alias }}-histograms/quality.hist | Bamstats read quality histogram TSV file (for mapped reads). | per-sample | 16 | | Read quality histogram (unmapped) | {{ alias }}-histograms/quality.unmap.hist | Bamstats read quality histogram TSV file (for unmapped reads). | per-sample | 17 | | Alignments BAM file | {{ alias }}.sorted.aligned.bam | BAM file with alignments of filtered input reads against the combined references. | per-sample | 18 | | Alignments index file | {{ alias }}.sorted.aligned.bam.bai | Index for alignments BAM file. | per-sample | 19 | | IGV config JSON file | igv.json | JSON file with IGV config options to be used by the EPI2ME Desktop Application. 
| aggregated | 20 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Combine reference files 2 | 3 | All reference files in the directory passed to `--references` are concatenated. 4 | 5 | ### 2. Align reads 6 | 7 | Input reads are aligned against the combined reference with [Minimap2](https://github.com/lh3/minimap2). If BAM files are used as input (with `--bam`), only reads in files without a reference in the SAM header are aligned. For other BAM files this step is skipped. 8 | 9 | ### 3. Create alignment stats 10 | 11 | [Bamstats](https://github.com/epi2me-labs/fastcat#bamstats) is used to create per-read and per-reference alignment stats from the BAM files. 12 | 13 | ### 4. Calculate depth of coverage 14 | 15 | Depth of coverage along the reference sequences is determined with [Mosdepth](https://github.com/brentp/mosdepth) (using 200 windows per reference sequence). To speed up the workflow, this step can be skipped by adding `--depth_coverage false`. 16 | -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | + If the workflow fails, please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. 2 | + Please see [here](https://labs.epi2me.io/trouble-shooting/) for how to resolve some common Nextflow issues and [here](https://labs.epi2me.io/how-to-exits/) for how to interpret command exit codes. 3 | -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | *I cannot select a single reference file in the EPI2ME desktop app.* - When running the workflow via the desktop app, you need to provide a directory with reference files. If you only have a single file, you can create a directory to place your reference file inside and select this with the reference input option. 2 | 3 | *How are the values in the `acc` column (and other metrics) in the per-read output stats calculated?* - 4 | For details on the per-read stats output files, please refer to the [fastcat/bamstats documentation](https://github.com/epi2me-labs/fastcat#output-format). 5 | 6 | If your question is not answered here, please report any issues or suggestions on the [GitHub issues](https://github.com/epi2me-labs/wf-alignment/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 7 | -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | - [How to align your data](https://labs.epi2me.io/how-to-align/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. 4 | -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | *     def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs.
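 * As a minimal illustrative sketch (the argument names here are hypothetical),
 * the class below is intended to be used roughly like:
 *
 *     def parser = new ArgumentParser(
 *         args: ["fastq"], kwargs: ["threads": 4], name: "demo_function")
 *     def opts = parser.parse_args([fastq: "reads.fastq", threads: 8])
 *
 * Missing required keys or unexpected keys raise an exception.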
The methods here are designed 8 | * to reduce boileplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters with default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere. 22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/NfcoreTemplate.groovy: -------------------------------------------------------------------------------- 1 | // 2 | // This file holds several functions used within the nf-core pipeline template. 
3 | // 4 | 5 | // MIT License 6 | // 7 | // Copyright (c) 2018 nf-core 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | // copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | import org.yaml.snakeyaml.Yaml 29 | 30 | class NfcoreTemplate { 31 | 32 | // 33 | // Check AWS Batch related parameters have been specified correctly 34 | // 35 | public static void awsBatch(workflow, params) { 36 | if (workflow.profile.contains('awsbatch')) { 37 | // Check params.awsqueue and params.awsregion have been set if running on AWSBatch 38 | assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 39 | // Check outdir paths to be S3 buckets if running on AWSBatch 40 | assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" 41 | } 42 | } 43 | 44 | // 45 | // Check params.hostnames 46 | // 47 | public static void hostName(workflow, params, log) { 48 | Map colors = logColours(params.monochrome_logs) 49 | if (params.hostnames) { 50 | try { 51 | def hostname = "hostname".execute().text.trim() 52 | params.hostnames.each { prof, hnames -> 53 | hnames.each { hname -> 54 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 55 | log.info "=${colors.yellow}====================================================${colors.reset}=\n" + 56 | "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + 57 | " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + 58 | " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + 59 | "=${colors.yellow}====================================================${colors.reset}=" 60 | } 61 | } 62 | } 63 | } catch (Exception e) { 64 | log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 65 | } 66 | } 67 | } 68 | 69 | // 70 | // Generate version string 71 | // 72 | public static String version(workflow) { 73 | String version_string = "" 74 | 75 | if (workflow.manifest.version) { 76 | def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' 77 | version_string += "${prefix_v}${workflow.manifest.version}" 78 | } 79 | 80 | if (workflow.commitId) { 81 | def git_shortsha = workflow.commitId.substring(0, 7) 82 | version_string += "-g${git_shortsha}" 83 | } 84 | 85 | return version_string 86 | } 87 | 88 | // 89 | // Construct and send completion email 90 | // 91 | public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_mapped_reads=[:]) { 92 | 93 | // Set up the e-mail variables 94 | def subject = "[$workflow.manifest.name] Successful: $workflow.runName" 95 | if (fail_mapped_reads.size() > 0) { 96 | subject = "[$workflow.manifest.name] Partially successful (${fail_mapped_reads.size()} skipped): $workflow.runName" 97 | } 98 | if (!workflow.success) { 99 | subject = "[$workflow.manifest.name] FAILED: $workflow.runName" 100 | } 101 | 102 | def summary = [:] 103 | for (group in summary_params.keySet()) { 104 | summary << summary_params[group] 105 | } 106 | 107 | def misc_fields = [:] 108 | misc_fields['Date Started'] = workflow.start 109 | misc_fields['Date Completed'] = workflow.complete 110 | misc_fields['Pipeline script file path'] = workflow.scriptFile 111 | misc_fields['Pipeline script hash ID'] = workflow.scriptId 112 | if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository 113 | if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId 114 | if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision 115 | misc_fields['Nextflow Version'] = workflow.nextflow.version 116 | misc_fields['Nextflow Build'] = workflow.nextflow.build 117 | misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 118 | 119 | def email_fields = [:] 120 | email_fields['version'] = NfcoreTemplate.version(workflow) 121 | email_fields['runName'] = workflow.runName 122 | email_fields['success'] = workflow.success 123 | email_fields['dateComplete'] = workflow.complete 124 | email_fields['duration'] = workflow.duration 125 | email_fields['exitStatus'] = workflow.exitStatus 126 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 127 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 128 | email_fields['commandLine'] = workflow.commandLine 129 | email_fields['projectDir'] = workflow.projectDir 130 | email_fields['summary'] = summary << misc_fields 131 | email_fields['fail_mapped_reads'] = fail_mapped_reads.keySet() 132 | email_fields['min_mapped_reads'] = params.min_mapped_reads 133 | 134 | // On success try attach the multiqc report 135 | def mqc_report = null 136 | try { 137 | if (workflow.success && !params.skip_multiqc) { 138 | mqc_report = multiqc_report.getVal() 139 | if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { 140 | if (mqc_report.size() > 1) { 141 | log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" 142 | } 143 | mqc_report = mqc_report[0] 144 | } 145 | } 146 | } catch (all) { 147 | if (multiqc_report) { 148 | log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" 149 | } 150 | } 151 | 152 | // Check if we are only sending emails on failure 153 | def email_address = params.email 154 | if (!params.email && params.email_on_fail && !workflow.success) { 155 | email_address = params.email_on_fail 156 | } 157 | 158 | // Render the TXT template 159 | def engine = new groovy.text.GStringTemplateEngine() 160 | def tf = new File("$projectDir/assets/email_template.txt") 161 | def 
txt_template = engine.createTemplate(tf).make(email_fields) 162 | def email_txt = txt_template.toString() 163 | 164 | // Render the HTML template 165 | def hf = new File("$projectDir/assets/email_template.html") 166 | def html_template = engine.createTemplate(hf).make(email_fields) 167 | def email_html = html_template.toString() 168 | 169 | // Render the sendmail template 170 | def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit 171 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] 172 | def sf = new File("$projectDir/assets/sendmail_template.txt") 173 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 174 | def sendmail_html = sendmail_template.toString() 175 | 176 | // Send the HTML e-mail 177 | Map colors = logColours(params.monochrome_logs) 178 | if (email_address) { 179 | try { 180 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 181 | // Try to send HTML e-mail using sendmail 182 | [ 'sendmail', '-t' ].execute() << sendmail_html 183 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" 184 | } catch (all) { 185 | // Catch failures and try with plaintext 186 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 187 | if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { 188 | mail_cmd += [ '-A', mqc_report ] 189 | } 190 | mail_cmd.execute() << email_html 191 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" 192 | } 193 | } 194 | 195 | // Write summary e-mail HTML to a file 196 | def output_d = new File("${params.outdir}/pipeline_info/") 197 | if (!output_d.exists()) { 198 | output_d.mkdirs() 199 | } 200 | def output_hf = new File(output_d, "pipeline_report.html") 201 | output_hf.withWriter { w -> w << email_html } 202 | def output_tf = new File(output_d, "pipeline_report.txt") 203 | output_tf.withWriter { w -> w << email_txt } 204 | } 205 | 206 | // 207 | // Print pipeline summary on completion 208 | // 209 | public static void summary(workflow, params, log, fail_mapped_reads=[:], pass_mapped_reads=[:]) { 210 | Map colors = logColours(params.monochrome_logs) 211 | 212 | if (pass_mapped_reads.size() > 0) { 213 | def idx = 0 214 | def samp_aln = '' 215 | def total_aln_count = pass_mapped_reads.size() + fail_mapped_reads.size() 216 | for (samp in pass_mapped_reads) { 217 | samp_aln += " ${samp.value}: ${samp.key}\n" 218 | idx += 1 219 | if (idx > 5) { 220 | samp_aln += " ..see pipeline reports for full list\n" 221 | break; 222 | } 223 | } 224 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_mapped_reads.size()}/$total_aln_count samples passed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 225 | } 226 | if (fail_mapped_reads.size() > 0) { 227 | def samp_aln = '' 228 | for (samp in fail_mapped_reads) { 229 | samp_aln += " ${samp.value}: ${samp.key}\n" 230 | } 231 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_mapped_reads.size()} samples skipped since they failed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 232 | } 233 | 234 | if (workflow.success) { 235 | if (workflow.stats.ignoredCount == 0) { 236 | log.info 
"-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" 237 | } else { 238 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" 239 | } 240 | } else { 241 | hostName(workflow, params, log) 242 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" 243 | } 244 | } 245 | 246 | // 247 | // ANSII Colours used for terminal logging 248 | // 249 | public static Map logColours(Boolean monochrome_logs) { 250 | Map colorcodes = [:] 251 | 252 | // Reset / Meta 253 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 254 | colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" 255 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 256 | colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" 257 | colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" 258 | colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" 259 | colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" 260 | 261 | // Regular Colors 262 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 263 | colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" 264 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 265 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 266 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 267 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 268 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 269 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 270 | 271 | // Bold 272 | colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" 273 | colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" 274 | colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" 275 | colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" 276 | colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" 277 | colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" 278 | colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" 279 | colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" 280 | 281 | // Underline 282 | colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" 283 | colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" 284 | colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" 285 | colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" 286 | colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" 287 | colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" 288 | colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" 289 | colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" 290 | 291 | // High Intensity 292 | colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" 293 | colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" 294 | colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" 295 | colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" 296 | colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" 297 | colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" 298 | colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" 299 | colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" 300 | 301 | // Bold High Intensity 302 | colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" 303 | colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" 304 | colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" 305 | colorcodes['biyellow'] = monochrome_logs ? 
'' : "\033[1;93m" 306 | colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" 307 | colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" 308 | colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" 309 | colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" 310 | 311 | return colorcodes 312 | } 313 | 314 | // 315 | // Does what is says on the tin 316 | // 317 | public static String dashedLine(monochrome_logs) { 318 | Map colors = logColours(monochrome_logs) 319 | return "${colors.dim}--------------------------------------------------------------------------------${colors.reset}" 320 | } 321 | 322 | // epi2me-labs logo 323 | public static String logo(workflow, monochrome_logs) { 324 | Map colors = NfcoreTemplate.logColours(monochrome_logs) 325 | String workflow_name = workflow.manifest.name.split("/")[1] 326 | String workflow_version = version(workflow) 327 | String.format( 328 | """ 329 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}_____ ____ ___ ____ __ __ _____ 330 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}| ____| _ \\_ _|___ \\| \\/ | ____| 331 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| _| | |_) | | __) | |\\/| | _| 332 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| |___| __/| | / __/| | | | |__ 333 | ${colors.iblue}|||||||||| ${colors.reset}${colors.dim}|_____|_| |___|_____|_| |_|_____| 334 | ${colors.iblue}|||||||||| ${colors.reset}${colors.bold}${workflow_name} ${workflow_version}${colors.reset} 335 | ${NfcoreTemplate.dashedLine(monochrome_logs)} 336 | """.stripIndent() 337 | ) 338 | } 339 | } 340 | 341 | 342 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // OS 52 | // TODO check version on WSL 53 | def opsys = System.properties['os.name'].toLowerCase() 54 | def opver = System.properties['os.version'] 55 | if (opver.toLowerCase().contains("wsl")){ 56 | opsys = "wsl" 57 | } 58 | 59 | // placeholder for any future okta business 60 | // for now we'll use the guest_ sent to wf.epi2me_user 61 | def user = get_meta(params.wf, "epi2me_user") 62 | 63 | // drop cruft to save some precious bytes 64 | // affects the deep copy rather than original params 65 | clean_meta(params_data, [ 66 | "schema_ignore_params", 67 | ]) 68 | def ingress_ids = [] 69 | if (params_data.containsKey("wf")) { 70 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 71 | clean_meta(params_data.wf, [ 72 | "agent", // we send this later 73 | "epi2me_instance", // we send this later 74 | "epi2me_user", // we send this later 75 | "example_cmd", 76 | "ingress.run_ids", // we will send this elsewhere 77 | ]) 78 | } 79 | 80 | // try and get runtime information 81 | def cpus = null 82 | try { 83 | cpus = Runtime.getRuntime().availableProcessors() 84 | } 85 | catch(Exception e) {} 86 | 87 | def workflow_success = null 88 | def workflow_exitcode = null 89 | if (event != "start") { 90 | workflow_success = workflow.success 91 | workflow_exitcode = workflow.exitStatus 92 | } 93 | 94 | /// build message 95 | def body_json = new JsonBuilder() 96 | body_json \ 97 | "tracking_id": [ 98 | "msg_id": UUID.randomUUID().toString(), 99 | "version": "3.0.1" 100 | ], 101 | "source": "workflow", 102 | "event": event, 103 | "params": params_data, 104 | // data will be null on start events, as ingress has not run 105 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 106 | "workflow": [ 107 | "name": workflow.manifest.name, 108 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 109 | "run_name": workflow.runName, // required to disambiguate sessions 110 | "session": workflow.sessionId, 111 | "profile": workflow.profile, 112 | "resume": workflow.resume, 113 | "error": error_message, // null if no error 114 | "success": workflow_success, 115 | "exitcode": workflow_exitcode, 116 | ], 117 | "env": [ 118 | "user": user, // placeholder for any future okta 119 | "os": [ 120 | "name": opsys, 121 | "version": opver 122 | ], 123 | "resource": [ 124 | "cpus": cpus, 125 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 126 | ], 127 | "agent": get_meta(params.wf, "agent"), // access via original params 128 | "epi2me": [ 129 | "instance": get_meta(params.wf, "epi2me_instance"), 130 | "user": user, 131 | ], 132 | "nextflow": [ 133 | "version": nextflow.version.toString(), 134 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 135 | ] 136 | ] 137 | return body_json 138 | } 139 | 140 | // Send a JSON payload to a given endpoint 141 | private static String send_ping_post(endpoint, body_json) { 142 | // Attempt to send payload and absorb any possible Exception gracefully 143 | String postResult 144 | boolean raise_exception = false 145 | try { 146 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 147 | requestMethod = 'POST' 148 | doOutput = true 149 | setConnectTimeout(5000) 150 | setReadTimeout(10000) 151 | setRequestProperty('Content-Type', 'application/json') 152 | setRequestProperty('accept', 'application/json') 153 | outputStream.withPrintWriter({printWriter -> 154 | printWriter.write(body_json.toString()) 155 | }) 156 | 157 | // Rethrow exceptions that imply we're not using this endpoint properly 158 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 159 | raise_exception = true 160 | } 161 | // Accessing inputStream.text will raise an Exception for failed requests 162 | postResult = inputStream.text 163 | }) 164 | } 165 | catch(Exception e) { 166 | if(raise_exception) { throw e } 167 | } 168 | return (postResult) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? "--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file. 
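// As an illustrative example (the values shown are hypothetical), resource
// settings can also be overridden without editing this file by supplying an
// additional config with `-c` on the command line, e.g. a file `my.config`
// containing:
//
//     process {
//         withLabel:wfalignment {
//             cpus = 8
//             memory = '16 GB'
//         }
//     }
//
// and then running `nextflow run epi2me-labs/wf-alignment ... -c my.config`.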
11 | 12 | 13 | params { 14 | help = false 15 | out_dir = "output" 16 | store_dir = null 17 | version = false 18 | fastq = null 19 | bam = null 20 | references = null 21 | igv = false 22 | reference_mmi_file = null 23 | counts = null 24 | prefix = null 25 | sample = null 26 | sample_sheet = null 27 | disable_ping = false 28 | depth_coverage = true 29 | analyse_unclassified = false 30 | minimap_preset = "dna" 31 | minimap_args = null 32 | threads = 4 33 | per_read_stats = false 34 | 35 | monochrome_logs = false 36 | validate_params = true 37 | show_hidden_params = false 38 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 39 | 40 | wf { 41 | num_depth_windows = 200 42 | max_depth_window_size = 10000 43 | igv_locus_depth_threshold = 10 44 | example_cmd = [ 45 | "--fastq 'wf-alignment-demo/fastq'", 46 | "--references 'wf-alignment-demo/references'", 47 | ] 48 | container_sha = "shafd8c4500ccf8b133bef07111626b9e7d876c430d" 49 | common_sha = "sha1c69fd30053aad5d516e9567b3944384325a0fee" 50 | } 51 | } 52 | 53 | 54 | manifest { 55 | name = 'epi2me-labs/wf-alignment' 56 | author = 'Oxford Nanopore Technologies' 57 | homePage = 'https://github.com/epi2me-labs/wf-alignment' 58 | description = 'Align Nanopore reads and visualize mapping statistics.' 59 | mainScript = 'main.nf' 60 | nextflowVersion = '>=23.04.2' 61 | version = 'v1.2.3' 62 | } 63 | 64 | epi2melabs { 65 | tags = 'wf-alignment,minimap2,alignment,mapping,utility' 66 | } 67 | 68 | 69 | // used by default for "standard" (docker) and singularity profiles, 70 | // other profiles may override. 71 | process { 72 | withLabel:wfalignment { 73 | container = "ontresearch/wf-alignment:${params.wf.container_sha}" 74 | } 75 | withLabel:wf_common { 76 | container = "ontresearch/wf-common:${params.wf.common_sha}" 77 | } 78 | shell = ['/bin/bash', '-euo', 'pipefail'] 79 | } 80 | 81 | 82 | profiles { 83 | // the "standard" profile is used implicitly by nextflow 84 | // if no other profile is given on the CLI 85 | standard { 86 | docker { 87 | enabled = true 88 | // this ensures container is run as host user and group, but 89 | // also adds host user to the within-container group 90 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 91 | } 92 | } 93 | 94 | // using singularity instead of docker 95 | singularity { 96 | singularity { 97 | enabled = true 98 | autoMounts = true 99 | } 100 | } 101 | 102 | 103 | // keep stub conda profile to prevent unknown profile warning so users get a better error 104 | conda { 105 | conda.enabled = true 106 | } 107 | 108 | 109 | // Using AWS batch. 
110 | // May need to set aws.region and aws.batch.cliPath 111 | awsbatch { 112 | process { 113 | executor = 'awsbatch' 114 | queue = "${params.aws_queue}" 115 | memory = '8G' 116 | withLabel:wfalignment { 117 | container = "${params.aws_image_prefix}-wf-alignment:${params.wf.container_sha}" 118 | } 119 | withLabel:wf_common { 120 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 121 | } 122 | shell = ['/bin/bash', '-euo', 'pipefail'] 123 | } 124 | } 125 | 126 | 127 | 128 | // local profile for simplified development testing 129 | local { 130 | process.executor = 'local' 131 | } 132 | } 133 | 134 | timeline { 135 | enabled = true 136 | overwrite = true 137 | file = "${params.out_dir}/execution/timeline.html" 138 | } 139 | report { 140 | enabled = true 141 | overwrite = true 142 | file = "${params.out_dir}/execution/report.html" 143 | } 144 | trace { 145 | enabled = true 146 | overwrite = true 147 | file = "${params.out_dir}/execution/trace.txt" 148 | } 149 | 150 | env { 151 | PYTHONNOUSERSITE = 1 152 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 153 | } 154 | -------------------------------------------------------------------------------- /nextflow_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", 4 | "title": "epi2me-labs/wf-alignment", 5 | "workflow_title": "Alignment workflow", 6 | "description": "Align nanopore sequencing data and visualize mapping statistics.", 7 | "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-alignment/wf-alignment-demo.tar.gz", 8 | "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-alignment/wf-alignment-demo/aws.nextflow.config", 9 | "url": "https://github.com/epi2me-labs/wf-alignment", 10 | "type": "object", 11 | "definitions": { 12 | "input": { 13 | "title": "Input Options", 14 | "type": "object", 15 | "fa_icon": "fas fa-arrow-right", 16 | "description": "Parameters for finding and handling input data for analysis.", 17 | "properties": { 18 | "fastq": { 19 | "type": "string", 20 | "format": "path", 21 | "title": "FASTQ", 22 | "description": "FASTQ files to use in the analysis.", 23 | "help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 24 | }, 25 | "bam": { 26 | "type": "string", 27 | "format": "path", 28 | "description": "BAM or unaligned BAM (uBAM) files to use in the analysis.", 29 | "help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 
30 | }, 31 | "analyse_unclassified": { 32 | "type": "boolean", 33 | "default": false, 34 | "title": "Analyse unclassified reads", 35 | "description": "Analyse unclassified reads from the input directory. By default the workflow will not process reads in the unclassified directory.", 36 | "help_text": "If selected, and if the input is a multiplexed directory, the workflow will also process the unclassified directory." 37 | }, 38 | "references": { 39 | "type": "string", 40 | "format": "directory-path", 41 | "description": "Path to a directory containing FASTA reference files.", 42 | "help_text": "Accepted file extensions are '.fasta', '.fna', '.ffn', '.faa', '.frn', '.fa', '.txt', '.fa.gz', '.fna.gz', '.frn.gz', '.ffn.gz', '.fasta.gz'. In addition, an MMI index file can be provided to make the workflow run faster using the option `--reference_mmi_file`." 43 | }, 44 | "reference_mmi_file": { 45 | "type": "string", 46 | "format": "file-path", 47 | "description": "Path to an MMI index file to be used as reference.", 48 | "help_text": "Accepted file extension is '.mmi'. The references parameter is still required if this is provided. Note that some minimap2 alignment options are set by the reference MMI and cannot be overridden." 49 | }, 50 | "counts": { 51 | "type": "string", 52 | "format": "file-path", 53 | "description": "Path to a CSV file containing expected counts as a control.", 54 | "help_text": "The expected counts CSV file must contain columns named 'reference' and 'expected_counts' in order to be valid. The 'reference' column should contain names matching the names of the reference sequences within the FASTA files provided using `--references`." 55 | } 56 | }, 57 | "allOf": [ 58 | { 59 | "required": [ 60 | "references" 61 | ] 62 | }, 63 | { 64 | "oneOf": [ 65 | { 66 | "required": [ 67 | "fastq" 68 | ] 69 | }, 70 | { 71 | "required": [ 72 | "bam" 73 | ] 74 | } 75 | ] 76 | } 77 | ] 78 | }, 79 | "samples": { 80 | "title": "Sample Options", 81 | "type": "object", 82 | "fa_icon": "fas fa-vials", 83 | "description": "Parameters that relate to samples such as sample sheets and sample names.", 84 | "properties": { 85 | "sample_sheet": { 86 | "type": "string", 87 | "format": "file-path", 88 | "description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files.", 89 | "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have one of the following values: `test_sample`, `positive_control`, `negative_control`, `no_template_control`." 90 | }, 91 | "sample": { 92 | "type": "string", 93 | "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or a directory of .fastq(.gz) files." 94 | } 95 | } 96 | }, 97 | "output": { 98 | "title": "Output Options", 99 | "type": "object", 100 | "fa_icon": "fas fa-arrow-left", 101 | "description": "Parameters for saving and naming workflow outputs.", 102 | "properties": { 103 | "out_dir": { 104 | "type": "string", 105 | "default": "output", 106 | "format": "directory-path", 107 | "description": "Directory for output of all workflow results." 108 | }, 109 | "prefix": { 110 | "type": "string", 111 | "title": "Output filename prefix", 112 | "description": "Optional prefix attached to each of the output filenames.", 113 | "help_text": "Output filename format will be `-filename.ext`." 
114 | }, 115 | "per_read_stats": { 116 | "type": "boolean", 117 | "default": false, 118 | "title": "Per-read alignment stats", 119 | "description": "Generate Bamstats per-read stats.", 120 | "help_text": "With this option, the workflow will produce detailed per-read alignment stats emitted as a gzipped TSV file. As these files can get quite large, it is recommended to only request them when necessary." 121 | }, 122 | "igv": { 123 | "type": "boolean", 124 | "default": false, 125 | "title": "IGV", 126 | "description": "Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files." 127 | } 128 | } 129 | }, 130 | "advanced_options": { 131 | "title": "Advanced options", 132 | "type": "object", 133 | "fa_icon": "far fa-question-circle", 134 | "description": "Advanced options for configuring processes inside the workflow.", 135 | "help_text": "These advanced options do not need to be changed for typical use, but allow fine-tuning of workflows for users who want more control over the workflow.", 136 | "properties": { 137 | "depth_coverage": { 138 | "type": "boolean", 139 | "default": true, 140 | "description": "Calculate depth coverage statistics and include them in the report.", 141 | "help_text": "This step can be a computational bottleneck. Set this to false if your reference sequences are larger than 50 Mb to speed things up." 142 | }, 143 | "minimap_preset": { 144 | "type": "string", 145 | "title": "Minimap2 parameter preset", 146 | "description": "Pre-defined parameter sets for `minimap2`, covering the most common use cases.", 147 | "help_text": "Available parameter sets are: 'dna' (`-ax map-ont`), 'rna' (`-ax splice -uf`).", 148 | "enum": [ 149 | "dna", 150 | "rna" 151 | ], 152 | "default": "dna" 153 | }, 154 | "minimap_args": { 155 | "type": "string", 156 | "title": "Minimap2 CLI arguments", 157 | "description": "String of command line arguments to be passed on to `minimap2`.", 158 | "help_text": "This overrides the options defined by `--minimap_preset` and allows for running the alignment step in a more customized way." 159 | } 160 | } 161 | }, 162 | "misc": { 163 | "title": "Miscellaneous Options", 164 | "type": "object", 165 | "description": "Everything else.", 166 | "default": "", 167 | "properties": { 168 | "threads": { 169 | "type": "integer", 170 | "default": 4, 171 | "minimum": 4, 172 | "description": "Number of CPU threads to use for the alignment step.", 173 | "help_text": "The alignment process will run with this many threads (note that the memory used by minimap2 scales with the number of threads). The total CPU resources used by the workflow are constrained by the Nextflow executor configuration." 
174 | }, 175 | "disable_ping": { 176 | "type": "boolean", 177 | "default": false, 178 | "description": "Enable to prevent sending a workflow ping.", 179 | "overrides": { 180 | "epi2mecloud": { 181 | "hidden": true 182 | } 183 | } 184 | }, 185 | "help": { 186 | "type": "boolean", 187 | "default": false, 188 | "description": "Display help text.", 189 | "fa_icon": "fas fa-question-circle", 190 | "hidden": true 191 | }, 192 | "version": { 193 | "type": "boolean", 194 | "default": false, 195 | "description": "Display version and exit.", 196 | "fa_icon": "fas fa-question-circle", 197 | "hidden": true 198 | }, 199 | "store_dir": { 200 | "type": "string", 201 | "description": "Unused.", 202 | "hidden": true 203 | } 204 | } 205 | } 206 | }, 207 | "allOf": [ 208 | { 209 | "$ref": "#/definitions/input" 210 | }, 211 | { 212 | "$ref": "#/definitions/samples" 213 | }, 214 | { 215 | "$ref": "#/definitions/output" 216 | }, 217 | { 218 | "$ref": "#/definitions/advanced_options" 219 | }, 220 | { 221 | "$ref": "#/definitions/misc" 222 | } 223 | ], 224 | "properties": { 225 | "aws_image_prefix": { 226 | "type": "string", 227 | "hidden": true 228 | }, 229 | "aws_queue": { 230 | "type": "string", 231 | "hidden": true 232 | }, 233 | "monochrome_logs": { 234 | "type": "boolean" 235 | }, 236 | "validate_params": { 237 | "type": "boolean", 238 | "default": true 239 | }, 240 | "show_hidden_params": { 241 | "type": "boolean" 242 | } 243 | }, 244 | "resources": { 245 | "recommended": { 246 | "cpus": 12, 247 | "memory": "32GB" 248 | }, 249 | "minimum": { 250 | "cpus": 6, 251 | "memory": "12GB" 252 | }, 253 | "run_time": "0.5-5 minutes per sample (depending on number of reads, length of reference, and available compute).", 254 | "arm_support": true 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "wf-alignment-report.html", 5 | "title": "workflow report", 6 | "description": "Report for all samples", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "aggregated" 10 | }, 11 | "references": { 12 | "filepath": "combined_refs.fasta", 13 | "title": "Combined references", 14 | "description": "FASTA file containing all input references.", 15 | "mime-type": "text/txt", 16 | "optional": false, 17 | "type": "aggregated" 18 | }, 19 | "references-index": { 20 | "filepath": "combined_refs.fasta.fai", 21 | "title": "Combined references index", 22 | "description": "Index file for combined references FASTA.", 23 | "mime-type": "text/txt", 24 | "optional": false, 25 | "type": "aggregated" 26 | }, 27 | "references-mmi-index": { 28 | "filepath": "combined_refs.mmi", 29 | "title": "Combined references MMI index", 30 | "description": "Minimap2 index file for combined references FASTA.", 31 | "mime-type": "application/octet-stream", 32 | "optional": true, 33 | "type": "aggregated" 34 | }, 35 | "per-read-bamstats": { 36 | "filepath": "{{ alias }}.readstats.tsv.gz", 37 | "title": "Per-read alignment stats", 38 | "description": "Bamstats per-read output TSV file (compressed with gzip).", 39 | "mime-type": "application/gzip", 40 | "optional": true, 41 | "type": "per-sample" 42 | }, 43 | "bamstats-flagstat": { 44 | "filepath": "{{ alias }}.flagstat.tsv", 45 | "title": "Per-reference alignment stats", 46 | "description": "Bamstats flagstat output TSV file.", 47 | "mime-type": "text/tab-separated-values", 48 | 
"optional": false, 49 | "type": "per-sample" 50 | }, 51 | "alignment-accuracy-histogram": { 52 | "filepath": "{{ alias }}-histograms/accuracy.hist", 53 | "title": "Alignment accuracy histogram", 54 | "description": "Bamstats alignment accuracy histogram TSV file.", 55 | "mime-type": "text/tab-separated-values", 56 | "optional": false, 57 | "type": "per-sample" 58 | }, 59 | "alignment-coverage-histogram": { 60 | "filepath": "{{ alias }}-histograms/coverage.hist", 61 | "title": "Alignment coverage histogram", 62 | "description": "Bamstats alignment coverage histogram TSV file.", 63 | "mime-type": "text/tab-separated-values", 64 | "optional": false, 65 | "type": "per-sample" 66 | }, 67 | "mapped-read-length-histogram": { 68 | "filepath": "{{ alias }}-histograms/length.hist", 69 | "title": "Read length histogram (mapped)", 70 | "description": "Bamstats read length histogram TSV file (for mapped reads).", 71 | "mime-type": "text/tab-separated-values", 72 | "optional": false, 73 | "type": "per-sample" 74 | }, 75 | "unmapped-read-length-histogram": { 76 | "filepath": "{{ alias }}-histograms/length.unmap.hist", 77 | "title": "Read length histogram (unmapped)", 78 | "description": "Bamstats read length histogram TSV file (for unmapped reads).", 79 | "mime-type": "text/tab-separated-values", 80 | "optional": false, 81 | "type": "per-sample" 82 | }, 83 | "mapped-read-quality-histogram": { 84 | "filepath": "{{ alias }}-histograms/quality.hist", 85 | "title": "Read quality histogram (mapped)", 86 | "description": "Bamstats read quality histogram TSV file (for mapped reads).", 87 | "mime-type": "text/tab-separated-values", 88 | "optional": false, 89 | "type": "per-sample" 90 | }, 91 | "unmapped-read-quality-histogram": { 92 | "filepath": "{{ alias }}-histograms/quality.unmap.hist", 93 | "title": "Read quality histogram (unmapped)", 94 | "description": "Bamstats read quality histogram TSV file (for unmapped reads).", 95 | "mime-type": "text/tab-separated-values", 96 | "optional": false, 97 | "type": "per-sample" 98 | }, 99 | "alignments": { 100 | "filepath": "{{ alias }}.sorted.aligned.bam", 101 | "title": "Alignments BAM file", 102 | "description": "BAM file with alignments of filtered input reads against the combined references.", 103 | "mime-type": "application/gzip", 104 | "optional": false, 105 | "type": "per-sample" 106 | }, 107 | "alignments-index": { 108 | "filepath": "{{ alias }}.sorted.aligned.bam.bai", 109 | "title": "Alignments index file", 110 | "description": "Index for alignments BAM file.", 111 | "mime-type": "application/octet-stream", 112 | "optional": false, 113 | "type": "per-sample" 114 | }, 115 | "igv-config": { 116 | "filepath": "igv.json", 117 | "title": "IGV config JSON file", 118 | "description": "JSON file with IGV config options to be used by the EPI2ME Desktop Application.", 119 | "mime-type": "application/json", 120 | "optional": true, 121 | "type": "aggregated" 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /subworkflows/process_references.nf: -------------------------------------------------------------------------------- 1 | process combine { 2 | label "wfalignment" 3 | publishDir "${params.out_dir}", mode: 'copy', pattern: "combined_refs.fasta" 4 | publishDir "${params.out_dir}", mode: 'copy', pattern: "combined_refs.fasta.fai" 5 | cpus 1 6 | memory "2 GB" 7 | input: path "reference*.fasta" 8 | output: 9 | path outfname, emit: fasta 10 | path "*fai", emit: index 11 | script: 12 | outfname = 
"combined_refs.fasta" 13 | """ 14 | find -name 'reference*.fasta' -exec zcat -f {} + > $outfname 15 | 16 | # make sure all sequence IDs are unique 17 | if [ "\$(grep "^>" $outfname | sort | uniq -d)" ]; then 18 | echo "Sequence IDs in the reference files must be unique." 1>&2 19 | exit 1 20 | fi 21 | samtools faidx $outfname 22 | """ 23 | } 24 | 25 | process fx2tab { 26 | label "wfalignment" 27 | cpus 1 28 | memory { reference.size() > 1e9 ? "15 GB" : "2 GB" } 29 | input: 30 | path reference 31 | output: 32 | path "*.names.txt", emit: names 33 | path "*.lengths.tsv", emit: lengths 34 | script: 35 | """ 36 | seqkit fx2tab --length --name --only-id $reference > fx2tab.out 37 | cut -f1 fx2tab.out > ${reference}.names.txt 38 | echo -e 'name\\tlengths' > ${reference}.lengths.tsv 39 | cat fx2tab.out >> ${reference}.lengths.tsv 40 | """ 41 | } 42 | 43 | workflow process_references { 44 | List extensions = [ 45 | "fasta", "fna", "ffn", "faa", "frn", "fa", "txt", 46 | "fa.gz", "fna.gz", "frn.gz", "ffn.gz", "fasta.gz" 47 | ] 48 | take: 49 | input 50 | main: 51 | // get the reference files 52 | Path input = file(input, checkIfExists: true) 53 | List ref_files 54 | if (input.isDirectory()) { 55 | // we got a directory with one or multiple references 56 | ref_files = extensions.collect { 57 | file(input.resolve("*.$it"), type: "file") 58 | }.flatten() 59 | if (ref_files.size() == 0) { 60 | error "No references found in ${input}." 61 | } 62 | } 63 | else { 64 | // the reference is a single file --> make sure it has an expected extension 65 | if (!extensions.any { input.name.endsWith(it) }) { 66 | error "Reference file $input does not have an " + 67 | "accepted extension ($extensions)." 68 | } 69 | ref_files = [input] 70 | } 71 | fx2tab(Channel.of(ref_files).flatten()) 72 | combine(ref_files) 73 | emit: 74 | combined = combine.out.fasta 75 | combined_index = combine.out.index 76 | names_per_ref_file = fx2tab.out.names 77 | lengths_per_ref_file = fx2tab.out.lengths 78 | lengths_combined = fx2tab.out.lengths.collectFile( 79 | name: "combined_lengths.tsv", keepHeader: true 80 | // we need to call `.first()` to get a value channel (`.collectFile()` 81 | // always returns a queue channel, even when it only produces a single file) 82 | ).first() 83 | } 84 | -------------------------------------------------------------------------------- /test_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/test_data/.gitkeep -------------------------------------------------------------------------------- /test_data/bam/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/test_data/bam/test.bam -------------------------------------------------------------------------------- /test_data/counts/ERCC_mix1.csv: -------------------------------------------------------------------------------- 1 | Reference,expected_count,expected_length 2 | ERCC-00130,30000,1059 3 | ERCC-00004,7500,523 4 | ERCC-00136,1875,1033 5 | ERCC-00108,937.5,1022 6 | ERCC-00116,468.75,1991 7 | ERCC-00092,234.375,1124 8 | ERCC-00095,117.1875,521 9 | ERCC-00131,117.1875,771 10 | ERCC-00062,58.59375,1023 11 | ERCC-00019,29.296875,644 12 | ERCC-00144,29.296875,538 13 | ERCC-00170,14.6484375,1023 14 | ERCC-00154,7.32421875,537 15 | ERCC-00085,7.32421875,844 16 | 
ERCC-00028,3.66210938,1130 17 | ERCC-00033,1.83105469,2022 18 | ERCC-00134,1.83105469,274 19 | ERCC-00147,0.91552734,1023 20 | ERCC-00097,0.45776367,523 21 | ERCC-00156,0.45776367,494 22 | ERCC-00123,0.22888184,1022 23 | ERCC-00017,0.11444092,1136 24 | ERCC-00083,0.02861023,1022 25 | ERCC-00096,15000,1107 26 | ERCC-00171,3750,505 27 | ERCC-00009,937.5,984 28 | ERCC-00042,468.75,1023 29 | ERCC-00060,234.375,523 30 | ERCC-00035,117.1875,1130 31 | ERCC-00025,58.59375,1994 32 | ERCC-00051,58.59375,274 33 | ERCC-00053,29.296875,1023 34 | ERCC-00148,14.6484375,494 35 | ERCC-00126,14.6484375,1118 36 | ERCC-00034,7.32421875,1019 37 | ERCC-00150,3.66210938,743 38 | ERCC-00067,3.66210938,644 39 | ERCC-00031,1.83105469,1138 40 | ERCC-00109,0.91552734,536 41 | ERCC-00073,0.91552734,603 42 | ERCC-00158,0.45776367,1027 43 | ERCC-00104,0.22888184,2022 44 | ERCC-00142,0.22888184,493 45 | ERCC-00138,0.11444092,1024 46 | ERCC-00117,0.05722046,1136 47 | ERCC-00075,0.01430512,1023 48 | ERCC-00074,15000,522 49 | ERCC-00113,3750,840 50 | ERCC-00145,937.5,1042 51 | ERCC-00111,468.75,994 52 | ERCC-00076,234.375,642 53 | ERCC-00044,117.1875,1156 54 | ERCC-00162,58.59375,523 55 | ERCC-00071,58.59375,642 56 | ERCC-00084,29.296875,994 57 | ERCC-00099,14.6484375,1350 58 | ERCC-00054,14.6484375,274 59 | ERCC-00157,7.32421875,1019 60 | ERCC-00143,3.66210938,784 61 | ERCC-00039,3.66210938,740 62 | ERCC-00058,1.83105469,1136 63 | ERCC-00120,0.91552734,536 64 | ERCC-00040,0.91552734,744 65 | ERCC-00164,0.45776367,1022 66 | ERCC-00024,0.22888184,536 67 | ERCC-00016,0.22888184,844 68 | ERCC-00012,0.11444092,994 69 | ERCC-00098,0.05722046,1143 70 | ERCC-00057,0.01430512,1021 71 | ERCC-00002,15000,1061 72 | ERCC-00046,3750,522 73 | ERCC-00003,937.5,1023 74 | ERCC-00043,468.75,1023 75 | ERCC-00022,234.375,751 76 | ERCC-00112,117.1875,1136 77 | ERCC-00165,58.59375,872 78 | ERCC-00079,58.59375,644 79 | ERCC-00078,29.296875,993 80 | ERCC-00163,14.6484375,543 81 | ERCC-00059,14.6484375,525 82 | ERCC-00160,7.32421875,743 83 | ERCC-00014,3.66210938,1957 84 | ERCC-00077,3.66210938,273 85 | ERCC-00069,1.83105469,1137 86 | ERCC-00137,0.91552734,537 87 | ERCC-00013,0.91552734,808 88 | ERCC-00168,0.45776367,1024 89 | ERCC-00041,0.22888184,1122 90 | ERCC-00081,0.22888184,534 91 | ERCC-00086,0.11444092,1020 92 | ERCC-00061,0.05722046,1136 93 | ERCC-00048,0.01430512,992 -------------------------------------------------------------------------------- /test_data/other_references/case01/reference.fasta: -------------------------------------------------------------------------------- 1 | >1 bla:primary_assembly primary_assembly:bla:1:1:0000:1 REF 2 | TCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCATTGGTATCTTTTCCCCACTTCCAAGCATTTTTTCAACTAATCTTATGTTATTAACCATTTCCTTAAATTCTTCTGGGTCTGCTGACAAAGCATGATCAGGACCTTCCATATTTTTATCTAAGGTAAAGTGCTTCTCAATAACATCCGCTCCTAAGGCAACAGAAACTACTGGGGCGAGTATTCCCAATGTATGGTCAGAATATCCCACAGGGATATTGAATATACTTTTCAAGGTTTTAATAGCGTTTAAATTGACATCTTCATAAGGGGTTGGGTAAGATGAAATACAATGCAATAAAATAATATCCCTGCATCCATTATTTTCTAAAACTTTAACTGCTTCCCAAATTTCCCCAATATCAGACATTCCTGTAGATAAAATCACCGGCTTGCCTGTTTTTGCCACTTTTTCTAATAAGGGATAAAAGGTTAAATCACCAGAGGCAATTTTAAATCAGGCACATAAAAAAAAAAAAAAAAAAAAAAAA 3 | 4 | -------------------------------------------------------------------------------- /test_data/other_references/case02/reference.fasta: -------------------------------------------------------------------------------- 1 | >0001 bla:primary_assembly primary_assembly:bla:1:1:0000:1 REF 2 | 
TCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCATTGGTATCTTTTCCCCACTTCCAAGCATTTTTTCAACTAATCTTATGTTATTAACCATTTCCTTAAATTCTTCTGGGTCTGCTGACAAAGCATGATCAGGACCTTCCATATTTTTATCTAAGGTAAAGTGCTTCTCAATAACATCCGCTCCTAAGGCAACAGAAACTACTGGGGCGAGTATTCCCAATGTATGGTCAGAATATCCCACAGGGATATTGAATATACTTTTCAAGGTTTTAATAGCGTTTAAATTGACATCTTCATAAGGGGTTGGGTAAGATGAAATACAATGCAATAAAATAATATCCCTGCATCCATTATTTTCTAAAACTTTAACTGCTTCCCAAATTTCCCCAATATCAGACATTCCTGTAGATAAAATCACCGGCTTGCCTGTTTTTGCCACTTTTTCTAATAAGGGATAAAAGGTTAAATCACCAGAGGCAATTTTAAATCAGGCACATAAAAAAAAAAAAAAAAAAAAAAAA 3 | 4 | -------------------------------------------------------------------------------- /test_data/references/combined_references.mmi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/test_data/references/combined_references.mmi -------------------------------------------------------------------------------- /test_data/ubam/test.ubam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-alignment/1bb08961e2aa8cfdb84fe2b4a8413c48039bbd04/test_data/ubam/test.ubam --------------------------------------------------------------------------------