├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── base.config ├── bin ├── add_jsons.py ├── check_sample_sheet.py ├── duplex_stats.py ├── fastcat_histogram.py ├── report.py ├── report_utils.py ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── util.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf ├── nfcore_external_java_deps.jar ├── reference.nf └── signal │ ├── ingress.nf │ └── merge.nf ├── main.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json └── util └── update_models_schema.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 
61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 
6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Include shared CI 2 | include: 3 | - project: "epi2melabs/ci-templates" 4 | file: "wf-containers.yaml" 5 | 6 | variables: 7 | CI_FLAVOUR: "new" # set to "classic" for old-style CI 8 | SKIP_PYTHON_TESTS: "not applicable" 9 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz" 10 | NF_PROCESS_OPTIONS: "--basecaller_chunk_size 1 --ubam_map_threads 5 --ubam_sort_threads 2 --ubam_bam2fq_threads 1" 11 | PYTEST_CONTAINER_CONFIG_KEY: "container_sha_basecalling=" 12 | 13 | check-models: 14 | extends: .preflight 15 | script: 16 | - !reference [.install, nextflow] # requires nextflow to read config 17 | - bash util/update_models_schema.sh . 
docker 18 | - > 19 | if ! diff nextflow_schema.json nextflow_schema.json.new; then 20 | echo "Model schema requires updating." 21 | exit 1 22 | fi 23 | 24 | docker-run: 25 | artifacts: 26 | when: always 27 | paths: 28 | - ${CI_PROJECT_NAME} 29 | - .nextflow.log 30 | exclude: 31 | - ${CI_PROJECT_NAME}/**/*.fa 32 | - ${CI_PROJECT_NAME}/**/*.fna 33 | - ${CI_PROJECT_NAME}/**/*.fasta 34 | - ${CI_PROJECT_NAME}/**/ref_cache/** 35 | 36 | # Define a 1D job matrix to inject a variable named MATRIX_NAME into 37 | # the CI environment, we can use the value of MATRIX_NAME to determine 38 | # which options to apply as part of the rules block below 39 | # NOTE There is a slightly cleaner way to define this matrix to include 40 | # the variables, but it is broken when using long strings! See CW-756 41 | tags: 42 | - grid 43 | - shell 44 | parallel: 45 | matrix: 46 | - MATRIX_NAME: [ 47 | "dorado", 48 | "dorado-igv", 49 | "dorado-igv-gz", 50 | "dorado_mod", 51 | "dorado_fast5", 52 | "dorado-gzref", 53 | "dorado-output-fastq", 54 | "dorado-qscore-filter", 55 | "duplex", 56 | "duplex_mod", 57 | "duplex_fast5", 58 | "duplex_watch", 59 | "duplex_fqonly_fail", 60 | "watch_path", 61 | "no_reference", 62 | "no_reference-output-fastq", 63 | "output_bam", 64 | "polya_tails", 65 | "demux", 66 | "duplex_demux", 67 | "demux-align" 68 | ] 69 | rules: 70 | # NOTE As we're overriding the rules block for the included docker-run 71 | # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run 72 | # being incorrectly scheduled for "detached merge request pipelines" etc. 73 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 74 | when: never 75 | - if: $MATRIX_NAME == "dorado" 76 | variables: 77 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 78 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 79 | - if: $MATRIX_NAME == "dorado-igv" 80 | variables: 81 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --igv ${NF_PROCESS_OPTIONS}" 82 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 83 | - if: $MATRIX_NAME == "dorado-igv-gz" 84 | variables: 85 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 86 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --igv ${NF_PROCESS_OPTIONS}" 87 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 88 | - if: $MATRIX_NAME == "dorado-gzref" 89 | variables: 90 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar 
-xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 91 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 92 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 93 | - if: $MATRIX_NAME == "dorado-output-fastq" 94 | variables: 95 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 96 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --output_fmt fastq ${NF_PROCESS_OPTIONS}" 97 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 98 | - if: $MATRIX_NAME == "dorado_mod" 99 | variables: 100 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --remora_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2 ${NF_PROCESS_OPTIONS}" 101 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 102 | - if: $MATRIX_NAME == "dorado_fast5" 103 | variables: 104 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/fast5 --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --dorado_ext fast5 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 105 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 106 | - if: $MATRIX_NAME == "dorado-qscore-filter" 107 | variables: 108 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --qscore_filter 20 ${NF_PROCESS_OPTIONS}" 109 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 110 | - if: $MATRIX_NAME == "watch_path" 111 | variables: 112 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 ${NF_PROCESS_OPTIONS}" 113 | NF_IGNORE_PROCESSES: "pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 114 | - if: $MATRIX_NAME == "no_reference" 115 | variables: 116 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 ${NF_PROCESS_OPTIONS}" 117 | NF_IGNORE_PROCESSES: 
"cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 118 | - if: $MATRIX_NAME == "no_reference-output-fastq" 119 | variables: 120 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --output_fmt fastq ${NF_PROCESS_OPTIONS}" 121 | NF_IGNORE_PROCESSES: "cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 122 | AFTER_NEXTFLOW_CMD: "[ -f wf-basecalling/SAMPLE.pass.fq.gz ] && echo 'Expected file wf-basecalling/SAMPLE.pass.fq.gz found' || exit 1" 123 | - if: $MATRIX_NAME == "output_bam" 124 | variables: 125 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --output_fmt bam ${NF_PROCESS_OPTIONS}" 126 | NF_IGNORE_PROCESSES: "cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 127 | - if: $MATRIX_NAME == "duplex" 128 | variables: 129 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --duplex ${NF_PROCESS_OPTIONS}" 130 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,output_pod5s" 131 | - if: $MATRIX_NAME == "duplex_mod" 132 | variables: 133 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --remora_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2 --duplex ${NF_PROCESS_OPTIONS}" 134 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,output_pod5s" 135 | - if: $MATRIX_NAME == "duplex_fast5" 136 | variables: 137 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/fast5 --output_pod5 --dorado_ext fast5 --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --duplex ${NF_PROCESS_OPTIONS}" 138 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado,make_mmi,align_and_qsFilter,\ 139 | merge_pass_calls,merge_fail_calls,getVersions,getParams,cram_cache,bamstats,progressive_stats,makeReport,output" 140 | - if: $MATRIX_NAME == "duplex_watch" 141 | variables: 142 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 --duplex ${NF_PROCESS_OPTIONS}" 143 | NF_IGNORE_PROCESSES: "output_pod5s" 144 | - if: $MATRIX_NAME == "duplex_fqonly_fail" 145 | variables: 146 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 --output_fmt fastq --duplex ${NF_PROCESS_OPTIONS}" 147 | NF_IGNORE_PROCESSES: "output_pod5s" 148 | ASSERT_NEXTFLOW_FAILURE: "yes" 149 | ASSERT_NEXTFLOW_FAILURE_REXP : "Duplex requires the outputs of Dorado to be in BAM format." 
150 | - if: $MATRIX_NAME == "polya_tails" 151 | variables: 152 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-polya-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-polya-demo/VERSION && rm demo_data.tar.gz" 153 | NF_WORKFLOW_OPTS: "--poly_a_config wf-basecalling-polya-demo/polya_conf.toml --input wf-basecalling-polya-demo/input --ref wf-basecalling-polya-demo/RCS-100A.fasta --basecaller_cfg rna004_130bps_hac@v3.0.1 ${NF_PROCESS_OPTIONS}" 154 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition" 155 | - if: $MATRIX_NAME == "demux" 156 | variables: 157 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && rm demo_data.tar.gz" 158 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --barcode_kit SQK-RBK114-96 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 159 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 160 | - if: $MATRIX_NAME == "demux-align" 161 | variables: 162 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && \ 163 | tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && \ 164 | rm demo_data.tar.gz && \ 165 | wget -q -O wf-basecalling-demux-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 166 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --barcode_kit SQK-RBK114-96 --ref wf-basecalling-demux-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 167 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 168 | - if: $MATRIX_NAME == "duplex_demux" 169 | variables: 170 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && rm demo_data.tar.gz" 171 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --duplex true --barcode_kit SQK-RBK114-96 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 172 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 173 | ASSERT_NEXTFLOW_FAILURE: "yes" 174 | ASSERT_NEXTFLOW_FAILURE_REXP : "Validation of pipeline parameters failed" 175 | 176 | aws-run: 177 | rules: 178 | - when: never 179 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | 
pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.58 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | # avoid snowballstemmer>=3.0 as it causes flake8-docstrings to stop working [CW-6098] 28 | - snowballstemmer==2.2.0 29 | args: [ 30 | "bin", 31 | "--import-order-style=google", 32 | "--statistics", 33 | "--max-line-length=88", 34 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 35 | ] 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.5.1] 8 | The updates in this release do not affect wf-basecalling but are required for EPI2ME workflows that make use of wf-basecalling, to maintain compliance with our latest wf-template standard. 9 | Users do not need to update to this release. 10 | ### Changed 11 | - Updated to wf-template v5.6.1, changing: 12 | - pre-commit configuration to resolve an internal dependency problem with flake8. This has no effect on the workflow. 13 | - Log banner art to say "EPI2ME" instead of "EPI2ME Labs" to match current branding. This has no effect on the workflow outputs. 14 | 15 | ## [v1.5.0] 16 | This version of wf-basecalling updates Dorado to v0.9.5 which should improve the speed of basecalling on some GPU architectures. 17 | Dorado v0.9.5 increases the minimum NVIDIA Driver requirement to 525.105. 18 | ### Changed 19 | - Updated Dorado to [v0.9.5](https://github.com/nanoporetech/dorado/releases/tag/v0.9.5) 20 | - Alignment uses the high quality long read preset (-x lr:hq) to reduce mapping time. 21 | - Basecaller model options are now reverse version sorted in the workflow schema to ensure newer models appear at the top of drop-downs and listings of available models. 22 | 23 | ### Fixed 24 | - "Input data problem" error in downstream workflows necessitating use of the override_basecaller_cfg option. Relevant metadata from the input XAM header is now retained after alignment to ensure that the basecaller configuration is automatically detected. 25 | 26 | ## [v1.4.7] 27 | This maintenance release updates the models used for outputting results. 28 | This release is to support our other workflows. 29 | Users do not need to update to this release. 30 | ### Changed 31 | - Removed Pydantic and autogenerated models from results_schema.yml and created a new model namespace using dataclasses. 32 | 33 | ## [v1.4.6] 34 | This maintenance release updates the version of our code that plots our post-workflow reports. 35 | This release is to support our other workflows. 36 | Users do not need to update to this release. 37 | ### Changed 38 | - Updated common Docker image to update ezcharts to v0.12.0. This improves formatting of plots in the report. 39 | ### Fixed 40 | - Typo in schema. 41 | 42 | ## [v1.4.5] 43 | ### Changed 44 | - Reconciled workflow with wf-template v5.3.4. 
45 | 46 | ## [v1.4.4] 47 | ### Changed 48 | - Updated Dorado to [v0.9.0](https://github.com/nanoporetech/dorado/releases/tag/v0.9.0) 49 | 50 | ## [v1.4.3] 51 | ### Changed 52 | - Reconciled workflow with wf-template v5.3.3. 53 | 54 | ## [v1.4.2] 55 | ### Added 56 | - q-score filter added to signal ingress. 57 | ### Changed 58 | - Updated Dorado to [v0.8.3](https://github.com/nanoporetech/dorado/releases/tag/v0.8.3) 59 | - Reconciled workflow with wf-template v5.3.1. 60 | 61 | ## [v1.4.1] 62 | ### Changed 63 | - Reconciled workflow with wf-template v5.3.0. 64 | - Updated Dorado to [v0.8.1](https://github.com/nanoporetech/dorado/releases/tag/v0.8.1) 65 | 66 | ## [v1.4.0] 67 | ### Added 68 | - IGV configuration file with `--ref --igv` options and either `--output_fmt bam` or `--output_fmt cram`. 69 | - Support for gzipped reference genomes. 70 | - `output_fmt` selects the output format for basecalled and aligned files. 71 | ### Changed 72 | - Updated Dorado to [v0.8.0](https://github.com/nanoporetech/dorado/releases/tag/v0.8.0) 73 | - Reconciled workflow with wf-template v5.2.6. 74 | - Do not emit the reference FASTA file. 75 | - Collapse redundant RG and PG header lines when emitting BAM or CRAM. 76 | ### Fixed 77 | - Workflow starting with `--duplex --barcode_kit`, despite duplex not supporting barcoding. 78 | - Workflow crashing with `--ref {{ reference }} --barcode_kit`. 79 | - Aligned reads will no longer be trimmed when demuxing to preserve mapping information. 80 | - Workflow emits confusing warning about Bonito filtering when using Dorado. 81 | ### Removed 82 | - `fastq_only` and `output_bam` options replaced by `output_fmt`. 83 | - `--output_fmt fastq` can be used to output unaligned FASTQ instead of unaligned CRAM. 84 | - `--output_fmt bam` can be used to output unaligned or aligned BAM instead of CRAM. 85 | 86 | ## [v1.3.0] 87 | ### Added 88 | - Modified base calling with `--duplex`. 89 | - APK 5.0.0 model. 90 | ### Changed 91 | - Updated Dorado to v0.7.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.2) 92 | 93 | ## [v1.2.2] 94 | ### Changed 95 | - Bug fix for downstream workflows and `--poly_a_config` which does not affect normal workflow use. 96 | 97 | ## [v1.2.1] 98 | ### Added 99 | - Output channel for demuxed BAM files for downstream use. 100 | 101 | ## [v1.2.0] 102 | ### Added 103 | - Support for `dorado demux` to demultiplex barcoded runs. Specify your `--barcode_kit` to activate demultiplexing. 104 | - Support for poly(A) tail length estimation with `--poly_a_config`. You can configure it by providing a TOML file, which is described in detail [here](https://github.com/nanoporetech/dorado?tab=readme-ov-file#polya-tail-estimation). 105 | ### Changed 106 | - Updated Dorado to v0.7.1 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.1) 107 | 108 | ## [v1.1.9] 109 | ### Fixed 110 | - Report crashing when no data are present in the input pod5. 111 | - Reconciled workflow with wf-template v5.1.3. 112 | - Updated Dorado to v0.7.0 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.0) 113 | - Added new DNA and RNA 5.0.0 models. 114 | 115 | ## [v1.1.8] 116 | ### Changed 117 | - Updated Dorado to v0.6.0 (see https://github.com/nanoporetech/dorado/releases/tag/v0.6.0) 118 | 119 | ## [v1.1.7] 120 | ### Fixed 121 | - Workflow accepting incompatible `--fastq_only` and `--duplex` options 122 | - Dynamically updated report in `--watch_path` mode. 
123 | 124 | ## [v1.1.6] 125 | ### Fixed 126 | - qscore_filter inadvertently disabled in v1.1.5 127 | 128 | ## [v1.1.5] 129 | ### Changed 130 | - Minor update to default resource requests on dorado task. 131 | ### Added 132 | - Experimental feature switch. 133 | 134 | ## [v1.1.4] 135 | ### Changed 136 | - Updated Dorado to v0.5.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.5.2) 137 | 138 | ## [v1.1.3] 139 | ### Changed 140 | - Bumped memory directives for intense tasks to reduce likelihood of job failure 141 | - Default to parallel GPU usage when using awsbatch profile 142 | ### Removed 143 | - Runtime driver check in Dorado process, as this is no longer available in the Dorado image 144 | 145 | ## [v1.1.2] 146 | ### Changed 147 | - Updated dorado version to v0.5.1 (see https://github.com/nanoporetech/dorado/releases/tag/v0.5.1) 148 | 149 | ## [v1.1.1] 150 | ### Added 151 | - Reintroduced RNA002 models 152 | 153 | ## [v1.1.0] 154 | ### [Added] 155 | - `--duplex` basecalling converts FAST5 to POD5 automatically 156 | - Converted POD5 files are deleted by default, use `--output_pod5` to output converted POD5 files to the workflow output directory. 157 | 158 | ### Changed 159 | - Updated Dorado to v0.3.4 (see https://github.com/nanoporetech/dorado/releases/tag/v0.3.4) 160 | 161 | ## [v1.0.1] 162 | ### Fixed 163 | - Workflow crashes with fast5 input 164 | - Workflow fails early when trying to use FAST5 input with Dorado duplex 165 | 166 | ## [v1.0.0] 167 | ### Added 168 | - RNA004 models 169 | - R941 v3.3 5mCG 5hmCG models 170 | - Duplex calling with option `--duplex` 171 | - Note that duplex calling is not optimised for streaming basecalling with `--watch_path` and may lead to lower duplex yield. 172 | - Duplex basecalling is currently not compatible with modified basecalling. 173 | 174 | ### Changed 175 | - Updated Dorado to v0.3.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.3.2) 176 | - Pascal architecture GPUs are now supported 177 | - Bumped minimum required Nextflow version to 23.04.2 178 | - Users no longer need to provide `--basecaller_cfg custom` and/or `--remora_cfg custom` to override models with `--basecaller_model_path` and/or `--remora_model_path` respectively. 
179 | 180 | ### Fixed 181 | - `bamstats` process very slow when `output_bam` has been selected 182 | 183 | ## [v0.7.2] 184 | ### Added 185 | - v4.2 5mC and 6mA modification models 186 | 187 | ### Changed 188 | - Updated Dorado to v0.3.1 189 | - GPU tasks are limited to run in serial by default to avoid memory errors 190 | - Users in cluster and cloud environments where GPU devices are scheduled must use `-profile discrete_gpus` to parallelise GPU work 191 | - A warning will be printed if the workflow detects it is running non-local execution but the discrete_gpus profile is not enabled 192 | - Additional guidance on GPU support is provided in our Quickstart 193 | - Bumped minimum required Nextflow version to 22.10.8 194 | 195 | ## [v0.7.1] 196 | ### Fixed 197 | - Command not found on `cram_cache` step 198 | - Typo in report that refers to the workflow as "wf-basecalling-report" 199 | 200 | ## [v0.7.0] 201 | ### Changed 202 | - Updated Dorado to v0.3.0 203 | - BAM may be output **instead** of CRAM by providing `--output_bam` 204 | - `--help` message will list basecalling and modbasecalling models available for use with the workflow 205 | 206 | ### Added 207 | - v4.2.0 models, which must be used for sequencing runs performed at new 5 kHz sampling rate 208 | - v4.1.0 models replace v4.0.0 models and must be used for sequencing runs performed at 4 kHz sampling rate 209 | 210 | ### Removed 211 | - v4.0.0 models 212 | 213 | ### Fixed 214 | - Custom models were previously rejected by the workflow as `basecaller_cfg` and `remora_cfg` are validated against a list of basecalling models installed in the Dorado container. 215 | - Users should now provide `--basecaller_cfg custom` and/or `--remora_cfg custom` to override models with `--basecaller_model_path` and/or `--remora_model_path` respectively. 216 | - Providing `--basecaller_cfg custom` or `--remora_cfg custom` without the corresponding `--basecaller_model_path` or `--remora_model_path` will result in an error. 217 | 218 | ## [v0.6.0] 219 | ### Added 220 | - Ability to watch the input path and process files as they become available in real time. 221 | 222 | ## [v0.5.2] 223 | ### Added 224 | - Configuration for running demo data in AWS 225 | 226 | ## [v0.5.1] 227 | ### Fixed 228 | - Missing models from list of valid models 229 | - "dna_r9.4.1_e8_hac@v3.4_5mCG@v0" is now correctly referred to as "dna_r9.4.1_e8_hac@v3.3_5mCG@v0", to match the simplex model version 230 | - "dna_r9.4.1_e8_sup@v3.4_5mCG@v0" is now correctly referred to as "dna_r9.4.1_e8_sup@v3.3_5mCG@v0", to match the simplex model version 231 | 232 | ## [v0.5.0] 233 | ### Changed 234 | - Updated Dorado to v0.2.4 235 | - Updated to Oxford Nanopore Technologies PLC. 
Public License 236 | 237 | ### Fixed 238 | - Dorado image correctly ships with CUDA runtime library 239 | 240 | ## [v0.4.1] 241 | ### Fixed 242 | - Input ref channel depleted after first alignment 243 | 244 | ## [v0.4.0] 245 | ### Changed 246 | - Reference is no longer required for basecalling 247 | - CRAM files with no alignments will be generated if `--ref` is not provided 248 | - FASTQ may be output **instead** of CRAM by providing `--fastq_only` 249 | - PG line for converting Dorado SAM output to uBAM is no longer written to output header 250 | - Work directory is automatically cleaned up on successful completion to remove large intermediate files 251 | - Override this by including `cleanup = false` in a custom Nextflow configuration file 252 | - Number of threads for merging is now configurable for advanced users 253 | 254 | ## [v0.3.0] 255 | ### Changed 256 | - Updated Dorado to v0.2.1 257 | - `--basecaller_cfg` and `--remora_cfg` are now validated against a list of models installed in the Dorado container 258 | 259 | ### Fixed 260 | - Workflow no longer prints a confusing error when Dorado fails 261 | 262 | ## [v0.2.0] 263 | ### Added 264 | - `--basecaller_args` may be used to provide custom arguments to the basecalling process 265 | 266 | ### Changed 267 | - Updated Dorado to v0.1.1 268 | - Latest models are now v4.0.0 269 | - Workflow prints a more helpful error when Dorado fails due to unknown model name 270 | 271 | ## [v0.1.2] 272 | ### Changed 273 | - Updated description in manifest 274 | 275 | ## [v0.1.1] 276 | ### Fixed 277 | - Default basecaller_basemod_threads value 278 | - Undefined `colors` variable 279 | 280 | ## [v0.1.0] 281 | ### Added 282 | - Workflow will now output pass and fail CRAM 283 | - Reads are separated into pass and fail based on their mean qscore as calculated by dorado 284 | - The threshold can be changed with `--qscore_filter` 285 | 286 | ### Changed 287 | - Improved `--help` documentation 288 | 289 | ### Fixed 290 | - Workflow will exit with "No files match pattern" if no suitable files are found to basecall 291 | - Ensure to set `--dorado_ext` to `fast5` or `pod5` as appropriate 292 | 293 | ## [v0.0.1] 294 | * Initial release of wf-basecalling supporting the Dorado basecaller 295 | 296 | ## [v0.0.0] 297 | * Initialised wf-basecalling from wf-template #30ff92d 298 | 299 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. 
"Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 
109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. 
You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. 
Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 
309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /base.config: -------------------------------------------------------------------------------- 1 | params { 2 | out_dir = "output" 3 | wf { 4 | basecaller_container = "ontresearch/dorado" 5 | container_sha_basecalling = "sha268dcb4cd02093e75cdc58821f8b93719c4255ed" 6 | bonito_container = "ontresearch/bonito" 7 | bonito_sha = "shaea43ca2333f91fa78a823f640ba158e4268f1f98" 8 | common_sha = "sha1c69fd30053aad5d516e9567b3944384325a0fee" 9 | } 10 | } 11 | 12 | 13 | // used by default for "standard" (docker) and singularity profiles, 14 | // other profiles may override. 15 | process { 16 | withLabel:wf_basecalling { 17 | container = "${params.wf.basecaller_container}:${params.wf.container_sha_basecalling}" 18 | } 19 | withLabel:wf_common { 20 | container = "ontresearch/wf-common:${params.wf.common_sha}" 21 | } 22 | 23 | shell = ['/bin/bash', '-euo', 'pipefail'] 24 | 25 | // by default GPU tasks will run in serial to avoid GPU management. 26 | // cluster and cloud users can remove this with -profile discrete_gpus. 27 | // we use profiles to handle this as maxForks cannot be set dynamically 28 | // see https://github.com/nextflow-io/nextflow/discussions/3806 and CW-1857 29 | withLabel:gpu { 30 | maxForks = 1 31 | } 32 | } 33 | 34 | profiles { 35 | // the "standard" profile is used implicitely by nextflow 36 | // if no other profile is given on the CLI 37 | standard { 38 | docker { 39 | enabled = true 40 | // this ensures container is run as host user and group, but 41 | // also adds host user to the within-container group 42 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 43 | } 44 | process."withLabel:gpu".containerOptions = "--gpus all" 45 | } 46 | 47 | // using singularity instead of docker 48 | singularity { 49 | singularity { 50 | enabled = true 51 | autoMounts = true 52 | //envWhitelist = "" // if your cluster sets a variable to indicate which GPU has been assigned you will want to allow it here 53 | } 54 | process."withLabel:gpu".containerOptions = "--nv" 55 | } 56 | 57 | 58 | // keep stub conda profile to prevent unknown profile warning so users get a better error 59 | conda { 60 | conda.enabled = true 61 | } 62 | 63 | 64 | // Using AWS batch. 65 | // May need to set aws.region and aws.batch.cliPath 66 | awsbatch { 67 | process { 68 | executor = 'awsbatch' 69 | queue = "${params.aws_queue}" 70 | memory = "16 GB" // likely not enough! 
71 | withLabel:wf_common { 72 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 73 | } 74 | shell = ['/bin/bash', '-euo', 'pipefail'] 75 | 76 | // lift limit on simultaneous gpu jobs for cloud 77 | // and ensure that the host mounts relevant driver bobbins inside the container 78 | withLabel:gpu { 79 | maxForks = null 80 | containerOptions = "-e NVIDIA_DRIVER_CAPABILITIES=compute,utility --gpus all" 81 | } 82 | withLabel:wf_basecalling { 83 | container = "${params.aws_image_prefix}-dorado:${params.wf.container_sha_basecalling}" 84 | } 85 | withLabel:wf_bonito { 86 | container = "${params.aws_image_prefix}-bonito:${params.wf.bonito_sha}" 87 | } 88 | } 89 | } 90 | 91 | // local profile for simplified development testing 92 | local { 93 | process.executor = 'local' 94 | } 95 | 96 | // lift limit on simultaneous gpu jobs 97 | discrete_gpus { 98 | process."withLabel:gpu".maxForks = null 99 | } 100 | } 101 | 102 | 103 | timeline { 104 | enabled = true 105 | file = "${params.out_dir}/execution/timeline.html" 106 | overwrite = true 107 | } 108 | report { 109 | enabled = true 110 | file = "${params.out_dir}/execution/report.html" 111 | overwrite = true 112 | } 113 | trace { 114 | enabled = true 115 | file = "${params.out_dir}/execution/trace.txt" 116 | overwrite = true 117 | } 118 | 119 | env { 120 | PYTHONNOUSERSITE = 1 121 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 122 | } 123 | 124 | cleanup = true 125 | -------------------------------------------------------------------------------- /bin/add_jsons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Combine two JSONS, sum values by matching json keys.""" 3 | 4 | import argparse 5 | import json 6 | import os 7 | 8 | 9 | def add_dicts(d1, d2): 10 | """Extend json, sum values.""" 11 | def sum_a(v1, v2): 12 | if v2 is None: 13 | return v1 14 | try: 15 | if isinstance(v1 + v2, int): 16 | return v1 + v2 17 | elif isinstance(v1 + v2, str): 18 | return v1 19 | except TypeError: 20 | return add_dicts(v1, v2) 21 | result = d2.copy() 22 | result.update({k: sum_a(v, d2.get(k)) for k, v in d1.items()}) 23 | return result 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | if os.stat(args.state).st_size == 0: 29 | state = {} 30 | else: 31 | with open(args.state) as json_file: 32 | state = json.load(json_file) 33 | with open(args.new_file) as json_file: 34 | new_file = json.load(json_file) 35 | combined = add_dicts(state, new_file) 36 | with open(args.output, "w") as outfile: 37 | json.dump(combined, outfile) 38 | 39 | 40 | def argparser(): 41 | """Create argument parser.""" 42 | parser = argparse.ArgumentParser( 43 | "add_jsons", 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 45 | add_help=False) 46 | parser.add_argument("new_file") 47 | parser.add_argument("state") 48 | parser.add_argument("output") 49 | return parser 50 | 51 | 52 | if __name__ == "__main__": 53 | args = argparser().parse_args() 54 | main(args) 55 | -------------------------------------------------------------------------------- /bin/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Script to check that sample sheet is well-formatted.""" 3 | import argparse 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | 9 | def main(): 10 | """Run entry point.""" 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('sample_sheet') 13 | parser.add_argument('output') 
14 | args = parser.parse_args() 15 | 16 | try: 17 | samples = pd.read_csv(args.sample_sheet, sep=None) 18 | if 'alias' in samples.columns: 19 | if 'sample_id' in samples.columns: 20 | sys.stderr.write( 21 | "Warning: sample sheet contains both 'alias' and " 22 | 'sample_id, using the former.') 23 | samples['sample_id'] = samples['alias'] 24 | if not set(['sample_id', 'barcode']).intersection(samples.columns): 25 | raise IOError() 26 | except Exception: 27 | raise IOError( 28 | "Could not parse sample sheet, it must contain two columns " 29 | "named 'barcode' and 'sample_id' or 'alias'.") 30 | # check duplicates 31 | dup_bc = samples['barcode'].duplicated() 32 | dup_sample = samples['sample_id'].duplicated() 33 | if any(dup_bc) or any(dup_sample): 34 | raise IOError( 35 | "Sample sheet contains duplicate values.") 36 | samples.to_csv(args.output, sep=",", index=False) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /bin/duplex_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Count duplex and simplex reads in xam file.""" 3 | import argparse 4 | 5 | import pysam 6 | 7 | 8 | def main(): 9 | """Run entry point.""" 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('xam') 12 | parser.add_argument('outname') 13 | args = parser.parse_args() 14 | 15 | # Prepare input files 16 | xam = pysam.AlignmentFile(args.xam, check_sq=False) 17 | 18 | # Count simplex and duplex reads in a xam 19 | sx = dx = 0 20 | for read in xam.fetch(until_eof=True): 21 | if read.get_tag('dx') == 1: 22 | dx += 1 23 | else: 24 | sx += 1 25 | 26 | # Save counts to output 27 | with open(f'{args.outname}', 'w') as out: 28 | out.write("Filename,Duplex,Paired,Simplex\n") 29 | out.write(f"{args.xam},{dx},{dx*2},{sx}\n") 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /bin/fastcat_histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Histogram-json.""" 3 | 4 | import argparse 5 | import json 6 | import sys 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | def histogram_counts(data, dmin=0, bin_width=100): 13 | """Histogram bins and counts.""" 14 | bins = np.arange(dmin, max(data) + bin_width, bin_width) 15 | counts, _ = np.histogram(data, bins=bins) 16 | # Note that there can be small differences with/without batch_size=1. 17 | # https://numpy.org/doc/stable/reference/generated/numpy.histogram.html 18 | # bins from =[1, 2, 3, 4] => First bin=[1, 2), last bin=[3, 4]. 19 | # i.e. in batch_size=1, the count will be in the last interval (both edges included) 20 | # With more sequences, there can be different intervals and edge value can be moved 21 | # to the next consecutive interval. 
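    # For example (illustrative numbers): data=[5, 100, 250] with dmin=0 and
    # bin_width=100 gives bins=[0, 100, 200, 300] and counts=[1, 1, 1]; only the
    # final interval includes its right edge, which is why counts can shift
    # slightly between batch sizes as described above.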
22 | return bins.tolist(), counts.tolist() 23 | 24 | 25 | def get_stats(seq_summary): 26 | """Get Stats Json.""" 27 | stats_json = { 28 | "total_reads": len(seq_summary)} 29 | if len(seq_summary) >= 1: 30 | len_data = seq_summary['read_length'] 31 | len_bins, len_counts = histogram_counts( 32 | len_data, dmin=0, bin_width=50) 33 | stats_json["len"] = dict(list(zip(len_bins, len_counts))) 34 | 35 | qual_data = seq_summary['mean_quality'] 36 | qual_bins, qual_counts = histogram_counts( 37 | qual_data, dmin=0, bin_width=0.2) 38 | stats_json["qual"] = dict(list(zip(qual_bins, qual_counts))) 39 | else: 40 | sys.stderr.write("WARNING: summary file was empty.\n") 41 | stats_json["len"] = dict() 42 | stats_json["qual"] = dict() 43 | return stats_json 44 | 45 | 46 | def main(args): 47 | """Run the entry point.""" 48 | df = pd.read_csv( 49 | args.input, sep="\t", 50 | usecols=['read_length', 'mean_quality'], 51 | dtype={'read_length': int, 'mean_quality': float}) 52 | final = {args.sample_id: get_stats(df)} 53 | with open(args.output, 'w') as fp: 54 | json.dump(final, fp) 55 | 56 | 57 | def argparser(): 58 | """Argument parser for entrypoint.""" 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument( 61 | "input", help="Read summary file.") 62 | parser.add_argument( 63 | "output", help="Output summary JSON.") 64 | parser.add_argument( 65 | "--sample_id", help="Sample name.") 66 | return parser 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparser() 71 | main(parser.parse_args()) 72 | -------------------------------------------------------------------------------- /bin/report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create workflow report.""" 3 | import argparse 4 | import json 5 | 6 | from bokeh.models import Title 7 | from dominate.tags import p 8 | import ezcharts as ezc 9 | from ezcharts.components.ezchart import EZChart 10 | from ezcharts.components.reports import labs 11 | from ezcharts.layout.snippets import DataTable 12 | from ezcharts.layout.snippets import Grid 13 | from ezcharts.layout.snippets import Tabs 14 | import pandas as pd 15 | from report_utils import read_length_plot, read_quality_plot 16 | from ezcharts.util import get_named_logger # noqa: ABS101 17 | 18 | 19 | THEME = 'epi2melabs' 20 | 21 | 22 | def main(args): 23 | """Run the entry point.""" 24 | logger = get_named_logger("Report") 25 | report = labs.LabsReport( 26 | "Basecalling report", "wf-basecalling", 27 | args.params, args.versions, 28 | args.workflow_version,) 29 | 30 | # Create statistics from the histogram json generated from 31 | # the workflow, summarising the results from bamstats. 
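    # The per-sample JSON (as written by fastcat_histogram.py in this repo)
    # looks roughly like this (illustrative values only):
    #   {"sample_1": {"total_reads": 1000, "len": {...}, "qual": {...}}}
    # i.e. one entry per sample keyed by sample ID/alias.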
32 | if args.stats: 33 | with report.add_section("Read summary", "Read summary"): 34 | with open(args.stats[0]) as f: 35 | datas = json.load(f) 36 | tabs = Tabs() 37 | total_reads = {} 38 | for sample_id, data in sorted(datas.items()): 39 | with tabs.add_tab(sample_id): 40 | total_reads[sample_id] = data['total_reads'] 41 | if data['total_reads'] == 0: 42 | p("No reads called.") 43 | continue 44 | with Grid(columns=2): 45 | EZChart( 46 | read_quality_plot(data), THEME) 47 | EZChart(read_length_plot(data), THEME) 48 | with tabs.add_tab('total'): 49 | with Grid(columns=1): # total read counts per sample 50 | df_stats = pd.DataFrame.from_dict(total_reads.items()) 51 | df_stats.columns = ['Sample_name', 'Number of reads'] 52 | plt = ezc.barplot( 53 | data=df_stats, x='Sample_name', y='Number of reads') 54 | plt._fig.add_layout( 55 | Title( 56 | text="Number of reads per sample.", 57 | text_font_size="1.5em" 58 | ), 59 | 'above' 60 | ) 61 | EZChart(plt, THEME) 62 | 63 | # If pairing rates are provided, show them. 64 | if args.pairings: 65 | with report.add_section("Pairing summary", "Pairing summary"): 66 | with open(args.pairings[0]) as f: 67 | # Load data 68 | data = pd.read_csv(f) 69 | if data.empty: 70 | p("No reads called.") 71 | else: 72 | # Make summary 73 | data_sum = data\ 74 | .drop(columns=['Filename'])\ 75 | .sum()\ 76 | .to_frame()\ 77 | .T 78 | data_sum['Pairing rate'] = data_sum['Paired'] / data_sum['Simplex'] 79 | data_sum['Pairing rate'] = data_sum['Pairing rate'].round(4) 80 | DataTable.from_pandas( 81 | data_sum, use_index=False, export=True, 82 | file_name=( 83 | f'{args.sample_name}-wf-basecalling-duplex-summary')) 84 | p( 85 | 'Simplex: the number of initial reads.', 86 | 'Paired: the number of simplex reads belonging to a pair.', 87 | 'Duplex: the number of duplex reads.', 88 | ) 89 | 90 | report.write(args.report) 91 | logger.info(f"Report written to {args.report}.") 92 | 93 | 94 | def argparser(): 95 | """Argument parser for entrypoint.""" 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("report", help="Report output file") 98 | parser.add_argument( 99 | "--stats", nargs='*', help="Fastcat per-read stats file(s).") 100 | parser.add_argument( 101 | "--pairings", nargs='*', help="Pairing per-chunk stats.", required=False) 102 | parser.add_argument( 103 | "--sample_name", required=True, help="Sample name.") 104 | parser.add_argument( 105 | "--versions", required=True, 106 | help="directory containing CSVs containing name,version.") 107 | parser.add_argument( 108 | "--params", default=None, required=True, 109 | help="A JSON file containing the workflow parameter key/values") 110 | parser.add_argument( 111 | "--revision", default='unknown', 112 | help="git branch/tag of the executed workflow") 113 | parser.add_argument( 114 | "--commit", default='unknown', 115 | help="git commit of the executed workflow") 116 | parser.add_argument( 117 | "--workflow_version", default='unknown', 118 | help="Workflow version") 119 | return parser 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = argparser() 124 | main(parser.parse_args()) 125 | -------------------------------------------------------------------------------- /bin/report_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create tables for the report.""" 3 | from bokeh.models import Title 4 | from ezcharts.plots.distribution import histplot 5 | import pandas as pd 6 | 7 | # PLOTS 8 | 9 | # The SeqSummary from 
ezcharts.components.fastcat cannot be used. 10 | # It groups data into bins, but from the real time analysis output 11 | # the input data is already grouped into bins. 12 | # Use weights of histplot for y axis. 13 | 14 | 15 | def read_quality_plot(seq_summary, min_qual=4, max_qual=30, title='Read quality'): 16 | """Create read quality summary plot.""" 17 | df = pd.DataFrame.from_dict(seq_summary['qual'].items()) 18 | df.columns = ['mean_quality', 'counts'] 19 | df['mean_quality'] = df['mean_quality'].astype('float') 20 | plt = histplot( 21 | data=df['mean_quality'], 22 | bins=len(df), 23 | weights=list(df['counts']) 24 | ) 25 | plt._fig.add_layout( 26 | Title(text=title, text_font_size="1.5em"), 27 | 'above' 28 | ) 29 | plt._fig.xaxis.axis_label = "Quality score" 30 | plt._fig.yaxis.axis_label = "Number of reads" 31 | plt._fig.x_range.start = min_qual 32 | plt._fig.x_range.end = max_qual 33 | return plt 34 | 35 | 36 | def read_length_plot(seq_summary, title='Read length'): 37 | """Create a read length plot.""" 38 | df = pd.DataFrame.from_dict(seq_summary['len'].items()) 39 | df.columns = ['read_length', 'counts'] 40 | df['read_length'] = df['read_length'].astype('uint64') 41 | df['read_length'] = df['read_length'] / 1000 42 | plt = histplot( 43 | data=df['read_length'], 44 | bins=len(df), 45 | weights=list(df['counts'])) 46 | plt._fig.add_layout( 47 | Title(text=title, text_font_size="1.5em"), 48 | 'above' 49 | ) 50 | plt._fig.x_range.start = 0 51 | plt._fig.xaxis.axis_label = "Read length / kb" 52 | plt._fig.yaxis.axis_label = "Number of reads" 53 | return plt 54 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint of pseudo-package for all the code used in the workflow.""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components = dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | except ModuleNotFoundError as e: 44 | # if imports cannot be satisifed, refuse to add the component 45 | # 
rather than exploding
46 |             logger.warning(f"Could not load {name} due to missing module {e.name}")
47 |             continue
48 | 
49 |         # if there's a main() and an argparser(), that's good enough for us.
50 |         try:
51 |             req = "main", "argparser"
52 |             if all(callable(getattr(mod, x)) for x in req):
53 |                 components[name] = mod
54 |         except Exception:
55 |             pass
56 |     return components
57 | 
58 | 
59 | def cli():
60 |     """Run workflow entry points."""
61 |     logger = get_main_logger(_package_name)
62 |     logger.info("Bootstrapping CLI.")
63 |     parser = argparse.ArgumentParser(
64 |         'wf-glue',
65 |         parents=[_log_level()],
66 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
67 | 
68 |     parser.add_argument(
69 |         '-v', '--version', action='version',
70 |         version='%(prog)s {}'.format(__version__))
71 | 
72 |     subparsers = parser.add_subparsers(
73 |         title='subcommands', description='valid commands',
74 |         help='additional help', dest='command')
75 |     subparsers.required = True
76 | 
77 |     # importing everything can take time, try to shortcut
78 |     if len(sys.argv) > 1:
79 |         components = get_components(allowed_components=[sys.argv[1]])
80 |         if sys.argv[1] not in components:
81 |             logger.warning("Importing all modules, this may take some time.")
82 |             components = get_components()
83 |     else:
84 |         components = get_components()
85 | 
86 |     # add all module parsers to main CLI
87 |     for name, module in components.items():
88 |         p = subparsers.add_parser(
89 |             name.split(".")[-1], parents=[module.argparser()])
90 |         p.set_defaults(func=module.main)
91 | 
92 |     args = parser.parse_args()
93 | 
94 |     logger.info("Starting entrypoint.")
95 |     args.func(args)
96 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/models/__init__.py:
--------------------------------------------------------------------------------
1 | """A collection of scripts for results models."""
2 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/models/common.py:
--------------------------------------------------------------------------------
1 | """Common model classes used across all workflows."""
2 | from dataclasses import asdict, dataclass, field
3 | from enum import Enum
4 | import json
5 | from pathlib import Path
6 | from typing import Any, Dict, List, Optional
7 | 
8 | from ..util import get_named_logger  # noqa: ABS101
9 | 
10 | logger = get_named_logger("Models")
11 | 
12 | 
13 | @dataclass
14 | class WorkflowBaseModel:
15 |     """Shared behaviour for workflow results models."""
16 | 
17 |     def get(
18 |             self,
19 |             field_name: str,
20 |             title: bool = True,
21 |             **kwargs
22 |     ):
23 |         """Get reportable field tuple."""
24 |         field_info = self.__dataclass_fields__.get(field_name)
25 |         # provide an empty string default title to minimise drama
26 |         field_title = field_info.metadata.get("title", "")
27 |         value = self.get_reportable_value(field_name=field_name, **kwargs)
28 |         if title:
29 |             return (field_title, value)
30 |         return value
31 | 
32 |     def get_reportable_value(
33 |             self,
34 |             field_name: str,
35 |             *,
36 |             decimal_places: int = None,
37 |             default_value: str = "N/A") -> Optional[str]:
38 |         """Get the value of a field and make it reportable."""
39 |         # Get the field info using the field name
40 |         field_info = self.__dataclass_fields__.get(field_name)
41 |         if field_info is None:
42 |             raise AttributeError(
43 |                 f"{field_name!r} is not a field on {self.__class__.__name__}"
44 |             )
45 | 
46 |         value = getattr(self, field_name)
47 | 
48 |         if value is None:
49 |             return default_value
50 | 
51 |         if 
isinstance(value, (int, float)): 52 | if decimal_places: 53 | value = round(value, decimal_places) 54 | if value < 0.0001 or value > 99999999: 55 | value = f"{value:.2E}" 56 | else: 57 | if decimal_places: 58 | raise TypeError( 59 | "decimal_places is not a supported argument for a non-numeric.") 60 | 61 | unit = field_info.metadata.get('unit') 62 | 63 | if unit: 64 | return f"{value} {unit}" 65 | 66 | return str(value) 67 | 68 | 69 | class SampleType(str, Enum): 70 | """The type of the sample.""" 71 | 72 | no_template_control = "no_template_control" 73 | positive_control = "positive_control" 74 | negative_control = "negative_control" 75 | test_sample = "test_sample" 76 | 77 | def friendly_name(self): 78 | """Convert sample type to string.""" 79 | return self.name.replace("_", " ").capitalize() 80 | 81 | 82 | @dataclass 83 | class SampleIdentifier: 84 | """Additional identifiers for a sample.""" 85 | 86 | name: str = field( 87 | metadata={ 88 | "title": "Identifier name", 89 | "Description": "The name of the sample identifier"}) 90 | value: str = field( 91 | metadata={ 92 | "title": "Identifier value", 93 | "Description": "The value of the sample identifier"}) 94 | 95 | 96 | @dataclass 97 | class CheckResult: 98 | """ 99 | A result of some check the workflow has performed. 100 | 101 | This can be at sample or workflow level. 102 | """ 103 | 104 | check_category: str = field( 105 | metadata={ 106 | "title": "Check category", 107 | "description": "The category of the check"}) 108 | check_name: str = field( 109 | metadata={ 110 | "title": "Check name", 111 | "description": "The name of the check"}) 112 | check_pass: bool = field( 113 | metadata={ 114 | "title": "Check pass", 115 | "description": "If true the check has passed"}) 116 | check_threshold: str | None = field( 117 | default=None, metadata={ 118 | "title": "Check threshold", 119 | "description": "The threshold for the check, useful for reporting later"}) 120 | 121 | categories = {} 122 | 123 | def friendly_check_category(self): 124 | """Convert category to string.""" 125 | if self.check_category not in self.categories: 126 | raise ValueError(f"{self.check_category} has no friendly name") 127 | return self.categories[self.check_category] 128 | 129 | def friendly_check_name(self): 130 | """Convert check name to string.""" 131 | return self.check_name.replace("_", " ").capitalize() 132 | 133 | 134 | @dataclass 135 | class ResultsContents: 136 | """Placeholder class for results contents.""" 137 | 138 | pass 139 | 140 | 141 | @dataclass 142 | class Sample: 143 | """A sample sheet entry and its corresponding checks and related results.""" 144 | 145 | alias: str = field( 146 | metadata={ 147 | "title": "Sample alias", 148 | "description": "The alias for the sample given by the user"}) 149 | sample_type: SampleType = field( 150 | metadata={ 151 | "title": "Sample type", 152 | "description": "The type of the sample"}) 153 | sample_pass: bool = field( 154 | metadata={ 155 | "title": "Sample pass", 156 | "description": "If true the sample has passed workflow checks"}) 157 | barcode: str | None = field( 158 | default=None, 159 | metadata={ 160 | "title": "Sample barcode", 161 | "description": "The physical barcode assigned to the sample"}) 162 | additional_identifiers: List[SampleIdentifier] = field( 163 | default_factory=list, metadata={ 164 | "title": "Additional sample identifiers", 165 | "description": "Additional identifiers for the sample"}) 166 | sample_checks: list[CheckResult] = field( 167 | default_factory=list, metadata={ 168 | 
"title": "Sample checks", 169 | "description": "An array of checks performed on the sample"}) 170 | results: ResultsContents | None = field( 171 | default=None, metadata={ 172 | "title": "Sample results", 173 | "description": "Further specific workflow results for this sample"}) 174 | config: Dict[str, Any] | None = field( 175 | default=None, metadata={ 176 | "title": "Sample configuration", 177 | "description": """Sample specific config parameters 178 | used for running analysis"""}) 179 | 180 | def __post_init__(self): 181 | """Determine overall status for a sample given the individual check results.""" 182 | self.sample_pass = all( 183 | check.check_pass for check in self.sample_checks) 184 | 185 | def get_sample_identifier(self, sample_identifier): 186 | """Get a sample identifier given the identifier name.""" 187 | for identifier in self.additional_identifiers: 188 | if identifier.name == sample_identifier: 189 | return identifier.value 190 | raise KeyError("Sample identifier not found") 191 | 192 | def set_sample_identifier(self, name, value): 193 | """Set a sample identifier.""" 194 | sample_identifier = SampleIdentifier( 195 | name=name, 196 | value=value) 197 | self.additional_identifiers.append(sample_identifier) 198 | return self.additional_identifiers 199 | 200 | def to_json(self, filename): 201 | """Save class as JSON.""" 202 | with open(filename, 'w') as f: 203 | json.dump(asdict(self), f, default=str, indent=2) 204 | 205 | def get_reportable_qc_status(self, max_criteria=4): 206 | """Store global status of the sample and list of QC criteria to show. 207 | 208 | :params max_criteria: Maximum number of criteria to be reported. 209 | """ 210 | # Store global status: pass/ failed 211 | qc_global_status = {"status": self.sample_pass, "scope": "QC status"} 212 | qc_criteria = [] 213 | if self.sample_pass: 214 | qc_criteria.append( 215 | {"status": self.sample_pass, "scope": "All acceptance criteria met"} 216 | ) 217 | else: 218 | # Report failed criteria until a maximum value 219 | for qc in self.sample_checks: 220 | if not qc.check_pass: # append criteria if failed 221 | qc_criteria.append( 222 | { 223 | "status": qc.check_pass, 224 | "category": qc.friendly_check_category(), 225 | "scope": qc.friendly_check_name(), 226 | } 227 | ) 228 | if len(qc_criteria) > max_criteria: 229 | # Replace all the failed criteria, with a sentence with the number 230 | # instead of listing all of them. 231 | # Set status to False as more than max_criteria are failed. 232 | qc_criteria = [ 233 | { 234 | "status": False, 235 | "scope": f"{len(qc_criteria)} acceptance criteria", 236 | }, 237 | ] 238 | return qc_global_status, qc_criteria 239 | 240 | 241 | @dataclass 242 | class RunStats: 243 | """Basic run statistics for the entire run.""" 244 | 245 | total_reads: int | None = field( 246 | default=None, metadata={ 247 | "title": "Total reads", 248 | "description": "Total number of reads on run"}) 249 | total_ambiguous_reads: int | None = field( 250 | default=None, metadata={ 251 | "title": "Total ambiguous reads", 252 | "description": "Number of reads of unknown provenance"}) 253 | total_unaligned_reads: int | None = field( 254 | default=None, metadata={ 255 | "title": "Total unaligned reads", 256 | "description": "Number of unaligned reads"}) 257 | 258 | 259 | @dataclass 260 | class WorkflowResult(WorkflowBaseModel): 261 | """ 262 | Definition for results that will be returned by this workflow. 
263 | 264 | This structure will be passed through by Gizmo speaking clients 265 | as WorkflowInstance.results. 266 | """ 267 | 268 | samples: list[Sample] = field( 269 | metadata={ 270 | "title": "Samples", 271 | "description": "Samples in this workflow instance"}) 272 | workflow_pass: bool | None = field( 273 | default=None, metadata={ 274 | "title": "Workflow pass", 275 | "description": "True if this workflow instance passes all checks"}) 276 | workflow_checks: list[CheckResult] = field( 277 | default_factory=list, metadata={ 278 | "title": "Workflow checks", 279 | "description": "An array of checks performed on the workflow instance"}) 280 | run_stats: RunStats | None = field( 281 | default=None, metadata={ 282 | "title": "Samples", 283 | "description": "Basic run statistics"}) 284 | client_fields: dict[str, Any] | None = field( 285 | default_factory=dict, metadata={ 286 | "title": "Client fields", 287 | "description": "Arbitrary key-value pairs provided by the client"}) 288 | versions: dict[str, Any] | None = field( 289 | default_factory=dict, metadata={ 290 | "title": "Analysis tool versions", 291 | "description": """Key-value pairs collecting the 292 | software used and the corresponding versions"""}) 293 | params: dict[str, Any] | None = field( 294 | default_factory=dict, metadata={ 295 | "title": "Pertinent parameters", 296 | "description": """Key-value pairs with the 297 | options chosen by the user"""}) 298 | 299 | def load_client_fields(self, filename): 300 | """Load client fields.""" 301 | with open(filename) as f: 302 | try: 303 | client_fields = json.loads(f.read()) 304 | # convert any lists into strings for display 305 | for key, value in client_fields.items(): 306 | if isinstance(value, list): 307 | client_fields[key] = ', '.join(value) 308 | except json.decoder.JSONDecodeError: 309 | client_fields = {"error": "Error parsing client fields file."} 310 | 311 | self.client_fields = client_fields 312 | return self.client_fields 313 | 314 | def load_params(self, params_json, keep=None): 315 | """Create a workflow params dict.""" 316 | params_json = Path(params_json) 317 | if keep is None: 318 | keep = [] 319 | if not params_json.is_file(): 320 | raise FileNotFoundError(f"No such file: {params_json}") 321 | with open(params_json, "r") as f: 322 | try: 323 | params_dict = json.loads(f.read()) 324 | self.params = { 325 | k: v for k, v in params_dict.items() if k in set(keep) 326 | } 327 | return self.params 328 | except ValueError: 329 | raise ValueError(f"Invalid JSON file: {params_json}") 330 | 331 | def load_versions(self, versions_path): 332 | """Create a version list of dict.""" 333 | versions_path = Path(versions_path) 334 | if not versions_path.exists(): 335 | raise FileNotFoundError(f"No such file: {versions_path}") 336 | 337 | if versions_path.is_dir(): 338 | version_files = [ 339 | vp for vp in versions_path.iterdir() if vp.is_file() 340 | ] 341 | elif versions_path.is_file(): 342 | version_files = [versions_path] 343 | else: 344 | raise IOError(f"{versions_path} should be either a directory or a file") 345 | for fname in version_files: 346 | versions = {} 347 | with open(fname, "r", encoding="utf-8") as fh: 348 | for line in fh.readlines(): 349 | name, version = line.strip().split(",") 350 | versions[name] = version 351 | self.versions = versions 352 | return self.versions 353 | 354 | def to_json(self, filename): 355 | """Save class as JSON.""" 356 | with open(filename, 'w') as f: 357 | json.dump(asdict(self), f, default=str, indent=2) 358 | 
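A minimal sketch (not part of the workflow itself) of how the models above compose, assuming `bin/` is on `PYTHONPATH` so that the `workflow_glue` package is importable; the check name, alias and output path are illustrative only:

from workflow_glue.models.common import (
    CheckResult, Sample, SampleType, WorkflowResult)

checks = [
    CheckResult(
        check_category="sample_sheet",   # illustrative category
        check_name="barcode_found",      # illustrative check name
        check_pass=True),
]
sample = Sample(
    alias="sample_1",                    # illustrative alias
    sample_type=SampleType.test_sample,
    sample_pass=True,                    # recomputed from sample_checks in __post_init__
    sample_checks=checks)
result = WorkflowResult(samples=[sample])
result.to_json("results.json")           # illustrative output path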
-------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 
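    # For example (illustrative): two BAMs whose headers both contain
    #   @SQ SN:chr20 LN:64444167
    # are treated as consistent even if their SQ.UR fields differ, while a
    # third file with no @SQ lines at all would flag `mixed_headers`.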
26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
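# For example, the UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF; read with the
# default codec, the first column name would come back as '\ufeffbarcode'
# rather than 'barcode'.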
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | from ..util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | # Common variables 11 | REF_EXTENSIONS = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 12 | DATA_TYPES_LISTS = { 13 | "bam": ["bam"], 14 | "bam_idx": ["bam.bai"], 15 | "cram": ["cram"], 16 | "cram_idx": ["cram.crai"], 17 | "vcf": ["vcf", "vcf.gz"], 18 | "vcf_idx": ["vcf.gz.tbi", "vcf.gz.csi"], 19 | "bcf": ["bcf"], 20 | "bcf_idx": ["bcf.csi"], 21 | "gtf": ["gtf", "gtf.gz"], 22 | "gtf_idx": ["gtf.gz.tbi"], 23 | "gff": ["gff", "gff.gz", "gff3", "gff3.gz"], 24 | "gff_idx": ["gff.gz.tbi", "gff3.gz.tbi"], 25 | "bed": ["bed", "bed.gz"], 26 | "bed_idx": ["bed.gz.tbi"], 27 | "bedmethyl": ["bedmethyl", "bedmethyl.gz"], 28 | "bedmethyl_idx": ["bedmethyl.gz.tbi"], 29 | "ref": REF_EXTENSIONS, 30 | } 31 | DATA_TYPES = { 32 | ext: ftype for ftype, extlist in DATA_TYPES_LISTS.items() for ext in extlist 33 | } 34 | 35 | # Data by idx 36 | DATA_INDEXES_FMT = { 37 | fmt: f"{fmt}_idx" for fmt, dtype in DATA_TYPES.items() if "_idx" not in dtype 38 | } 39 | 40 | # Assign each format to its index 41 | INDEX_PAIRS = { 42 | "bam": ("bai",), 43 | "cram": ("crai",), 44 | "vcf": ("tbi", "csi"), 45 | "bcf": ("csi",), 46 | "bed": ("tbi",), 47 | "bedmethyl": ("tbi",), 48 | "gff": ("tbi",), 49 | "gtf": ("tbi",), 50 | } 51 | 52 | 53 | class TrackBuilder: 54 | """Class that builds an IGV track.""" 55 | 56 | def __init__(self): 57 | """Initialize properties for interval track.""" 58 | # Reference properties 59 | self.ref = None 60 | self.fai = None 61 | self.gzi = None 62 | # Samples info 63 | self.samples = {} 64 | # Track properties 65 | self.igv_json = {"reference": {}, "tracks": []} 66 | self.track_type = { 67 | "bam": "alignment", 68 | "cram": "alignment", 69 | "bcf": "variant", 70 | "vcf": "variant", 71 | "bedmethyl": "annotation", 72 | "bed": "annotation", 73 | "gtf": "annotation", 74 | "gff": "annotation", 75 | } 76 | # Here we save aliases of file formats that IGV.js 77 | # wants and that do not match the input file extension. 78 | self.igv_fmt_alias = {"gff": "gff3"} 79 | # lookup of extra options for each data type 80 | self.extra_opts_lookups = { 81 | "bam": {}, 82 | "cram": {}, 83 | "bcf": {}, 84 | "vcf": {}, 85 | "bed": {}, 86 | "bedmethyl": {}, 87 | "gtf": {}, 88 | "gff": {}, 89 | } 90 | 91 | def add_ref(self, ref=None): 92 | """Add reference file, unless already defined.""" 93 | if self.ref: 94 | raise Exception( 95 | f"Reference genome has already been set to {self.ref}.\n" 96 | "Only one reference FASTA file is expected." 
97 | ) 98 | else: 99 | self.ref = ref 100 | 101 | def add_ref_index(self, ref_index=None): 102 | """Add reference index if valid.""" 103 | basename = Path(self.ref).name 104 | idx_basename = Path(ref_index).name 105 | if idx_basename == f"{basename}.fai": 106 | self.fai = ref_index 107 | if idx_basename == f"{basename}.gzi" and basename.endswith(".gz"): 108 | self.gzi = ref_index 109 | 110 | def parse_fnames(self, fofn): 111 | """Parse list with filenames and return them grouped. 112 | 113 | :param fofn: File with list of file names (one per line) 114 | """ 115 | tmp_samples = {} 116 | with open(fofn, "r") as f: 117 | for line in f: 118 | # If the line contains the sample name, prepare the data structure 119 | if "," in line: 120 | sample, fname = line.strip().split(",") 121 | if sample not in tmp_samples: 122 | tmp_samples[sample] = SampleBundle(sample=sample) 123 | tmp_samples[sample].append(fname) 124 | else: 125 | # Otherwise, assign everything to NO_SAMPLE 126 | # Files will still be displayed, but in no specific order. 127 | fname = line.strip() 128 | if any(fname.endswith(ext) for ext in REF_EXTENSIONS): 129 | self.add_ref(ref=fname) 130 | elif fname.endswith(".fai") or fname.endswith(".gzi"): 131 | self.add_ref_index(ref_index=fname) 132 | else: 133 | if "NO_SAMPLE" not in tmp_samples.keys(): 134 | tmp_samples["NO_SAMPLE"] = SampleBundle(sample="NO_SAMPLE") 135 | tmp_samples["NO_SAMPLE"].append(fname) 136 | # Re-order samples in dict and add them to the list, leaving 137 | # NO_SAMPLE as last 138 | sorted_samples = ( 139 | sorted([sample for sample in tmp_samples.keys() if sample != 'NO_SAMPLE']) 140 | ) 141 | if 'NO_SAMPLE' in tmp_samples.keys(): 142 | sorted_samples += ['NO_SAMPLE'] 143 | for sample in sorted_samples: 144 | self.samples[sample] = tmp_samples[sample] 145 | 146 | def build_igv_json(self): 147 | """Ensure there is a reference genome.""" 148 | if not self.ref: 149 | raise ValueError( 150 | "No reference file (i.e. file ending in one of " 151 | f"{REF_EXTENSIONS} was found)." 152 | ) 153 | # Evaluate that a bgzipped reference has the appropriate index. 154 | if self.ref.endswith(".gz") and not self.gzi: 155 | raise ValueError(f"GZI reference index for {self.ref} not found.") 156 | 157 | # Create the base track if there is a reference genome. 158 | self.igv_json["reference"] = { 159 | "id": "ref", 160 | "name": "ref", 161 | "wholeGenomeView": False, 162 | "fastaURL": self.ref, 163 | } 164 | if self.fai: 165 | self.igv_json["reference"]["indexURL"] = self.fai 166 | if self.gzi: 167 | self.igv_json["reference"]["compressedIndexURL"] = self.gzi 168 | 169 | # Add samples data now 170 | for sample, bundle in self.samples.items(): 171 | bundle.process_data() 172 | # Add the bundled data to the tracks 173 | for fname, index, file_fmt in bundle.data_bundles: 174 | self.add_track( 175 | fname, 176 | file_fmt, 177 | sample_name=sample if sample != "NO_SAMPLE" else None, 178 | index=index, 179 | extra_opts=self.extra_opts_lookups[file_fmt], 180 | ) 181 | 182 | def add_track(self, infile, file_fmt, sample_name=None, index=None, extra_opts={}): 183 | """Add a track to an IGV json. 184 | 185 | This function takes an input file, an optional index file, its 186 | file format and additional extra options for the track. 
187 | 188 | :param infile: input file to create a track for 189 | :param file_fmt: input file track type 190 | :param sample_name: Name of the sample to display in the track name 191 | :param index: index for the input file 192 | :param extra_opts: dict of extra options for the track 193 | :return: dict with track options 194 | """ 195 | # Define track name depending on whether the sample ID is provided 196 | track_name = Path(infile).name 197 | if sample_name: 198 | track_name = f"{sample_name}: {Path(infile).name}" 199 | track_dict = { 200 | "name": track_name, 201 | "type": self.track_type[file_fmt], 202 | "format": self.igv_fmt_alias.get(file_fmt, file_fmt), 203 | "url": infile, 204 | } 205 | # add the index, if present 206 | if index: 207 | track_dict["indexURL"] = index 208 | track_dict.update(extra_opts) 209 | self.igv_json["tracks"] += [track_dict] 210 | 211 | def add_locus(self, locus): 212 | """Add target locus to the json.""" 213 | self.igv_json["locus"] = locus 214 | 215 | def add_extra_opts( 216 | self, 217 | extra_alignment_opts=None, 218 | extra_variant_opts=None, 219 | extra_interval_opts=None, 220 | ): 221 | """Import extra options from json files.""" 222 | if extra_alignment_opts is not None: 223 | with open(extra_alignment_opts, "r") as f: 224 | extra_alignment_opts_json = json.load(f) 225 | for ftype in ["bam", "cram"]: 226 | self.extra_opts_lookups[ftype] = extra_alignment_opts_json 227 | if extra_variant_opts is not None: 228 | with open(extra_variant_opts, "r") as f: 229 | extra_variant_opts_json = json.load(f) 230 | for ftype in ["vcf", "bcf"]: 231 | self.extra_opts_lookups[ftype] = extra_variant_opts_json 232 | if extra_interval_opts is not None: 233 | with open(extra_interval_opts, "r") as f: 234 | extra_interval_opts_json = json.load(f) 235 | for ftype in ["bed", "bedmethyl", "gff", "gtf"]: 236 | self.extra_opts_lookups[ftype] = extra_interval_opts_json 237 | 238 | 239 | class SampleBundle: 240 | """Sample data class. 241 | 242 | This class stores the data for multiple tracks for a 243 | single sample, then is used to generate a collection of 244 | IGV.js tracks. 
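
    For example (illustrative file names), a bundle containing
    "sample_1.cram", "sample_1.cram.crai", "sample_1.vcf.gz" and
    "sample_1.vcf.gz.tbi" is resolved into an alignment entry and a
    variant entry, each carrying its index, which TrackBuilder then
    turns into IGV tracks with indexURL set.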
245 | """ 246 | 247 | def __init__(self, sample): 248 | """Initialize properties for a sample.""" 249 | self.sample = sample 250 | self.infiles = [] 251 | self.data_bundles = [] 252 | 253 | def append(self, fname): 254 | """Add a new raw file to the bundle.""" 255 | self.infiles.append(fname) 256 | 257 | def process_data(self): 258 | """Process input files.""" 259 | fbasenames = [Path(fname).name for fname in self.infiles] 260 | ftypes = [self.classify_files(bname) for bname in fbasenames] 261 | self.data_bundles = self.pair_file_with_index(self.infiles, fbasenames, ftypes) 262 | 263 | @staticmethod 264 | def classify_files(fname): 265 | """Classify inputs.""" 266 | for extension, ftype in DATA_TYPES.items(): 267 | if fname.endswith(f".{extension}"): 268 | return ftype 269 | 270 | @staticmethod 271 | def pair_file_with_index(infiles, fbasenames, ftypes): 272 | """Clump files with their indexes.""" 273 | # Collect data by group type 274 | groups = {ftype: {"basenames": [], "paths": []} for ftype in set(ftypes)} 275 | # Group each file by its type and base name 276 | for ftype, fbasename, fname in zip(ftypes, fbasenames, infiles): 277 | groups[ftype]["basenames"] += [fbasename] 278 | groups[ftype]["paths"] += [fname] 279 | 280 | # Output bundles 281 | outputs = [] 282 | # Start matching the variant files 283 | for ftype, itype in DATA_INDEXES_FMT.items(): 284 | # Ignore file formats that are not present in the bundle. 285 | if ftype not in groups: 286 | continue 287 | # Make pairs of files. 288 | for fbasename, fpath in zip( 289 | groups[ftype]["basenames"], groups[ftype]["paths"] 290 | ): 291 | # Construct potential index file names based on basename of input files 292 | idx_basenames = set( 293 | [f"{fbasename}.{idx}" for idx in INDEX_PAIRS[ftype]] 294 | ) 295 | # Find which indexes are available 296 | if itype in groups.keys(): 297 | idx_basenames = list( 298 | idx_basenames.intersection(set(groups[itype]["basenames"])) 299 | ) 300 | # Get the first index (if there are more than one, 301 | # it doesn't matter) 302 | bname = idx_basenames[0] 303 | idx_fn = groups[itype]["paths"][ 304 | groups[itype]["basenames"].index(bname) 305 | ] 306 | outputs.append([fpath, idx_fn, ftype]) 307 | # Otherwise, return only the simple file. 
308 | else: 309 | outputs.append([fpath, None, ftype]) 310 | return outputs 311 | 312 | 313 | def main(args): 314 | """Run the entry point.""" 315 | logger = get_named_logger("configIGV") 316 | 317 | # parse the FOFN 318 | igv_builder = TrackBuilder() 319 | 320 | # Add the additional track configurations 321 | igv_builder.add_extra_opts( 322 | extra_alignment_opts=args.extra_alignment_opts, 323 | extra_variant_opts=args.extra_variant_opts, 324 | extra_interval_opts=args.extra_interval_opts 325 | ) 326 | 327 | # Import files 328 | igv_builder.parse_fnames(args.fofn) 329 | 330 | # initialise the IGV options dict with the reference options 331 | igv_builder.build_igv_json() 332 | 333 | # Add locus information 334 | if args.locus is not None: 335 | igv_builder.add_locus(args.locus) 336 | 337 | json.dump(igv_builder.igv_json, sys.stdout, indent=4) 338 | 339 | logger.info("Printed IGV config JSON to STDOUT.") 340 | 341 | 342 | def argparser(): 343 | """Argument parser for entrypoint.""" 344 | parser = wf_parser("configure_igv") 345 | parser.add_argument( 346 | "--fofn", 347 | required=True, 348 | help=( 349 | "File with list of names of reference / XAM / VCF files and indices " 350 | "(one filename per line)" 351 | ), 352 | ) 353 | parser.add_argument( 354 | "--locus", 355 | help="Locus string to set initial genomic coordinates to display in IGV", 356 | ) 357 | parser.add_argument( 358 | "--extra-alignment-opts", 359 | help="JSON file with extra options for alignment tracks", 360 | ) 361 | parser.add_argument( 362 | "--extra-variant-opts", 363 | help="JSON file with extra options for variant tracks", 364 | ) 365 | parser.add_argument( 366 | "--extra_interval_opts", 367 | help="JSON file with extra options for interval tracks", 368 | ) 369 | return parser 370 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for 
entrypoint."""
52 |     parser = wf_parser("get_max_depth_locus")
53 |     parser.add_argument(
54 |         "depths_bed",
55 |         type=Path,
56 |         help="path to mosdepth regions depth file (can be compressed)",
57 |     )
58 |     parser.add_argument(
59 |         "locus_size", type=int, help="size of the locus in basepairs (e.g. '2000')"
60 |     )
61 |     return parser
62 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/wfg_helpers/reheader_samstream.py:
--------------------------------------------------------------------------------
1 | """Reheader a SAM in a stream.
2 | 
3 | When using the bam2fq -> minimap2 pattern for (re)aligning BAM data, we
4 | lose any existing RG and PG headers. This is particularly egregious when
5 | handling basecalled data as lines related to dorado basecalling settings
6 | as well as dorado RG headers are lost; orphaning RG tags in the reads.
7 | This is problematic for downstream analyses that would like to read the
8 | XAM header to intelligently determine how to handle the reads based on
9 | the basecaller model and basecaller configuration.
10 | 
11 | This script handles:
12 |   - Inserting RG, PG and CO lines from an existing XAM header into the
13 |     header of the SAM emitted from minimap2's alignment stream
14 |   - Inserting a PG header to indicate that a call to bam2fq was made
15 |   - Updating the first streamed PG.PP parent tag with the last PG.ID
16 |     of the existing XAM header to maintain a chain of custody
17 |   - Updating any streamed PG.ID (and PG.PP) tags to avoid collisions
18 |     with inserted PG.ID
19 | 
20 | Handling collisions may seem like overkill, but it is anticipated that
21 | this script will be called immediately after minimap2; any previous
22 | attempt to use minimap2 will lead to ambiguity. This would be the
23 | expected case where users have used wf-basecalling or wf-alignment to
24 | align a set of reads, only to realign them to another reference (e.g.
25 | via wf-human-variation). Arguably, we should remove older references to
26 | minimap2 as they will have been invalidated by the call to bam2fq, but
27 | removing PG records and sticking the PG chain back together seems more
28 | fraught with annoying future bugs than simply resolving conflicts.
29 | 
30 | This script will explode on a stream that contains:
31 |   - PG lines in the original header where the last PG in the chain is
32 |     ambiguous, or where the parent PP IDs are not injective
33 |   - PG lines in the stream that do not appear in the order of their
34 |     chain (that is, if a PG.PP refers to a PG.ID that has not been
35 |     encountered yet)
36 | 
37 | SQ lines are retained after an HD line. That is to say, the most recent
38 | set of SQ lines observed after an HD will appear in the final output.
39 | SQ, RG, PG and CO lines are emitted as a group together, with elements
40 | written out in the order observed.
41 | 
42 | PG lines are naively appended to the last PG element in the chain. No
43 | attempt is made to keep multiple program chains intact as this can lead
44 | to bloated headers. Broken PG metadata is a known problem (see
45 | samtools/hts-specs#275) but one that is preferable to headers that
46 | become unwieldy to process: there IS an upper limit to a SAM
47 | header's size after all.
48 | 
49 | This script takes advantage of minimap2's SAM output to immediately
50 | reheader the stream before any downstream calls to other programs pollute
51 | the PG header.
This script is a little overkill but attempts to be robust 52 | with handling PG collisions and more obviously encapsulates reheadering 53 | behaviour, and leaves some room to do more clever things as necessary. 54 | """ 55 | from shutil import copyfileobj 56 | import sys 57 | 58 | from ..util import wf_parser # noqa: ABS101 59 | 60 | 61 | class SamHeader: 62 | """An overkill container to manage merging PG lines in SAM headers. 63 | 64 | Collision handling is simple. If a PG.ID is duplicated by the stream 65 | then we add a suffix to its name and keep an eye out for the 66 | corresponding PG.PP later. We assume that headers emitted by the 67 | stream are chronological because this script should not be called as 68 | part of any complicated pipework other than immediately following 69 | minimap2. 70 | """ 71 | 72 | def __init__(self): 73 | """Initialise a collision aware PG container.""" 74 | self.remapped_pgids = {} 75 | self.collision_suffix = 0 76 | 77 | # Default HD, in case the new stream does not provide one 78 | self.hd = "@HD\tVN:1.6\tSO:unknown" 79 | 80 | # We'll merge RG, CO and PG 81 | self.rg_records = [] 82 | self.co_records = [] 83 | self.pg_records = [] 84 | 85 | # We keep the most recently observed block of SQ records by 86 | # resetting SQ on the first SQ seen after non-SQ. We cannot 87 | # rely on HD being emitted (as minimap2 does not do this!) 88 | self.sq_records = [] 89 | self.reset_sq = False 90 | 91 | self.observed_rgids = set() 92 | self.observed_pgids = set() 93 | self.last_pgid = None 94 | 95 | @staticmethod 96 | def str_to_record(line): 97 | """Return an appropriate struct for a given string record.""" 98 | try: 99 | record_type, record_data = line.strip().split('\t', 1) 100 | except ValueError: 101 | raise Exception(f"Record type could not be determined: {line}") 102 | 103 | if len(record_type) > 3: 104 | raise Exception(f"Record type malformed: {record_type}") 105 | 106 | record = {} 107 | if record_type in ["@HD", "@CO", "@SQ"]: 108 | return record_type, record_data 109 | elif record_type in ["@RG", "@PG"]: 110 | for field in record_data.strip().split('\t'): 111 | k, v = field.split(':', 1) 112 | if len(k) == 2 and k[0].isalpha() and k[1].isalnum(): 113 | record[k] = v 114 | else: 115 | raise Exception(f"{record_type} with invalid tag: '{k}'") 116 | if "ID" not in record: 117 | raise Exception(f"{record_type} with no ID: {record_data}") 118 | return record_type, record 119 | else: 120 | raise Exception(f"Unknown record type: {line}") 121 | 122 | @staticmethod 123 | def record_to_str(record_type, record_data): 124 | """Form a string from a header record.""" 125 | if record_type in ["@PG", "@RG"]: 126 | tags = [f"{k}:{v}" for k, v in record_data.items()] 127 | return f"{record_type}\t" + '\t'.join(tags) 128 | elif record_type in ["@SQ", "@CO"]: 129 | return f"{record_type}\t{record_data}" 130 | 131 | @staticmethod 132 | def resolve_pg_chain(pg_dicts): 133 | """Check links between PG.ID and PP.ID, exploding if inconsistent.""" 134 | links = {} 135 | # Document links between all ID and their PP parent 136 | pgids_without_ppid = 0 137 | for pgd in pg_dicts: 138 | pgid = pgd["ID"] 139 | pgpp = pgd.get("PP") 140 | links[pgid] = pgpp 141 | if pgpp is None: 142 | pgids_without_ppid += 1 143 | if len(links) > 0: 144 | # If there are links, exactly one should have a None parent 145 | # to indicate the first PG in the chain. Explode if we see 146 | # no head or multiple heads. 
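            # As an illustrative sketch (program names are hypothetical):
            # PG records {ID: basecaller}, {ID: samtools, PP: basecaller} and
            # {ID: minimap2, PP: samtools} yield
            # links = {basecaller: None, samtools: basecaller, minimap2: samtools},
            # i.e. exactly one head (basecaller, the record with no PP) and no cycle.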
147 | if pgids_without_ppid == 0: 148 | raise Exception("PG chain does not have a head.") 149 | elif pgids_without_ppid > 1: 150 | raise Exception("PG chain has multiple heads.") 151 | for source in links: 152 | head = source 153 | path = [head] 154 | while True: 155 | head = links[head] 156 | if head is None: 157 | break 158 | if head in path: 159 | path.append(head) 160 | raise Exception(f"PG chain appears to contain cycle: {path}") 161 | path.append(head) 162 | # This function is only really called to catch any explosions 163 | # but we'll return the links here as it is useful for testing 164 | return links 165 | 166 | def _bump_pg_collider(self): 167 | """Alter the collision suffix after determining a collision.""" 168 | self.collision_suffix += 1 169 | 170 | def _uncollide_pgid(self, pgid): 171 | """Return an uncollided string for a given PG ID.""" 172 | new_pgid = f"{pgid}-{self.collision_suffix}" 173 | self.remapped_pgids[pgid] = new_pgid 174 | self._bump_pg_collider() 175 | return new_pgid 176 | 177 | def add_line(self, line): 178 | """Add a header line to the header.""" 179 | record_type, record = self.str_to_record(line) 180 | 181 | if record_type == "@HD": 182 | self.hd = f"@HD\t{record}" 183 | elif record_type == "@CO": 184 | self.co_records.append(record) 185 | elif record_type == "@SQ": 186 | if self.reset_sq: 187 | self.sq_records = [] 188 | self.reset_sq = False 189 | self.sq_records.append(record) 190 | elif record_type == "@RG": 191 | rgid = record["ID"] 192 | if rgid not in self.observed_rgids: 193 | self.observed_rgids.add(rgid) 194 | self.rg_records.append(record) 195 | elif record not in self.rg_records: 196 | # if rgid has been seen before, abort if this record is different 197 | raise Exception( 198 | f"Duplicate RG with ID '{rgid}' conflicts with previously seen RG with same ID." 
# noqa:E501 199 | ) 200 | elif record_type == "@PG": 201 | pgid = record["ID"] 202 | if pgid in self.observed_pgids: 203 | # collision, rewrite the pgid 204 | pgid = self._uncollide_pgid(pgid) 205 | record["ID"] = pgid 206 | else: 207 | self.observed_pgids.add(pgid) 208 | 209 | # maintain chain 210 | ppid = record.get("PP") 211 | if not ppid: 212 | # record has no parent, this is either 213 | # - the first record (last_pgid is None) so is the tail 214 | # - an inserted record that needs its parent to be the current tail 215 | if not self.last_pgid: 216 | self.last_pgid = pgid 217 | else: 218 | record["PP"] = self.last_pgid 219 | self.last_pgid = pgid 220 | else: 221 | if ppid not in self.observed_pgids: 222 | raise Exception( 223 | f"Encountered PG.PP '{ppid}' before observing corresponding PG.ID" # noqa:E501 224 | ) 225 | # remap parent id (if needed) 226 | record["PP"] = self.remapped_pgids.get(ppid, ppid) 227 | # set tail to this record 228 | self.last_pgid = pgid 229 | 230 | self.pg_records.append(record) 231 | 232 | if len(self.sq_records) > 0 and record_type != '@SQ': 233 | self.reset_sq = True 234 | 235 | return record 236 | 237 | def write_header(self, fh): 238 | """Write this header to a file handle.""" 239 | self.resolve_pg_chain(self.pg_records) # check PG header 240 | fh.write(f"{self.hd}\n") 241 | for sq in self.sq_records: 242 | fh.write(self.record_to_str("@SQ", sq) + '\n') 243 | for rg in self.rg_records: 244 | fh.write(self.record_to_str("@RG", rg) + '\n') 245 | for pg in self.pg_records: 246 | fh.write(self.record_to_str("@PG", pg) + '\n') 247 | for co in self.co_records: 248 | fh.write(self.record_to_str("@CO", co) + '\n') 249 | 250 | 251 | def reheader_samstream(header_in, stream_in, stream_out, args): 252 | """Run reheader_samstream.""" 253 | # read original header into container 254 | sh = SamHeader() 255 | for line in header_in: 256 | sh.add_line(line) 257 | 258 | # append user provided lines to container 259 | for line in args.insert: 260 | sh.add_line(line) 261 | 262 | # read the header portion of the minimap2 stream 263 | wrote_header = False 264 | for line in stream_in: 265 | if line[0] != '@': 266 | # write out header on first alignment 267 | sh.write_header(stream_out) 268 | wrote_header = True 269 | # and actually write the first alignment 270 | stream_out.write(line) 271 | break 272 | sh.add_line(line) 273 | 274 | # Pass through the rest of the alignments. 275 | # I toyed with a few ways of doing this: 276 | # - A trivial iter over the input file was slow. presumably as we incurred some 277 | # overhead calling read() and write() and decoding more than other methods. 278 | # - os.read/write avoids dealing with higher level python read/write but requires 279 | # file descriptors which rules out non-file-like objects. this made testing more 280 | # annoying as StringIO does not have a file descriptor. we could have mocked fds 281 | # but i was not happy with the discrepancy between real and test execution. 282 | # - copyfileobj with the stream_in.buffer would also avoid some of the higher 283 | # level text handling but would require all tests to provide inputs that have 284 | # an underlying binary buffer. 
it was also not possible to seek the buffer to 285 | # the position of the text stream as we've used next() to iterate over the 286 | # header lines, fixing this would have required rewriting of the header 287 | # handling or keeping track of the position in the stream ourselves which 288 | # just seemed unnecessary overkill given how we expect this program to be used. 289 | # copyfileobj on the text streams is more efficient than merely iterating the file 290 | # and dumping the lines out and seems to do the job. this keeps the code and tests 291 | # simple with minimal additional cost to performance. i anticipate any overhead of 292 | # this program will be dwarfed by that of minimap2/samtools sort anyway. 293 | # increasing the buffer size gave worse performance in my limited testing so we 294 | # leave it as the default here. 295 | copyfileobj(stream_in, stream_out) 296 | 297 | # If there were no alignments, we won't have hit the != '@' case while reading the stream, 298 | # and we won't have written the header out. Write a header if we haven't already. 299 | if not wrote_header: 300 | sh.write_header(stream_out) 301 | 302 | 303 | def argparser(): 304 | """Argument parser for entrypoint.""" 305 | parser = wf_parser("reheader_samstream") 306 | parser.add_argument("header_in") 307 | parser.add_argument("--insert", action="append", default=[]) 308 | return parser 309 | 310 | 311 | def main(args): 312 | """reheader_samstream default entry point.""" 313 | with open(args.header_in) as header_in: 314 | reheader_samstream(header_in, sys.stdin, sys.stdout, args) 315 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $extra_data" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE") 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Helper workflow for basecalling nanopore sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | In brief, this workflow can be used to perform: 2 | 3 | + Basecalling of a directory of pod5 or fast5 signal data 4 | + Basecalling in Duplex mode 5 | + Modified basecalling 6 | + Basecalling in real time 7 | + Output basecalled sequences in various formats: FASTQ, CRAM or unaligned BAM 8 | + If a reference is provided, basecalled reads will be aligned to it with 9 | `minimap2` and a sorted, indexed BAM or CRAM will be output instead of 10 | unaligned sequences.
11 | -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 256GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 64GB 10 | 11 | Approximate run time: Variable depending on coverage, genome size, model of choice and GPU model. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on the command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources; 8 | therefore, Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://docs.docker.com/get-started/) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository into the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-basecalling --help 33 | ``` 34 | To update a workflow to the latest version on the command line, use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-basecalling 38 | ``` 39 | 40 | A demo dataset is provided for testing the workflow. 41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz 44 | tar -xzvf wf-basecalling-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-basecalling \ 49 | --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \ 50 | --dorado_ext 'pod5' \ 51 | --input 'wf-basecalling-demo/input' \ 52 | --ref 'wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta' \ 53 | --remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2' \ 54 | -profile standard 55 | ``` 56 | 57 | For further information about running a workflow on 58 | the command line, see https://labs.epi2me.io/wfquickstart/ 59 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow is designed to take signal data (FAST5 or POD5 files) produced by [Oxford Nanopore Technologies](https://nanoporetech.com/) devices.
4 | 5 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | This workflow accepts a folder containing FAST5 or POD5 files as input. 2 | The folder may contain other folders of FAST5 or POD5 files and all files will be processed by the workflow. 3 | 4 | -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | input | string | Directory containing FAST5 (or POD5) signal for basecalling. | This directory will be searched recursively. All FAST5 or POD5 files (depending on which extension you select in the Basecalling Options) in this directory or any subdirectory (no matter how deep) will be basecalled. | | 6 | | ref | string | Optional reference FASTA file to align basecalled reads to. | Without a reference, basecalls are output to unaligned CRAM. When using a reference, take care to retain this FASTA file as the output CRAM file cannot be read without the reference it was aligned to. | | 7 | 8 | 9 | ### Output Options 10 | 11 | | Nextflow parameter name | Type | Description | Help | Default | 12 | |--------------------------|------|-------------|------|---------| 13 | | out_dir | string | Directory for output of all files. | | output | 14 | | sample_name | string | Sample name to prefix file names of workflow outputs. | | SAMPLE | 15 | | output_fmt | string | Desired file format of files created by basecalling and alignment. | FASTQ can only be output when a reference has not been provided. Aligned output will always be written to CRAM unless BAM is selected. | cram | 16 | | igv | boolean | Visualize outputs in the EPI2ME IGV visualizer. | Enabling this option will visualize the output alignment files in the EPI2ME desktop app IGV visualizer. | False | 17 | 18 | 19 | ### Basecalling options 20 | 21 | | Nextflow parameter name | Type | Description | Help | Default | 22 | |--------------------------|------|-------------|------|---------| 23 | | basecaller_cfg | string | Name of the model to use for converting signal. | Required for basecalling. The model list only shows models that are compatible with this workflow. | | 24 | | duplex | boolean | Run the basecaller in duplex mode. | By default the workflow conducts simplex basecalling. If you used a chemistry and flowcell combination that supports duplex reads, you may switch this option on. This option is incompatible with the watch_path option due to the way the input files must be traversed in order to find duplex pairs. | False | 25 | | remora_cfg | string | Name of the model to use for calling modified bases. | Required for calling modified bases while basecalling. The model list only shows models that are compatible with this workflow. | | 26 | | dorado_ext | string | File extension for Dorado inputs. | Set this to fast5 if you have not converted your fast5 to pod5. It is recommended to [convert existing fast5 files to pod5 for use with Dorado](https://github.com/nanoporetech/pod5-file-format/blob/master/python/README.md#pod5-convert-from-fast5). 
| pod5 | 27 | | poly_a_config | string | Provide this TOML file to turn on and configure dorado poly(A) calling. | This TOML file allows you to turn on and configure poly(A) tail calling options in dorado. This feature is described [here](https://github.com/nanoporetech/dorado?tab=readme-ov-file#polya-tail-estimation). | | 28 | | barcode_kit | string | Name of the kit to use for barcoding. Demultiplex the data. | Providing a kit here will instruct the workflow to demultiplex your 'pass' data to BAM files, which can be found in your output directory under the folder 'demuxed' in a structure reminiscent of MinKNOW. | | 29 | 30 | 31 | ### Advanced basecalling options 32 | 33 | | Nextflow parameter name | Type | Description | Help | Default | 34 | |--------------------------|------|-------------|------|---------| 35 | | output_pod5 | boolean | Save the converted POD5 when running in duplex with FAST5 inputs. | Dorado duplex only supports POD5 input. The workflow will automatically convert FAST5 input to POD5 when duplex calling. By default, converted POD5 are deleted to save disk space. Enabling this option will make the workflow output converted POD5 files to a subfolder within the output directory. | False | 36 | | qscore_filter | number | Mean qscore by which to filter reads. Inclusive such that reads with score >= qscore_filter are kept. | The mean qscore of reads is calculated by dorado and rounded to an integer by dorado and stored as a tag in dorado's SAM output. The pipeline separates reads into pass and fail categories based on this SAM tag. | 10 | 37 | | cuda_device | string | GPU device to use for basecalling [cuda:all]. | For local execution this can be used to pin GPU tasks to one (or more) specific GPU devices. Use cuda:all to use all available GPU devices, or cuda:idx[,idx,...] where idx is an index number(s) of GPU device(s) to use. | cuda:all | 38 | | basecaller_model_path | string | Override the named basecalling model with a custom basecalling model. | For typical use, users should set --basecaller_cfg which will use a named model from inside the container. Experimental or custom basecallers will not be available in the container and can be loaded from the host with --basecaller_model_path. | | 39 | | remora_model_path | string | Override the named remora model with a custom remora model. | For typical use, users should set --remora_cfg which will use a named model from inside the container. Experimental or custom models will not be available in the container and can be loaded from the host with --remora_model_path. | | 40 | | basecaller_basemod_threads | number | Number of threads to use for base modification calling. | You must set this to > 0 when using a modbase aware model. Modbase calling does not require much additional CPU and should be set carefully when using GPU servers with a small number of CPUs per GPU. | 2 | 41 | | basecaller_args | string | Additional command line arguments to pass to the basecaller process. | | | 42 | | demux_args | string | Additional command line arguments to pass to the basecaller barcoding process. 
| | | 43 | 44 | 45 | ### Multiprocessing Options 46 | 47 | | Nextflow parameter name | Type | Description | Help | Default | 48 | |--------------------------|------|-------------|------|---------| 49 | | ubam_map_threads | integer | Set max number of threads to use for aligning reads from uBAM (limited by config executor cpus) | | 8 | 50 | | ubam_sort_threads | integer | Set max number of threads to use for sorting and indexing aligned reads from uBAM (limited by config executor cpus) | | 3 | 51 | | ubam_bam2fq_threads | integer | Set max number of threads to use for uncompressing uBAM and generating FASTQ for alignment (limited by config executor cpus) | | 1 | 52 | | merge_threads | integer | Set max number of threads to use for merging BAM files (limited by config executor cpus) | | 4 | 53 | | stats_threads | integer | Set max number of threads to use for getting stats from output files. (limited by config executor cpus) | | 4 | 54 | 55 | 56 | ### Real Time Analysis Options 57 | 58 | | Nextflow parameter name | Type | Description | Help | Default | 59 | |--------------------------|------|-------------|------|---------| 60 | | watch_path | boolean | Enable to continuously watch the input directory for new input files. Reads will be analysed as they appear. | This option enables the use of Nextflow's directory watching feature to constantly monitor input directories for new files. As soon as files are written by an external process Nextflow will begin analysing these files. The workflow will accumulate data over time to produce an updating report. Real time analysis of duplex data may lead to lower duplex rates than what would have been obtained by running basecalling after sequencing. | False | 61 | | read_limit | integer | Stop processing data when a particular number of reads have been analysed. | By default the workflow will run indefinitely when using the real time watch path option. This will set the upper bound on the number of reads that will be analysed before the workflow is automatically stopped and no more data is analysed. | | 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-basecalling-report.html | Report summarising the work done by the basecalling workflow | per-sample | 6 | | Simplex alignment file of passed reads | {{ alias }}.pass.simplex.{{ format }} | BAM or CRAM file of simplex reads for the sample that pass QC filtering. | per-sample | 7 | | Duplex alignment file of passed reads | {{ alias }}.pass.duplex.{{ format }} | BAM or CRAM file of duplex reads for the sample that pass QC filtering. Created if duplex basecalling is requested. | per-sample | 8 | | Simplex alignment file index of passed reads | {{ alias }}.pass.simplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the simplex reads that pass QC filtering. | per-sample | 9 | | Duplex alignment file index of passed reads | {{ alias }}.pass.duplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the duplex reads that pass QC filtering. 
Created if duplex basecalling is requested. | per-sample | 10 | | Simplex alignment file of failed reads | {{ alias }}.fail.simplex.{{ format }} | BAM or CRAM file of simplex reads for the sample that fail QC filtering. | per-sample | 11 | | Duplex alignment file of failed reads | {{ alias }}.fail.duplex.{{ format }} | BAM or CRAM file of duplex reads for the sample that fail QC filtering. Created if duplex basecalling is requested. | per-sample | 12 | | Simplex alignment file index of failed reads | {{ alias }}.fail.simplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the simplex reads that fail QC filtering. | per-sample | 13 | | Duplex alignment file index of failed reads | {{ alias }}.fail.duplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the duplex reads that fail QC filtering. Created if duplex basecalling is requested. | per-sample | 14 | | Index of the reference FASTA file | {{ ref }}.fai | Index of the reference FASTA file. | aggregated | 15 | | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reference genome. | aggregated | 16 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Prerequisites 2 | 3 | The workflow uses [Dorado](https://github.com/nanoporetech/dorado) for basecalling, which includes the use of [Remora](https://github.com/nanoporetech/remora) for modified basecalling. 4 | Basecalling with `Dorado` requires an NVIDIA GPU with [Pascal architecture or newer](https://www.nvidia.com/en-gb/technologies/) and at least 8 GB of vRAM. 5 | 6 | #### Windows 7 | 8 | Windows should not be considered a supported operating system for wf-basecalling as we do not directly support configuration of accelerated computing through WSL2 and Docker. 9 | Although we do not offer support, it is possible to set up Docker to use GPUs for most versions of Windows 11 and some versions of Windows 10, and we direct users to the [CUDA on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). 10 | Users should take note of the support constraints section to ensure their environment is suitable before following the guidance. **Do not install an NVIDIA driver into your WSL2 environment**. 11 | Users are encouraged to download Dorado for Windows from the [Dorado GitHub repository](https://github.com/nanoporetech/dorado#installation). 12 | 13 | #### MacOS 14 | 15 | MacOS should not be considered a supported operating system for wf-basecalling as we do not support accelerated computing through Docker on MacOS. 16 | On MacOS, GPU support through Docker remains in technical infancy. In addition, the containers we provide will not be able to leverage the M1 and M2 architecture and will not run as performantly as if Dorado had been run natively. 17 | Users are encouraged to download Dorado for MacOS directly from the [Dorado GitHub repository](https://github.com/nanoporetech/dorado#installation). 18 | 19 | #### Linux 20 | 21 | When using Docker for accelerated computing on Linux, you will need the `nvidia-container-toolkit` installed.
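A quick way to confirm that Docker can see the GPU is to run `nvidia-smi` inside a CUDA container (an illustrative check only; the image tag below is just an example and any CUDA base image available to you will do):

```
docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi
```

If this prints the usual `nvidia-smi` device table, GPU passthrough is working.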
22 | If you observe the error "could not select device driver with capabilities gpu", you should follow the instructions to install `nvidia-container-toolkit` [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#setting-up-nvidia-container-toolkit). You will need to follow the steps to: 23 | 24 | - Set up the package repository and the GPG key (ignore the box about experimental releases) 25 | - Update package listings 26 | - Install nvidia-container-toolkit 27 | - Configure the Docker daemon to recognize the NVIDIA Container Runtime 28 | - Restart the Docker daemon to complete the installation after setting the default runtime 29 | 30 | By default, workflows are configured to run GPU tasks in serial. That is, only one basecalling task will be run at a time. This is to prevent the GPU from running out of memory on local execution. 31 | When running workflows on a cluster, or in a cloud where GPU resources are isolated from one another, users should specify `-profile discrete_gpus` as part of the command invocation. This will allow for parallel execution of GPU tasks. 32 | You should ask your system administrator if you need to configure any additional options to leverage GPUs on your cluster. For example, you may need to provide a special string to the workflow's `--cuda_device` option to ensure tasks use the GPU assigned to them by the job scheduler. 33 | 34 | ### 2. Choosing a model 35 | 36 | To select the relevant models for `--basecaller_cfg` and `--remora_cfg`, see the [table of available models](https://github.com/nanoporetech/dorado#available-basecalling-models) in the `dorado` repository. 37 | 38 | ### 3. Aligning to a reference 39 | 40 | The workflow can optionally align the basecalled data to a reference of choice (provided with the `--ref` option) using [minimap2](https://github.com/lh3/minimap2). 41 | Additionally, the workflow will generate an IGV configuration file. This file allows the user to view the filtered aligned BAM in the EPI2ME Desktop Application in the Viewer tab. 42 | 43 | ### 4. Duplex calling 44 | 45 | wf-basecalling supports [duplex calling](https://github.com/nanoporetech/dorado#duplex), which is enabled with the `--duplex` option. If you used a chemistry and flowcell combination that supported duplex reads, you should switch this option on. The resulting BAM/CRAM will be quality filtered and then automatically split into separate BAM/CRAM files for the simplex and duplex reads. 46 | Since `dorado duplex` requires the inputs to be in `pod5` format, the workflow will perform the conversion automatically using [pod5 convert fast5](https://github.com/nanoporetech/pod5-file-format/blob/master/python/pod5/README.md#pod5-convert-fast5). These files are normally deleted upon completion of the analysis, but can optionally be saved by the user by providing the `--output_pod5` option. 47 | 48 | ### 5. Real-time analysis 49 | 50 | wf-basecalling can perform basecalling as the pod5 files are generated. To enable this, provide the `--watch_path` option. The workflow will process the newly generated files as soon as they become available. 51 | 52 | ### 6. Barcode classification and demultiplexing 53 | 54 | wf-basecalling can demultiplex the data when the appropriate barcoding kit is provided with the `--barcode_kit` option. 55 | This will generate a new `{{ out_dir }}/demuxed` directory, with one subfolder for each barcode and one additional `unclassified` folder for reads that cannot be demultiplexed, as sketched below.
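For illustration only (the barcode folders present, and the file names within them, depend on your kit, data and sample name), the layout resembles:

```
{{ out_dir }}/demuxed/
├── barcode01/
├── barcode02/
├── ...
└── unclassified/
```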
This option is not available for `dorado duplex`. 56 | Please note that the demultiplexed reads will always be in BAM format, even when the user sets `--output_bam false`. -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | * Duplex mode with wf-basecalling is reliant on internal optimisations to organise input files for better duplex rates, which is not possible when using streaming basecalling; therefore duplex combined with the `--watch_path` option could lead to lower duplex rates than would be achieved by running the algorithm after sequencing is completed. 2 | * Renaming, moving or deleting the reference genome or the output directory from the location provided at runtime will prevent IGV from loading the data. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-basecalling/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 2 | -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args = [:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args = [:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters that have default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere.
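 *
 * As an illustrative sketch (the function and argument names here are
 * hypothetical, not taken from this repository), a library function
 * might then be written as:
 *
 *     def my_function(Map arguments) {
 *         def parser = new ArgumentParser(
 *             args: ["input_bam"] as Set,
 *             kwargs: ["threads": 4],
 *             name: "my_function")
 *         def opts = parser.parse_args(arguments)
 *         // opts.input_bam is required; opts.threads defaults to 4
 *     }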
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/NfcoreTemplate.groovy: -------------------------------------------------------------------------------- 1 | // 2 | // This file holds several functions used within the nf-core pipeline template. 3 | // 4 | 5 | // MIT License 6 | // 7 | // Copyright (c) 2018 nf-core 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | // copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 
18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | import org.yaml.snakeyaml.Yaml 29 | 30 | class NfcoreTemplate { 31 | 32 | // 33 | // Check AWS Batch related parameters have been specified correctly 34 | // 35 | public static void awsBatch(workflow, params) { 36 | if (workflow.profile.contains('awsbatch')) { 37 | // Check params.awsqueue and params.awsregion have been set if running on AWSBatch 38 | assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 39 | // Check outdir paths to be S3 buckets if running on AWSBatch 40 | assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" 41 | } 42 | } 43 | 44 | // 45 | // Check params.hostnames 46 | // 47 | public static void hostName(workflow, params, log) { 48 | Map colors = logColours(params.monochrome_logs) 49 | if (params.hostnames) { 50 | try { 51 | def hostname = "hostname".execute().text.trim() 52 | params.hostnames.each { prof, hnames -> 53 | hnames.each { hname -> 54 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 55 | log.info "=${colors.yellow}====================================================${colors.reset}=\n" + 56 | "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + 57 | " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + 58 | " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + 59 | "=${colors.yellow}====================================================${colors.reset}=" 60 | } 61 | } 62 | } 63 | } catch (Exception e) { 64 | log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 65 | } 66 | } 67 | } 68 | 69 | // 70 | // Generate version string 71 | // 72 | public static String version(workflow) { 73 | String version_string = "" 74 | 75 | if (workflow.manifest.version) { 76 | def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' 77 | version_string += "${prefix_v}${workflow.manifest.version}" 78 | } 79 | 80 | if (workflow.commitId) { 81 | def git_shortsha = workflow.commitId.substring(0, 7) 82 | version_string += "-g${git_shortsha}" 83 | } 84 | 85 | return version_string 86 | } 87 | 88 | // 89 | // Construct and send completion email 90 | // 91 | public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_mapped_reads=[:]) { 92 | 93 | // Set up the e-mail variables 94 | def subject = "[$workflow.manifest.name] Successful: $workflow.runName" 95 | if (fail_mapped_reads.size() > 0) { 96 | subject = "[$workflow.manifest.name] Partially successful (${fail_mapped_reads.size()} skipped): $workflow.runName" 97 | } 98 | if (!workflow.success) { 99 | subject = "[$workflow.manifest.name] FAILED: $workflow.runName" 100 | } 101 | 102 | def summary = [:] 103 | for (group in summary_params.keySet()) { 104 | summary << summary_params[group] 105 | } 106 | 107 | def misc_fields = [:] 108 | misc_fields['Date Started'] = workflow.start 109 | misc_fields['Date Completed'] = workflow.complete 110 | misc_fields['Pipeline script file path'] = workflow.scriptFile 111 | misc_fields['Pipeline script hash ID'] = workflow.scriptId 112 | if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository 113 | if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId 114 | if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision 115 | misc_fields['Nextflow Version'] = workflow.nextflow.version 116 | misc_fields['Nextflow Build'] = workflow.nextflow.build 117 | misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 118 | 119 | def email_fields = [:] 120 | email_fields['version'] = NfcoreTemplate.version(workflow) 121 | email_fields['runName'] = workflow.runName 122 | email_fields['success'] = workflow.success 123 | email_fields['dateComplete'] = workflow.complete 124 | email_fields['duration'] = workflow.duration 125 | email_fields['exitStatus'] = workflow.exitStatus 126 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 127 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 128 | email_fields['commandLine'] = workflow.commandLine 129 | email_fields['projectDir'] = workflow.projectDir 130 | email_fields['summary'] = summary << misc_fields 131 | email_fields['fail_mapped_reads'] = fail_mapped_reads.keySet() 132 | email_fields['min_mapped_reads'] = params.min_mapped_reads 133 | 134 | // On success try attach the multiqc report 135 | def mqc_report = null 136 | try { 137 | if (workflow.success && !params.skip_multiqc) { 138 | mqc_report = multiqc_report.getVal() 139 | if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { 140 | if (mqc_report.size() > 1) { 141 | log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" 142 | } 143 | mqc_report = mqc_report[0] 144 | } 145 | } 146 | } catch (all) { 147 | if (multiqc_report) { 148 | log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" 149 | } 150 | } 151 | 152 | // Check if we are only sending emails on failure 153 | def email_address = params.email 154 | if (!params.email && params.email_on_fail && !workflow.success) { 155 | email_address = params.email_on_fail 156 | } 157 | 158 | // Render the TXT template 159 | def engine = new groovy.text.GStringTemplateEngine() 160 | def tf = new File("$projectDir/assets/email_template.txt") 161 | def 
txt_template = engine.createTemplate(tf).make(email_fields) 162 | def email_txt = txt_template.toString() 163 | 164 | // Render the HTML template 165 | def hf = new File("$projectDir/assets/email_template.html") 166 | def html_template = engine.createTemplate(hf).make(email_fields) 167 | def email_html = html_template.toString() 168 | 169 | // Render the sendmail template 170 | def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit 171 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] 172 | def sf = new File("$projectDir/assets/sendmail_template.txt") 173 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 174 | def sendmail_html = sendmail_template.toString() 175 | 176 | // Send the HTML e-mail 177 | Map colors = logColours(params.monochrome_logs) 178 | if (email_address) { 179 | try { 180 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 181 | // Try to send HTML e-mail using sendmail 182 | [ 'sendmail', '-t' ].execute() << sendmail_html 183 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" 184 | } catch (all) { 185 | // Catch failures and try with plaintext 186 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 187 | if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { 188 | mail_cmd += [ '-A', mqc_report ] 189 | } 190 | mail_cmd.execute() << email_html 191 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" 192 | } 193 | } 194 | 195 | // Write summary e-mail HTML to a file 196 | def output_d = new File("${params.outdir}/pipeline_info/") 197 | if (!output_d.exists()) { 198 | output_d.mkdirs() 199 | } 200 | def output_hf = new File(output_d, "pipeline_report.html") 201 | output_hf.withWriter { w -> w << email_html } 202 | def output_tf = new File(output_d, "pipeline_report.txt") 203 | output_tf.withWriter { w -> w << email_txt } 204 | } 205 | 206 | // 207 | // Print pipeline summary on completion 208 | // 209 | public static void summary(workflow, params, log, fail_mapped_reads=[:], pass_mapped_reads=[:]) { 210 | Map colors = logColours(params.monochrome_logs) 211 | 212 | if (pass_mapped_reads.size() > 0) { 213 | def idx = 0 214 | def samp_aln = '' 215 | def total_aln_count = pass_mapped_reads.size() + fail_mapped_reads.size() 216 | for (samp in pass_mapped_reads) { 217 | samp_aln += " ${samp.value}: ${samp.key}\n" 218 | idx += 1 219 | if (idx > 5) { 220 | samp_aln += " ..see pipeline reports for full list\n" 221 | break; 222 | } 223 | } 224 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_mapped_reads.size()}/$total_aln_count samples passed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 225 | } 226 | if (fail_mapped_reads.size() > 0) { 227 | def samp_aln = '' 228 | for (samp in fail_mapped_reads) { 229 | samp_aln += " ${samp.value}: ${samp.key}\n" 230 | } 231 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_mapped_reads.size()} samples skipped since they failed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 232 | } 233 | 234 | if (workflow.success) { 235 | if (workflow.stats.ignoredCount == 0) { 236 | log.info 
"-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" 237 | } else { 238 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" 239 | } 240 | } else { 241 | hostName(workflow, params, log) 242 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" 243 | } 244 | } 245 | 246 | // 247 | // ANSII Colours used for terminal logging 248 | // 249 | public static Map logColours(Boolean monochrome_logs) { 250 | Map colorcodes = [:] 251 | 252 | // Reset / Meta 253 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 254 | colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" 255 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 256 | colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" 257 | colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" 258 | colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" 259 | colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" 260 | 261 | // Regular Colors 262 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 263 | colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" 264 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 265 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 266 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 267 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 268 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 269 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 270 | 271 | // Bold 272 | colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" 273 | colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" 274 | colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" 275 | colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" 276 | colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" 277 | colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" 278 | colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" 279 | colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" 280 | 281 | // Underline 282 | colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" 283 | colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" 284 | colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" 285 | colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" 286 | colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" 287 | colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" 288 | colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" 289 | colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" 290 | 291 | // High Intensity 292 | colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" 293 | colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" 294 | colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" 295 | colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" 296 | colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" 297 | colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" 298 | colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" 299 | colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" 300 | 301 | // Bold High Intensity 302 | colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" 303 | colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" 304 | colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" 305 | colorcodes['biyellow'] = monochrome_logs ? 
'' : "\033[1;93m" 306 | colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" 307 | colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" 308 | colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" 309 | colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" 310 | 311 | return colorcodes 312 | } 313 | 314 | // 315 | // Does what is says on the tin 316 | // 317 | public static String dashedLine(monochrome_logs) { 318 | Map colors = logColours(monochrome_logs) 319 | return "${colors.dim}--------------------------------------------------------------------------------${colors.reset}" 320 | } 321 | 322 | // epi2me-labs logo 323 | public static String logo(workflow, monochrome_logs) { 324 | Map colors = NfcoreTemplate.logColours(monochrome_logs) 325 | String workflow_name = workflow.manifest.name.split("/")[1] 326 | String workflow_version = version(workflow) 327 | String.format( 328 | """ 329 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}_____ ____ ___ ____ __ __ _____ 330 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}| ____| _ \\_ _|___ \\| \\/ | ____| 331 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| _| | |_) | | __) | |\\/| | _| 332 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| |___| __/| | / __/| | | | |__ 333 | ${colors.iblue}|||||||||| ${colors.reset}${colors.dim}|_____|_| |___|_____|_| |_|_____| 334 | ${colors.iblue}|||||||||| ${colors.reset}${colors.bold}${workflow_name} ${workflow_version}${colors.reset} 335 | ${NfcoreTemplate.dashedLine(monochrome_logs)} 336 | """.stripIndent() 337 | ) 338 | } 339 | } 340 | 341 | 342 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // OS 52 | // TODO check version on WSL 53 | def opsys = System.properties['os.name'].toLowerCase() 54 | def opver = System.properties['os.version'] 55 | if (opver.toLowerCase().contains("wsl")){ 56 | opsys = "wsl" 57 | } 58 | 59 | // placeholder for any future okta business 60 | // for now we'll use the guest_ sent to wf.epi2me_user 61 | def user = get_meta(params.wf, "epi2me_user") 62 | 63 | // drop cruft to save some precious bytes 64 | // affects the deep copy rather than original params 65 | clean_meta(params_data, [ 66 | "schema_ignore_params", 67 | ]) 68 | def ingress_ids = [] 69 | if (params_data.containsKey("wf")) { 70 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 71 | clean_meta(params_data.wf, [ 72 | "agent", // we send this later 73 | "epi2me_instance", // we send this later 74 | "epi2me_user", // we send this later 75 | "example_cmd", 76 | "ingress.run_ids", // we will send this elsewhere 77 | ]) 78 | } 79 | 80 | // try and get runtime information 81 | def cpus = null 82 | try { 83 | cpus = Runtime.getRuntime().availableProcessors() 84 | } 85 | catch(Exception e) {} 86 | 87 | def workflow_success = null 88 | def workflow_exitcode = null 89 | if (event != "start") { 90 | workflow_success = workflow.success 91 | workflow_exitcode = workflow.exitStatus 92 | } 93 | 94 | /// build message 95 | def body_json = new JsonBuilder() 96 | body_json \ 97 | "tracking_id": [ 98 | "msg_id": UUID.randomUUID().toString(), 99 | "version": "3.0.1" 100 | ], 101 | "source": "workflow", 102 | "event": event, 103 | "params": params_data, 104 | // data will be null on start events, as ingress has not run 105 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 106 | "workflow": [ 107 | "name": workflow.manifest.name, 108 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 109 | "run_name": workflow.runName, // required to disambiguate sessions 110 | "session": workflow.sessionId, 111 | "profile": workflow.profile, 112 | "resume": workflow.resume, 113 | "error": error_message, // null if no error 114 | "success": workflow_success, 115 | "exitcode": workflow_exitcode, 116 | ], 117 | "env": [ 118 | "user": user, // placeholder for any future okta 119 | "os": [ 120 | "name": opsys, 121 | "version": opver 122 | ], 123 | "resource": [ 124 | "cpus": cpus, 125 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 126 | ], 127 | "agent": get_meta(params.wf, "agent"), // access via original params 128 | "epi2me": [ 129 | "instance": get_meta(params.wf, "epi2me_instance"), 130 | "user": user, 131 | ], 132 | "nextflow": [ 133 | "version": nextflow.version.toString(), 134 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 135 | ] 136 | ] 137 | return body_json 138 | } 139 | 140 | // Send a JSON payload to a given endpoint 141 | private static String send_ping_post(endpoint, body_json) { 142 | // Attempt to send payload and absorb any possible Exception gracefully 143 | String postResult 144 | boolean raise_exception = false 145 | try { 146 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 147 | requestMethod = 'POST' 148 | doOutput = true 149 | setConnectTimeout(5000) 150 | setReadTimeout(10000) 151 | setRequestProperty('Content-Type', 'application/json') 152 | setRequestProperty('accept', 'application/json') 153 | outputStream.withPrintWriter({printWriter -> 154 | printWriter.write(body_json.toString()) 155 | }) 156 | 157 | // Rethrow exceptions that imply we're not using this endpoint properly 158 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 159 | raise_exception = true 160 | } 161 | // Accessing inputStream.text will raise an Exception for failed requests 162 | postResult = inputStream.text 163 | }) 164 | } 165 | catch(Exception e) { 166 | if(raise_exception) { throw e } 167 | } 168 | return (postResult) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? 
"--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-basecalling/84442a77c74646c8971342d2ef80f124b5aeaf31/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /lib/reference.nf: -------------------------------------------------------------------------------- 1 | // Argument parser 2 | Map parse_reference(Map arguments) { 3 | ArgumentParser parser = new ArgumentParser( 4 | args:[ 5 | "input_ref", 6 | ], 7 | kwargs:[ 8 | "output_cache": false, 9 | "output_mmi": false, 10 | ], 11 | name: "reference_ingress") 12 | return parser.parse_args(arguments) 13 | } 14 | 15 | // Process to generate the CRAM cache and 16 | // create the REF_PATH variable 17 | process cram_cache { 18 | label "wf_common" 19 | cpus 1 20 | memory 4.GB 21 | input: 22 | path reference 23 | output: 24 | tuple path("ref_cache/"), env(REF_PATH), emit: ref_cache 25 | shell: 26 | ''' 27 | # Invoke from binary installed to container PATH 28 | seq_cache_populate.pl -root ref_cache/ !{reference} 29 | REF_PATH="ref_cache/%2s/%2s/%s" 30 | ''' 31 | } 32 | 33 | // Process to create the faidx index 34 | process faidx { 35 | label "wf_common" 36 | cpus 1 37 | memory 4.GB 38 | input: 39 | path(ref) 40 | output: 41 | path("${ref}.fai") 42 | script: 43 | """ 44 | samtools faidx ${ref} 45 | """ 46 | } 47 | 48 | // Decompress the reference genome, if it is compressed 49 | // NOTE -f required to compress symlink 50 | process decompress_ref { 51 | label "wf_common" 52 | cpus 1 53 | memory 4.GB 54 | input: 55 | file compressed_ref 56 | output: 57 | path "${compressed_ref.baseName}", emit: decompressed_ref 58 | """ 59 | gzip -df ${compressed_ref} 60 | """ 61 | } 62 | 63 | // Prepare minimap2 .mmi index 64 | process make_mmi { 65 | // Minimap2 is not available in wf_common 66 | label "wf_basecalling" 67 | cpus 4 68 | memory 16.GB 69 | input: 70 | path(ref) 71 | output: 72 | path("ref.mmi") 73 | script: 74 | """ 75 | minimap2 -t ${task.cpus} -x lr:hq -d ref.mmi ${ref} 76 | """ 77 | } 78 | 79 | 80 | // Workflow to prepare the reference genome and its indexes. 81 | workflow prepare_reference { 82 | take: 83 | arguments 84 | main: 85 | Map margs = parse_reference(arguments) 86 | 87 | // Base ref channel 88 | ref = Channel.fromPath(margs.input_ref) 89 | 90 | // Check ref and decompress if needed 91 | // gzipped ref not supported by some downstream tools (e.g. 
cram_cache) 92 | // easier to just decompress and pass it around rather than confusing the user 93 | is_compressed = margs.input_ref.toLowerCase().endsWith("gz") 94 | if (is_compressed) { 95 | ref = decompress_ref(ref) 96 | } 97 | 98 | // Generate fai index if the file is either compressed, or if fai doesn't exists 99 | if (!is_compressed && file("${margs.input_ref}.fai").exists()){ 100 | ref_idx = Channel.fromPath("${margs.input_ref}.fai") 101 | } else { 102 | ref_idx = faidx(ref) 103 | } 104 | 105 | // Generate CRAM cache 106 | if (margs.output_cache){ 107 | cram_cache(ref) 108 | ref_cache = cram_cache.out.ref_cache 109 | } else { 110 | ref_cache = null 111 | } 112 | 113 | // Generate mmi index 114 | if (margs.output_mmi){ 115 | ref_mmi = make_mmi(ref) 116 | } else { 117 | ref_mmi = null 118 | } 119 | 120 | // Run collect on the outputs, allowing to treat them as value channels, and avoiding 121 | // conflicts with other queue channels downstream. 122 | emit: 123 | ref = ref | collect 124 | ref_idx = ref_idx | collect 125 | ref_cache = ref_cache | collect 126 | ref_mmi = ref_mmi | collect 127 | } 128 | -------------------------------------------------------------------------------- /lib/signal/merge.nf: -------------------------------------------------------------------------------- 1 | 2 | // this process is shared by both the uCRAM and CRAM arms of the basecalling workflow 3 | // for uCRAM the staged ref is the OPTIONAL_FILE, so we withhold the ref arg 4 | process merge_calls { 5 | label "wf_basecalling" 6 | cpus params.merge_threads 7 | memory 16.GB 8 | input: 9 | path(ref) 10 | path(crams, stageAs: "filtered_*.cram") 11 | val(filetag) 12 | tuple val(align_ext), val(index_ext) // either [bam, bai] or [cram, crai] 13 | output: 14 | tuple path("${params.sample_name}.${filetag}.${align_ext}"), path("${params.sample_name}.${filetag}.${align_ext}.${index_ext}") 15 | script: 16 | def ref_arg = ref.name != "OPTIONAL_FILE" ? 
"--reference ${ref}" : "" 17 | """ 18 | samtools merge -c -p "${params.sample_name}.${filetag}.${align_ext}##idx##${params.sample_name}.${filetag}.${align_ext}.${index_ext}" ${crams} --no-PG -O ${align_ext} --write-index ${ref_arg} --threads ${task.cpus} 19 | """ 20 | } 21 | 22 | process merge_calls_to_fastq { 23 | label "wf_basecalling" 24 | cpus { params.merge_threads + params.ubam_bam2fq_threads } 25 | memory 16.GB 26 | input: 27 | path(crams) 28 | val(filetag) 29 | output: 30 | path("${params.sample_name}.${filetag}.fq.gz") 31 | script: 32 | """ 33 | samtools merge -c -p ${crams} --no-PG -O CRAM -@ ${params.merge_threads} -o - | samtools bam2fq -T 1 -@ ${params.ubam_bam2fq_threads} -0 ${params.sample_name}.${filetag}.fq.gz - 34 | """ 35 | } 36 | -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | import nextflow.util.BlankSeparatedList 4 | 5 | nextflow.enable.dsl = 2 6 | 7 | include { wf_dorado } from './lib/signal/ingress' 8 | include { 9 | configure_igv; 10 | getParams } from './lib/common' 11 | include { prepare_reference } from './lib/reference' 12 | nextflow.preview.recursion=true 13 | 14 | process getVersions { 15 | label "wf_basecalling" 16 | cpus 1 17 | output: 18 | path "versions.txt" 19 | script: 20 | """ 21 | dorado --version 2>&1 | head -n1 | sed 's/^/dorado,/' >> versions.txt 22 | minimap2 --version | head -n 1 | sed 's/^/minimap2,/' >> versions.txt 23 | """ 24 | } 25 | 26 | 27 | process bamstats { 28 | label "wf_common" 29 | cpus params.stats_threads 30 | input: 31 | path "input.cram" // chunks are always CRAM 32 | tuple path(ref_cache), env(REF_PATH) 33 | 34 | output: 35 | path "bamstats.tsv", emit: stats 36 | path "stats.${task.index}.json", emit: json 37 | script: 38 | """ 39 | bamstats --threads=${task.cpus} -u input.cram > bamstats.tsv 40 | fastcat_histogram.py \ 41 | --sample_id "${params.sample_name}" \ 42 | bamstats.tsv "stats.${task.index}.json" 43 | """ 44 | } 45 | 46 | 47 | // Scan step for accumulating fastcat stats 48 | // 49 | // Nextflow scan does a silly thing where it feeds back the growing list of 50 | // historical outputs. We only ever need the most recent output (the "state"). 51 | process progressive_stats { 52 | label "wf_common" 53 | maxForks 1 54 | cpus 1 55 | input: 56 | path fastcat_stats 57 | output: 58 | path("all_stats.${task.index}") 59 | script: 60 | def new_input = fastcat_stats instanceof BlankSeparatedList ? fastcat_stats.first() : fastcat_stats 61 | def state = fastcat_stats instanceof BlankSeparatedList ? fastcat_stats.last() : "NOSTATE" 62 | def output = "all_stats.${task.index}" 63 | """ 64 | touch "${state}" 65 | add_jsons.py "${new_input}" "${state}" "${output}" 66 | """ 67 | } 68 | 69 | 70 | // Split simplex reads belonging to a pair 71 | process split_xam { 72 | label "wf_common" 73 | cpus 2 74 | input: 75 | tuple path(xam), path(xam_index) 76 | tuple val(align_ext), val(index_ext) 77 | path ref 78 | output: 79 | tuple path("${xam.baseName}.duplex.${align_ext}"), path("${xam.baseName}.duplex.${align_ext}.${index_ext}"), emit: xam_dx 80 | tuple path("${xam.baseName}.simplex.${align_ext}"), path("${xam.baseName}.simplex.${align_ext}.${index_ext}"), emit: xam_sx 81 | script: 82 | String reference = ref.name.startsWith('OPTIONAL_FILE') ? 
'' : "--reference ${ref}" 83 | """ 84 | samtools view ${reference} \ 85 | -@ ${task.cpus} \ 86 | -O ${align_ext} \ 87 | --tag dx:-1 \ 88 | --unoutput ${xam.baseName}.duplex.${align_ext} \ 89 | -o ${xam.baseName}.simplex.${align_ext} \ 90 | ${xam} 91 | samtools index ${xam.baseName}.simplex.${align_ext} 92 | samtools index ${xam.baseName}.duplex.${align_ext} 93 | """ 94 | } 95 | 96 | 97 | // Compute pairing statistics progressively, if duplex enabled 98 | process pair_stats { 99 | label "wf_common" 100 | cpus 1 101 | input: 102 | path cram // chunks are always CRAM 103 | tuple path(ref_cache), env(REF_PATH) 104 | output: 105 | path("pairs.${task.index}.csv"), emit: csv 106 | script: 107 | """ 108 | duplex_stats.py ${cram} pairs.${task.index}.csv 109 | """ 110 | } 111 | 112 | 113 | process progressive_pairings { 114 | label "wf_common" 115 | maxForks 1 116 | cpus 1 117 | input: 118 | path pairings 119 | output: 120 | path("pairing_stats.${task.index}") 121 | script: 122 | // By passing around a directory 123 | // The state file within it will be a symlink containing the latest cumulative data 124 | // eg. ls -l will look like this 125 | // pairing_stats.1 -> /work/ab/xyz/pairing_stats.1 126 | // pairing_stats.2 -> /work/ab/xyz/pairing_stats.1 127 | // pairing_stats.3 -> /work/ab/xyz/pairing_stats.1 128 | def new_input = pairings instanceof BlankSeparatedList ? pairings.first() : pairings 129 | def state = pairings instanceof BlankSeparatedList ? pairings.last() : "NOSTATE" 130 | def new_state = "pairing_stats.${task.index}" 131 | def new_file = "pairing_stats.new" 132 | // n.b where this is used below the files will have been moved, hence new_state 133 | def dynamic_input = "${new_state}/sample.pairings_stats" 134 | """ 135 | # If first iteration create empty directory 136 | if [[ "${task.index}" == "1" ]]; then 137 | mkdir "${state}" 138 | fi 139 | # cp to another new folder 140 | cp -r "${state}" "${new_state}" 141 | # Create a new file with headers 142 | echo "Filename,Duplex,Paired,Simplex" > ${new_file} 143 | # If dynamic_input exists, save it to new_file 144 | if [ -f $dynamic_input ]; then 145 | # append everything from the old state file in to the new file 146 | # skip header with 'FNR>1' as already added above 147 | awk 'FNR>1' "${dynamic_input}" >> ${new_file} 148 | fi 149 | # append everything from the latest input file in to the new file 150 | awk 'FNR>1' ${new_input} >> ${new_file} 151 | # the new file now becomes the next state to be output 152 | mv "${new_file}" "${dynamic_input}" 153 | """ 154 | } 155 | 156 | 157 | // Make reports 158 | process makeReport { 159 | label "wf_common" 160 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 161 | input: 162 | path per_read_stats 163 | path pairings 164 | path "versions/*" 165 | path "params.json" 166 | output: 167 | path "wf-basecalling-*.html" 168 | script: 169 | String report_name = "wf-basecalling-report.html" 170 | def report_pairings = params.duplex ? "--pairings ${pairings}/*" : "" 171 | """ 172 | report.py $report_name \ 173 | --sample_name $params.sample_name \ 174 | --versions versions \ 175 | --stats $per_read_stats \ 176 | --params params.json \ 177 | --workflow_version ${workflow.manifest.version} \ 178 | $report_pairings 179 | """ 180 | } 181 | 182 | 183 | // watch path stop condition, if params.read_limit is met will inject a stop file in to input folder. 
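// Note on how the pieces fit together: the cumulative JSON emitted by the
// progressive_stats process above (invoked with .scan in the workflow body further down)
// is what the process below receives. It sums the per-sample "total_reads" counts and,
// once the total crosses params.read_limit, drops a STOP file into the watched input
// directory so the watch-path ingress (presumably lib/signal/ingress.nf) knows to stop.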
184 | process stopCondition { 185 | label "wf_common" 186 | cpus 1 187 | publishDir params.input, mode: 'copy', pattern: "*" 188 | input: 189 | path json 190 | val (stop_filename) 191 | output: 192 | path "${stop_filename}", optional: true, emit: stop 193 | script: 194 | int threshold = params.read_limit 195 | """ 196 | #!/usr/bin/env python 197 | import json 198 | from pathlib import Path 199 | with open("$json") as json_file: 200 | state = json.load(json_file) 201 | total = 0 202 | for k,v in state.items(): 203 | total += v["total_reads"] 204 | if total >= $threshold: 205 | p = Path("$stop_filename") 206 | p.touch(exist_ok=False) 207 | """ 208 | } 209 | 210 | 211 | // See https://github.com/nextflow-io/nextflow/issues/1636 212 | // This is the only way to publish files from a workflow whilst 213 | // decoupling the publish from the process steps. 214 | process output_stream { 215 | // publish inputs to output directory 216 | label "wf_basecalling" 217 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 218 | input: 219 | path fname 220 | output: 221 | path fname 222 | """ 223 | echo "Writing output files." 224 | """ 225 | } 226 | 227 | // Output the last report, once each of them finish 228 | process output_last { 229 | // publish inputs to output directory 230 | label "wf_basecalling" 231 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 232 | input: 233 | path fname 234 | output: 235 | path fname 236 | """ 237 | echo "Writing output files." 238 | """ 239 | } 240 | 241 | // CW-2569: Emit pod5s if requested, in a new directory 242 | process output_pod5s { 243 | // publish inputs to output directory 244 | label "wf_basecalling" 245 | publishDir "${params.out_dir}/pod5s/", mode: 'copy', pattern: "*" 246 | input: 247 | path pod5s 248 | output: 249 | path pod5s 250 | """ 251 | echo "Writing output files." 252 | """ 253 | } 254 | 255 | 256 | // entrypoint workflow 257 | WorkflowMain.initialise(workflow, params, log) 258 | workflow { 259 | 260 | Map colors = NfcoreTemplate.logColours(params.monochrome_logs) 261 | 262 | Pinguscript.ping_start(nextflow, workflow, params) 263 | 264 | // Basecall 265 | // Ensure basecaller config is set 266 | if (!params.basecaller_cfg && !params.basecaller_model_path) { 267 | throw new Exception(colors.red + "You must provide a basecaller profile with --basecaller_cfg " + colors.reset) 268 | } 269 | if (params.duplex && params.output_fmt == "fastq") { 270 | throw new Exception(colors.red + "Duplex requires the outputs of Dorado to be in BAM format." + colors.reset) 271 | } 272 | if (params.ref && params.output_fmt == "fastq") { 273 | log.warn("Alignment will output data in BAM format and ignore `--output_fmt fastq`.") 274 | } 275 | if (params.basecaller_cfg && params.basecaller_model_path) { 276 | log.warn("--basecaller_cfg and --basecaller_model_path both provided. Custom remora model path (${params.basecaller_cfg}) will override enum choice (${params.basecaller_model_path}).") 277 | } 278 | if (params.remora_cfg && params.remora_model_path) { 279 | log.warn("--remora_cfg and --remora_model_path both provided. Custom remora model path (${params.remora_model_path}) will override enum choice (${params.remora_cfg}).") 280 | } 281 | if (params.duplex && params.dorado_ext != 'pod5') { 282 | log.warn("Duplex currently requires POD5 files and is not compatible with FAST5. 
The workflow will convert the FAST5 inputs to POD5 format automatically.") 283 | } 284 | if (params.duplex && params.barcode_kit) { 285 | throw new Exception(colors.red + "Duplex does not support barcoded data." + colors.reset) 286 | } 287 | if (params.igv && (!params.ref || params.output_fmt == 'fastq' )){ 288 | log.warn("IGV configuration works only for aligned BAM/CRAM outputs. Please provide a reference with `--ref`, and request either cram or bam output with `--output_fmt`.") 289 | } 290 | 291 | // Ensure modbase threads are set if calling them 292 | if (params.remora_cfg || params.remora_model_path) { 293 | if (params.basecaller_basemod_threads == 0) { 294 | throw new Exception(colors.red + "--remora_cfg modbase aware config requires setting --basecaller_basemod_threads > 0" + colors.reset) 295 | } 296 | } 297 | 298 | // 299 | if (params.use_bonito) { 300 | log.warn("Using bonito for basecalling, bonito is an experimental feature for which no support is entertained.") 301 | if (!params.experimental) { 302 | error "Use of bonito is locked behind the `--experimental` option." 303 | } 304 | } 305 | 306 | // Prepare the reference genome 307 | Boolean run_alignment = false 308 | if (params.ref) { 309 | prepare_reference([ 310 | "input_ref": params.ref, 311 | "output_mmi": true, 312 | "output_cache": true 313 | ]) 314 | ref = prepare_reference.out.ref 315 | ref_cache = prepare_reference.out.ref_cache 316 | ref_fai = prepare_reference.out.ref_idx 317 | ref_mmi = prepare_reference.out.ref_mmi 318 | run_alignment = true 319 | } else { 320 | ref = Channel.fromPath("${projectDir}/data/OPTIONAL_FILE") | collect 321 | ref_cache = Channel.of([file("${projectDir}/data/OPTIONAL_FILE"), null]) | collect 322 | ref_fai = Channel.empty() 323 | ref_mmi = Channel.empty() 324 | } 325 | 326 | // ring ring it's for you 327 | basecaller_out = wf_dorado([ 328 | "input_path": params.input, 329 | "input_ref": ref, 330 | "input_mmi": ref_mmi, 331 | "input_cache": ref_cache, 332 | "run_alignment": run_alignment, 333 | "basecaller_model_name": params.use_bonito ? params.bonito_cfg : params.basecaller_cfg, 334 | "remora_model_name": params.remora_cfg, 335 | "basecaller_model_path": params.basecaller_model_path, 336 | "remora_model_path": params.remora_model_path, 337 | "watch_path": params.watch_path, 338 | "output_fmt": params.output_fmt, 339 | "dorado_ext": params.dorado_ext, 340 | "poly_a_config": params.poly_a_config, 341 | "qscore_filter": params.qscore_filter 342 | ]) 343 | software_versions = getVersions() 344 | workflow_params = getParams() 345 | 346 | // stream stats for report 347 | stat = bamstats(basecaller_out.chunked_pass_crams, ref_cache) 348 | stats = progressive_stats.scan(stat.json) 349 | 350 | // stream pair stats for report 351 | // use first() to coerce this to a value channel 352 | pairings = Channel.fromPath("${projectDir}/data/OPTIONAL_FILE", checkIfExists: true).first() 353 | if (params.duplex){ 354 | // Separate the simplex reads belonging to a pair from the 355 | // duplex and simplex reads. 356 | // Save the simplex reads in a duplex in a separate xam file. 
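        // For context: dorado's duplex caller tags reads with a `dx` BAM tag — by
        // convention dx:1 for duplex reads, dx:0 for ordinary simplex reads, and dx:-1
        // for simplex reads that were consumed as parents of a duplex pair (these values
        // are an assumption from dorado's behaviour; they are not defined anywhere in
        // this repository). split_xam, defined above, splits each pass/fail file on
        // `samtools view --tag dx:-1`, so the paired simplex parents land in a separate
        // XAM from everything else.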
357 | split_xam( 358 | basecaller_out.pass.concat( 359 | basecaller_out.fail 360 | ), 361 | basecaller_out.output_exts, 362 | ref 363 | ) 364 | 365 | // Create emission channel 366 | emit_xam = split_xam.out.xam_dx.flatten() 367 | .concat(split_xam.out.xam_sx.flatten()) 368 | .concat(basecaller_out.summary) 369 | 370 | // Then, compute the stats on the duplex 371 | pairs = pair_stats(basecaller_out.chunked_pass_crams, ref_cache) 372 | pairings = progressive_pairings.scan(pairs.csv) 373 | } else { 374 | emit_xam = basecaller_out.pass.flatten() 375 | .concat(basecaller_out.fail.flatten()) 376 | } 377 | // Make the report 378 | report = makeReport(stats, pairings, software_versions, workflow_params) | last | collect | output_last 379 | 380 | // Create IGV if the reference genome is passed 381 | if (params.ref && params.igv && params.output_fmt!='fastq'){ 382 | // Create temporary channel of FASTA + FAI 383 | ref_ch = ref 384 | | combine( 385 | ref_fai 386 | ) 387 | 388 | igv_files = ref_ch 389 | // Use full path of the input reference, allowing to not emit the reference 390 | | map{ 391 | fna, fai -> 392 | // If the FASTA is compressed, then it should start with the work dir path, and therefore is emitted 393 | String fna_path = fna.startsWith("${workflow.workDir}") ? "${fna.name}" : "${fna.toUriString()}" 394 | // Same for the FAIDX 395 | String fai_path = fai.startsWith("${workflow.workDir}") ? "${fai.name}" : "${fai.toUriString()}" 396 | [fna_path, fai_path] 397 | } 398 | // We only show the pass BAM files as tracks. 399 | | concat ( 400 | basecaller_out.pass | map{ it -> "${it.Name}" } 401 | ) 402 | | flatten 403 | | collectFile(name: "file-names.txt", newLine: true, sort: false) 404 | igv_conf = configure_igv(igv_files, Channel.of(null), Channel.of(null), Channel.of(null)) 405 | // If the input reference is compressed, or the input fasta does not exists, emit faidx 406 | if (params.ref.toLowerCase().endsWith("gz") || !file("${params.ref}.fai").exists()){ 407 | igv_conf = igv_conf 408 | | concat( 409 | // If either the FASTA or the FAI have been modified in any way, emit them 410 | ref_ch 411 | | flatten 412 | | filter{it.startsWith("${workflow.workDir}")} 413 | ) 414 | } 415 | } else { 416 | igv_conf = Channel.empty() 417 | } 418 | 419 | // dump out artifacts thanks for calling 420 | output_stream( 421 | emit_xam 422 | | concat( 423 | pairings.last(), 424 | software_versions, 425 | workflow_params, 426 | igv_conf 427 | ) 428 | | filter{ it -> it.Name != "OPTIONAL_FILE"} 429 | ) 430 | 431 | // dump pod5s if requested 432 | if (params.duplex && params.dorado_ext == 'fast5' && params.output_pod5){ 433 | output_pod5s(basecaller_out.converted_pod5s) 434 | } 435 | 436 | // Stop file to input folder when read_limit stop condition is met. 
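    // When --watch_path is set the input directory is watched for new pod5/fast5 files;
    // the STOP.* file produced by the stopCondition process (called just below) is what
    // terminates that stream. The watching side presumably lives in lib/signal/ingress.nf;
    // as a minimal, hypothetical sketch, a watched channel can be closed on the stop file
    // with something like:
    //
    //     Channel
    //         .watchPath("${params.input}/*.${params.dorado_ext}", 'create')
    //         .until { it.name.startsWith("STOP.") }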
437 | String stop_filename = "STOP.${workflow.sessionId}.${params.dorado_ext}" 438 | if (params.watch_path && params.read_limit){ 439 | stopCondition(stats, stop_filename).first().subscribe { 440 | log.info "Creating STOP file: '$stop_filename'" 441 | } 442 | } 443 | 444 | } 445 | 446 | workflow.onComplete { 447 | Pinguscript.ping_complete(nextflow, workflow, params) 448 | } 449 | workflow.onError { 450 | Pinguscript.ping_error(nextflow, workflow, params) 451 | } 452 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // import profiles and workflow SHA from core 2 | includeConfig "base.config" 3 | 4 | 5 | // define workflow params 6 | params { 7 | help = false 8 | version = false 9 | aws_image_prefix = null 10 | aws_queue = null 11 | disable_ping = false 12 | 13 | monochrome_logs = false 14 | validate_params = true 15 | show_hidden_params = false 16 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 17 | 18 | // I/O 19 | input = null 20 | ref = null 21 | sample_name = "SAMPLE" 22 | store_dir = null 23 | 24 | // basecalling 25 | experimental = false 26 | /// common 27 | basecaller_chunk_size = 25 28 | basecaller_cfg = null 29 | basecaller_args = null 30 | basecaller_basemod_threads = 2 31 | duplex = false 32 | cuda_device = "cuda:all" 33 | ubam_map_threads = 8 34 | ubam_sort_threads = 3 35 | ubam_bam2fq_threads = 1 36 | merge_threads = 4 37 | stats_threads = 4 38 | basecaller_model_path = null 39 | remora_model_path = null 40 | qscore_filter = 10 41 | /// dorado 42 | remora_cfg = null 43 | dorado_ext = "pod5" 44 | poly_a_config = null 45 | /// bonito 46 | use_bonito = false 47 | bonito_cfg = 'dna_r10.4.1_e8.2_400bps_trns@v5.0.alpha' 48 | /// wf-basecalling 49 | output_fmt = "cram" 50 | output_pod5 = false 51 | // demuxing 52 | barcode_kit = null 53 | demux_args = null 54 | /// Stream input 55 | watch_path = false 56 | read_limit = null 57 | // Create IGV configuration 58 | igv = false 59 | 60 | wf { 61 | example_cmd = [ 62 | "--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'", 63 | "--dorado_ext 'pod5'", 64 | "--input 'wf-basecalling-demo/input'", 65 | "--ref 'wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'", 66 | "--remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2'", 67 | ] 68 | agent = null 69 | } 70 | } 71 | 72 | manifest { 73 | name = 'epi2me-labs/wf-basecalling' 74 | author = 'Oxford Nanopore Technologies' 75 | homePage = 'https://github.com/epi2me-labs/wf-basecalling' 76 | description = 'Helper workflow for basecalling ONT reads.' 
77 | mainScript = 'main.nf' 78 | nextflowVersion = '>=23.04.2' 79 | version = '1.5.1' 80 | } 81 | 82 | epi2melabs { 83 | tags = "wf-basecalling,dorado,basecalling,utility" 84 | icon = "faTty" 85 | } 86 | -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "wf-basecalling-report.html", 5 | "title": "workflow report", 6 | "description": "Report summarising the work done by the basecalling workflow", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "per-sample" 10 | }, 11 | "simplex-alignment-pass": { 12 | "filepath": "{{ alias }}.pass.simplex.{{ format }}", 13 | "title": "Simplex alignment file of passed reads", 14 | "description": "BAM or CRAM file of simplex reads for the sample that pass QC filtering.", 15 | "mime-type": "application/x-gzip", 16 | "optional": false, 17 | "type": "per-sample" 18 | }, 19 | "duplex-alignment-pass": { 20 | "filepath": "{{ alias }}.pass.duplex.{{ format }}", 21 | "title": "Duplex alignment file of passed reads", 22 | "description": "BAM or CRAM file of duplex reads for the sample that pass QC filtering. Created if duplex basecalling is requested.", 23 | "mime-type": "application/x-gzip", 24 | "optional": true, 25 | "type": "per-sample" 26 | }, 27 | "simplex-alignment-pass-index": { 28 | "filepath": "{{ alias }}.pass.simplex.{{ format }}.{{ index_format }}", 29 | "title": "Simplex alignment file index of passed reads", 30 | "description": "The index of the resulting BAM or CRAM file with the simplex reads that pass QC filtering.", 31 | "mime-type": "application/octet-stream", 32 | "optional": false, 33 | "type": "per-sample" 34 | }, 35 | "duplex-alignment-pass-index": { 36 | "filepath": "{{ alias }}.pass.duplex.{{ format }}.{{ index_format }}", 37 | "title": "Duplex alignment file index of passed reads", 38 | "description": "The index of the resulting BAM or CRAM file with the duplex reads that pass QC filtering. Created if duplex basecalling is requested.", 39 | "mime-type": "application/octet-stream", 40 | "optional": true, 41 | "type": "per-sample" 42 | }, 43 | "simplex-alignment-fail": { 44 | "filepath": "{{ alias }}.fail.simplex.{{ format }}", 45 | "title": "Simplex alignment file of failed reads", 46 | "description": "BAM or CRAM file of simplex reads for the sample that fail QC filtering.", 47 | "mime-type": "application/x-gzip", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "duplex-alignment-fail": { 52 | "filepath": "{{ alias }}.fail.duplex.{{ format }}", 53 | "title": "Duplex alignment file of failed reads", 54 | "description": "BAM or CRAM file of duplex reads for the sample that fail QC filtering. 
Created if duplex basecalling is requested.", 55 | "mime-type": "application/x-gzip", 56 | "optional": true, 57 | "type": "per-sample" 58 | }, 59 | "simplex-alignment-fail-index": { 60 | "filepath": "{{ alias }}.fail.simplex.{{ format }}.{{ index_format }}", 61 | "title": "Simplex alignment file index of failed reads", 62 | "description": "The index of the resulting BAM or CRAM file with the simplex reads that fail QC filtering.", 63 | "mime-type": "application/octet-stream", 64 | "optional": false, 65 | "type": "per-sample" 66 | }, 67 | "duplex-alignment-fail-index": { 68 | "filepath": "{{ alias }}.fail.duplex.{{ format }}.{{ index_format }}", 69 | "title": "Duplex alignment file index of failed reads", 70 | "description": "The index of the resulting BAM or CRAM file with the duplex reads that fail QC filtering. Created if duplex basecalling is requested.", 71 | "mime-type": "application/octet-stream", 72 | "optional": true, 73 | "type": "per-sample" 74 | }, 75 | "reference-index": { 76 | "filepath": "{{ ref }}.fai", 77 | "title": "Index of the reference FASTA file", 78 | "description": "Index of the reference FASTA file.", 79 | "mime-type": "text/tab-separated-values", 80 | "optional": true, 81 | "type": "aggregated" 82 | }, 83 | "igv-config": { 84 | "filepath": "igv.json", 85 | "title": "JSON configuration file for IGV browser", 86 | "description": "JSON configuration file to be loaded in IGV for visualising alignments against the reference genome.", 87 | "mime-type": "text/json", 88 | "optional": true, 89 | "type": "aggregated" 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /util/update_models_schema.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate nextflow_schema with updated basecaller enumerations 3 | # 4 | # This script uses `nextflow config` to obtain the basecaller container, 5 | # creates JSON arrays of the models using the container's list-models script 6 | # and injects them with jq to create nextflow_schema.json.new. 7 | set -euo pipefail 8 | 9 | TARGET=$1 10 | ENGINE=$2 11 | 12 | if ! command -v nextflow &> /dev/null 13 | then 14 | # we should be in CI, nextflow is installed right here 15 | NEXTFLOW="./nextflow" 16 | else 17 | NEXTFLOW=`which nextflow` 18 | fi 19 | 20 | # work out how to inspect the container contents 21 | DORADO_CONTAINER=$(${NEXTFLOW} config -flat | grep "process.'withLabel:wf_basecalling'.container" | awk -F'= ' '{print $2}' | sed "s,',,g") 22 | echo "# DORADO_CONTAINER=${DORADO_CONTAINER}" 23 | if [ "$ENGINE" = "simg" ]; then 24 | CMD_PREFIX="singularity exec docker://${DORADO_CONTAINER}" 25 | else 26 | CMD_PREFIX="docker run ${DORADO_CONTAINER}" 27 | fi 28 | 29 | # Convert model lists to JSON arrays 30 | SIMPLEX_MODELS=$(${CMD_PREFIX} list-models --simplex --only-names | jq -Rn '[inputs]') 31 | MODBASE_MODELS=$(${CMD_PREFIX} list-models --modbase --only-names | jq -Rn '[inputs]') 32 | 33 | # Inject JSON arrays to relevant schema enum 34 | jq \ 35 | -j \ 36 | --indent 4 \ 37 | --argjson simplex_models "${SIMPLEX_MODELS}" \ 38 | --argjson modbase_models "${MODBASE_MODELS}" \ 39 | '(.definitions.basecalling_options.properties.basecaller_cfg.enum) = $simplex_models | 40 | (.definitions.basecalling_options.properties.remora_cfg.enum) = $modbase_models' \ 41 | ${TARGET}/nextflow_schema.json > ${TARGET}/nextflow_schema.json.new 42 | 43 | echo "# Updated schema generated, you should inspect it before adopting it!" 
44 | echo "diff ${TARGET}/nextflow_schema.json ${TARGET}/nextflow_schema.json.new" 45 | echo "mv ${TARGET}/nextflow_schema.json.new ${TARGET}/nextflow_schema.json" 46 | --------------------------------------------------------------------------------
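Usage sketch for the helper script above — the argument meanings are inferred from the script body rather than documented here: the first argument is the directory containing nextflow_schema.json, the second selects the container engine ("simg" for Singularity, anything else falls through to Docker).

    # run from the repository root
    util/update_models_schema.sh . docker    # or: util/update_models_schema.sh . simg
    # then inspect and adopt the regenerated schema, as the script itself suggests
    diff nextflow_schema.json nextflow_schema.json.new
    mv nextflow_schema.json.new nextflow_schema.json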