├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── base.config ├── bin ├── add_jsons.py ├── check_sample_sheet.py ├── duplex_stats.py ├── fastcat_histogram.py ├── report.py ├── report_utils.py ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── util.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf ├── nfcore_external_java_deps.jar ├── reference.nf └── signal │ ├── ingress.nf │ └── merge.nf ├── main.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json └── util └── update_models_schema.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 
61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 
6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Include shared CI 2 | include: 3 | - project: "epi2melabs/ci-templates" 4 | file: "wf-containers.yaml" 5 | 6 | variables: 7 | CI_FLAVOUR: "new" # set to "classic" for old-style CI 8 | SKIP_PYTHON_TESTS: "not applicable" 9 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz" 10 | NF_PROCESS_OPTIONS: "--basecaller_chunk_size 1 --ubam_map_threads 5 --ubam_sort_threads 2 --ubam_bam2fq_threads 1" 11 | PYTEST_CONTAINER_CONFIG_KEY: "container_sha_basecalling=" 12 | 13 | check-models: 14 | extends: .preflight 15 | script: 16 | - !reference [.install, nextflow] # requires nextflow to read config 17 | - bash util/update_models_schema.sh . 
docker 18 | - > 19 | if ! diff nextflow_schema.json nextflow_schema.json.new; then 20 | echo "Model schema requires updating." 21 | exit 1 22 | fi 23 | 24 | docker-run: 25 | artifacts: 26 | when: always 27 | paths: 28 | - ${CI_PROJECT_NAME} 29 | - .nextflow.log 30 | exclude: 31 | - ${CI_PROJECT_NAME}/**/*.fa 32 | - ${CI_PROJECT_NAME}/**/*.fna 33 | - ${CI_PROJECT_NAME}/**/*.fasta 34 | - ${CI_PROJECT_NAME}/**/ref_cache/** 35 | 36 | # Define a 1D job matrix to inject a variable named MATRIX_NAME into 37 | # the CI environment, we can use the value of MATRIX_NAME to determine 38 | # which options to apply as part of the rules block below 39 | # NOTE There is a slightly cleaner way to define this matrix to include 40 | # the variables, but it is broken when using long strings! See CW-756 41 | tags: 42 | - grid 43 | - shell 44 | parallel: 45 | matrix: 46 | - MATRIX_NAME: [ 47 | "dorado", 48 | "dorado-igv", 49 | "dorado-igv-gz", 50 | "dorado_mod", 51 | "dorado_fast5", 52 | "dorado-gzref", 53 | "dorado-output-fastq", 54 | "dorado-qscore-filter", 55 | "duplex", 56 | "duplex_mod", 57 | "duplex_fast5", 58 | "duplex_watch", 59 | "duplex_fqonly_fail", 60 | "watch_path", 61 | "no_reference", 62 | "no_reference-output-fastq", 63 | "output_bam", 64 | "polya_tails", 65 | "demux", 66 | "duplex_demux", 67 | "demux-align" 68 | ] 69 | rules: 70 | # NOTE As we're overriding the rules block for the included docker-run 71 | # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run 72 | # being incorrectly scheduled for "detached merge request pipelines" etc. 73 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 74 | when: never 75 | - if: $MATRIX_NAME == "dorado" 76 | variables: 77 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 78 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 79 | - if: $MATRIX_NAME == "dorado-igv" 80 | variables: 81 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --igv ${NF_PROCESS_OPTIONS}" 82 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 83 | - if: $MATRIX_NAME == "dorado-igv-gz" 84 | variables: 85 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 86 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --igv ${NF_PROCESS_OPTIONS}" 87 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 88 | - if: $MATRIX_NAME == "dorado-gzref" 89 | variables: 90 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar 
-xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 91 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 92 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 93 | - if: $MATRIX_NAME == "dorado-output-fastq" 94 | variables: 95 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demo/VERSION && rm demo_data.tar.gz && wget -q -O wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 96 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --output_fmt fastq ${NF_PROCESS_OPTIONS}" 97 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 98 | - if: $MATRIX_NAME == "dorado_mod" 99 | variables: 100 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --remora_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2 ${NF_PROCESS_OPTIONS}" 101 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 102 | - if: $MATRIX_NAME == "dorado_fast5" 103 | variables: 104 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/fast5 --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --dorado_ext fast5 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 ${NF_PROCESS_OPTIONS}" 105 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 106 | - if: $MATRIX_NAME == "dorado-qscore-filter" 107 | variables: 108 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --qscore_filter 20 ${NF_PROCESS_OPTIONS}" 109 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 110 | - if: $MATRIX_NAME == "watch_path" 111 | variables: 112 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 ${NF_PROCESS_OPTIONS}" 113 | NF_IGNORE_PROCESSES: "pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 114 | - if: $MATRIX_NAME == "no_reference" 115 | variables: 116 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 ${NF_PROCESS_OPTIONS}" 117 | NF_IGNORE_PROCESSES: 
"cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 118 | - if: $MATRIX_NAME == "no_reference-output-fastq" 119 | variables: 120 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --output_fmt fastq ${NF_PROCESS_OPTIONS}" 121 | NF_IGNORE_PROCESSES: "cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 122 | AFTER_NEXTFLOW_CMD: "[ -f wf-basecalling/SAMPLE.pass.fq.gz ] && echo 'Expected file wf-basecalling/SAMPLE.pass.fq.gz found' || exit 1" 123 | - if: $MATRIX_NAME == "output_bam" 124 | variables: 125 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --output_fmt bam ${NF_PROCESS_OPTIONS}" 126 | NF_IGNORE_PROCESSES: "cram_cache,stopCondition,pair_stats,progressive_pairings,dorado_summary,split_xam,combine_dorado_summaries,output_pod5s" 127 | - if: $MATRIX_NAME == "duplex" 128 | variables: 129 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --duplex ${NF_PROCESS_OPTIONS}" 130 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,output_pod5s" 131 | - if: $MATRIX_NAME == "duplex_mod" 132 | variables: 133 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --remora_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2 --duplex ${NF_PROCESS_OPTIONS}" 134 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,output_pod5s" 135 | - if: $MATRIX_NAME == "duplex_fast5" 136 | variables: 137 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/fast5 --output_pod5 --dorado_ext fast5 --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v4.1.0 --duplex ${NF_PROCESS_OPTIONS}" 138 | NF_IGNORE_PROCESSES: "stopCondition,pair_stats,progressive_pairings,dorado,make_mmi,align_and_qsFilter,\ 139 | merge_pass_calls,merge_fail_calls,getVersions,getParams,cram_cache,bamstats,progressive_stats,makeReport,output" 140 | - if: $MATRIX_NAME == "duplex_watch" 141 | variables: 142 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 --duplex ${NF_PROCESS_OPTIONS}" 143 | NF_IGNORE_PROCESSES: "output_pod5s" 144 | - if: $MATRIX_NAME == "duplex_fqonly_fail" 145 | variables: 146 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demo/input --ref wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg dna_r10.4.1_e8.2_400bps_sup@v4.1.0 --watch_path --read_limit 3000 --output_fmt fastq --duplex ${NF_PROCESS_OPTIONS}" 147 | NF_IGNORE_PROCESSES: "output_pod5s" 148 | ASSERT_NEXTFLOW_FAILURE: "yes" 149 | ASSERT_NEXTFLOW_FAILURE_REXP : "Duplex requires the outputs of Dorado to be in BAM format." 
150 | - if: $MATRIX_NAME == "polya_tails" 151 | variables: 152 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-polya-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-polya-demo/VERSION && rm demo_data.tar.gz" 153 | NF_WORKFLOW_OPTS: "--poly_a_config wf-basecalling-polya-demo/polya_conf.toml --input wf-basecalling-polya-demo/input --ref wf-basecalling-polya-demo/RCS-100A.fasta --basecaller_cfg rna004_130bps_hac@v3.0.1 ${NF_PROCESS_OPTIONS}" 154 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition" 155 | - if: $MATRIX_NAME == "demux" 156 | variables: 157 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && rm demo_data.tar.gz" 158 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --barcode_kit SQK-RBK114-96 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 159 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 160 | - if: $MATRIX_NAME == "demux-align" 161 | variables: 162 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && \ 163 | tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && \ 164 | rm demo_data.tar.gz && \ 165 | wget -q -O wf-basecalling-demux-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz" 166 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --barcode_kit SQK-RBK114-96 --ref wf-basecalling-demux-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa.gz --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 167 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 168 | - if: $MATRIX_NAME == "duplex_demux" 169 | variables: 170 | NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demux-demo.tar.gz && tar -xzvf demo_data.tar.gz && cat wf-basecalling-demux-demo/README && rm demo_data.tar.gz" 171 | NF_WORKFLOW_OPTS: "--input wf-basecalling-demux-demo/input --duplex true --barcode_kit SQK-RBK114-96 --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v5.0.0 ${NF_PROCESS_OPTIONS}" 172 | NF_IGNORE_PROCESSES: "output_pod5s,pair_stats,progressive_pairings,split_xam,stopCondition,cram_cache" 173 | ASSERT_NEXTFLOW_FAILURE: "yes" 174 | ASSERT_NEXTFLOW_FAILURE_REXP : "Validation of pipeline parameters failed" 175 | 176 | aws-run: 177 | rules: 178 | - when: never 179 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | 
pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.58 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | # avoid snowballstemmer>=3.0 as it causes flake8-docstrings to stop working [CW-6098] 28 | - snowballstemmer==2.2.0 29 | args: [ 30 | "bin", 31 | "--import-order-style=google", 32 | "--statistics", 33 | "--max-line-length=88", 34 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 35 | ] 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.5.1] 8 | The updates in this release do not affect wf-basecalling but are required for EPI2ME workflows that make use of wf-basecalling, to maintain compliance with our latest wf-template standard. 9 | Users do not need to update to this release. 10 | ### Changed 11 | - Updated to wf-template v5.6.1, changing: 12 | - pre-commit configuration to resolve an internal dependency problem with flake8. This has no effect on the workflow. 13 | - Log banner art to say "EPI2ME" instead of "EPI2ME Labs" to match current branding. This has no effect on the workflow outputs. 14 | 15 | ## [v1.5.0] 16 | This version of wf-basecalling updates Dorado to v0.9.5 which should improve the speed of basecalling on some GPU architectures. 17 | Dorado v0.9.5 increases the minimum NVIDIA Driver requirement to 525.105. 18 | ### Changed 19 | - Updated Dorado to [v0.9.5](https://github.com/nanoporetech/dorado/releases/tag/v0.9.5) 20 | - Alignment uses the high quality long read preset (-x lr:hq) to reduce mapping time. 21 | - Basecaller model options are now reverse version sorted in the workflow schema to ensure newer models appear at the top of drop-downs and listings of available models. 22 | 23 | ### Fixed 24 | - "Input data problem" error in downstream workflows necessitating use of the override_basecaller_cfg option. Relevant metadata from the input XAM header is now retained after alignment to ensure that the basecaller configuration is automatically detected. 25 | 26 | ## [v1.4.7] 27 | This maintenance release updates the models used for outputting results. 28 | This release is to support our other workflows. 29 | Users do not need to update to this release. 30 | ### Changed 31 | - Removed Pydantic and autogenerated models from results_schema.yml and created a new model namespace using dataclasses. 32 | 33 | ## [v1.4.6] 34 | This maintenance release updates the version of our code that plots our post-workflow reports. 35 | This release is to support our other workflows. 36 | Users do not need to update to this release. 37 | ### Changed 38 | - Updated common Docker image to update ezcharts to v0.12.0. This improves formatting of plots in the report. 39 | ### Fixed 40 | - Typo in schema. 41 | 42 | ## [v1.4.5] 43 | ### Changed 44 | - Reconciled workflow with wf-template v5.3.4. 
45 | 46 | ## [v1.4.4] 47 | ### Changed 48 | - Updated Dorado to [v0.9.0](https://github.com/nanoporetech/dorado/releases/tag/v0.9.0) 49 | 50 | ## [v1.4.3] 51 | ### Changed 52 | - Reconciled workflow with wf-template v5.3.3. 53 | 54 | ## [v1.4.2] 55 | ### Added 56 | - q-score filter added to signal ingress. 57 | ### Changed 58 | - Updated Dorado to [v0.8.3](https://github.com/nanoporetech/dorado/releases/tag/v0.8.3) 59 | - Reconciled workflow with wf-template v5.3.1. 60 | 61 | ## [v1.4.1] 62 | ### Changed 63 | - Reconciled workflow with wf-template v5.3.0. 64 | - Updated Dorado to [v0.8.1](https://github.com/nanoporetech/dorado/releases/tag/v0.8.1) 65 | 66 | ## [v1.4.0] 67 | ### Added 68 | - IGV configuration file with `--ref --igv` options and either `--output_fmt bam` or `--output_fmt cram`. 69 | - Support for gzipped reference genomes. 70 | - `output_fmt` selects the output format for basecalled and aligned files. 71 | ### Changed 72 | - Updated Dorado to [v0.8.0](https://github.com/nanoporetech/dorado/releases/tag/v0.8.0) 73 | - Reconciled workflow with wf-template v5.2.6. 74 | - Do not emit the reference FASTA file. 75 | - Collapse redundant RG and PG header lines when emitting BAM or CRAM. 76 | ### Fixed 77 | - Workflow starting with `--duplex --barcode_kit`, despite duplex not supporting barcoding. 78 | - Workflow crashing with `--ref {{ reference }} --barcode_kit`. 79 | - Aligned reads will no longer be trimmed when demuxing to preserve mapping information. 80 | - Workflow emits confusing warning about Bonito filtering when using Dorado. 81 | ### Removed 82 | - `fastq_only` and `output_bam` options replaced by `output_fmt`. 83 | - `--output_fmt fastq` can be used to output unaligned FASTQ instead of unaligned CRAM. 84 | - `--output_fmt bam` can be used to output unaligned or aligned BAM instead of CRAM. 85 | 86 | ## [v1.3.0] 87 | ### Added 88 | - Modified base calling with `--duplex`. 89 | - APK 5.0.0 model. 90 | ### Changed 91 | - Updated Dorado to v0.7.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.2) 92 | 93 | ## [v1.2.2] 94 | ### Changed 95 | - Bug fix for downstream workflows and `--poly_a_config` which does not affect normal workflow use. 96 | 97 | ## [v1.2.1] 98 | ### Added 99 | - Output channel for demuxed BAM files for downstream use. 100 | 101 | ## [v1.2.0] 102 | ### Added 103 | - Support for `dorado demux` to demultiplex barcoded runs. Specify your `--barcode_kit` to activate demultiplexing. 104 | - Support for poly(A) tail length estimation with `--poly_a_config`. You can configure it by providing a TOML file, which is described in detail [here](https://github.com/nanoporetech/dorado?tab=readme-ov-file#polya-tail-estimation). 105 | ### Changed 106 | - Updated Dorado to v0.7.1 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.1) 107 | 108 | ## [v1.1.9] 109 | ### Fixed 110 | - Report crashing when no data are present in the input pod5. 111 | - Reconciled workflow with wf-template v5.1.3. 112 | - Updated Dorado to v0.7.0 (see https://github.com/nanoporetech/dorado/releases/tag/v0.7.0) 113 | - Added new DNA and RNA 5.0.0 models. 114 | 115 | ## [v1.1.8] 116 | ### Changed 117 | - Updated Dorado to v0.6.0 (see https://github.com/nanoporetech/dorado/releases/tag/v0.6.0) 118 | 119 | ## [v1.1.7] 120 | ### Fixed 121 | - Workflow accepting incompatible `--fastq_only` and `--duplex` options 122 | - Dynamically updated report in `--watch_path` mode. 
123 | 124 | ## [v1.1.6] 125 | ### Fixed 126 | - qscore_filter inadvertently disabled in v1.1.5 127 | 128 | ## [v1.1.5] 129 | ### Changed 130 | - Minor update to default resource requests on dorado task. 131 | ### Added 132 | - Experimental feature switch. 133 | 134 | ## [v1.1.4] 135 | ### Changed 136 | - Updated Dorado to v0.5.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.5.2) 137 | 138 | ## [v1.1.3] 139 | ### Changed 140 | - Bumped memory directives for intense tasks to reduce likelihood of job failure 141 | - Default to parallel GPU usage when using awsbatch profile 142 | ### Removed 143 | - Runtime driver check in Dorado process, as this is no longer available in the Dorado image 144 | 145 | ## [v1.1.2] 146 | ### Changed 147 | - Updated dorado version to v0.5.1 (see https://github.com/nanoporetech/dorado/releases/tag/v0.5.1) 148 | 149 | ## [v1.1.1] 150 | ### Added 151 | - Reintroduced RNA002 models 152 | 153 | ## [v1.1.0] 154 | ### [Added] 155 | - `--duplex` basecalling converts FAST5 to POD5 automatically 156 | - Converted POD5 files are deleted by default, use `--output_pod5` to output converted POD5 files to the workflow output directory. 157 | 158 | ### Changed 159 | - Updated Dorado to v0.3.4 (see https://github.com/nanoporetech/dorado/releases/tag/v0.3.4) 160 | 161 | ## [v1.0.1] 162 | ### Fixed 163 | - Workflow crashes with fast5 input 164 | - Workflow fails early when trying to use FAST5 input with Dorado duplex 165 | 166 | ## [v1.0.0] 167 | ### Added 168 | - RNA004 models 169 | - R941 v3.3 5mCG 5hmCG models 170 | - Duplex calling with option `--duplex` 171 | - Note that duplex calling is not optimised for streaming basecalling with `--watch_path` and may lead to lower duplex yield. 172 | - Duplex basecalling is currently not compatible with modified basecalling. 173 | 174 | ### Changed 175 | - Updated Dorado to v0.3.2 (see https://github.com/nanoporetech/dorado/releases/tag/v0.3.2) 176 | - Pascal architecture GPUs are now supported 177 | - Bumped minimum required Nextflow version to 23.04.2 178 | - Users no longer need to provide `--basecaller_cfg custom` and/or `--remora_cfg custom` to override models with `--basecaller_model_path` and/or `--remora_model_path` respectively. 
179 | 180 | ### Fixed 181 | - `bamstats` process very slow when `output_bam` has been selected 182 | 183 | ## [v0.7.2] 184 | ### Added 185 | - v4.2 5mC and 6mA modification models 186 | 187 | ### Changed 188 | - Updated Dorado to v0.3.1 189 | - GPU tasks are limited to run in serial by default to avoid memory errors 190 | - Users in cluster and cloud environments where GPU devices are scheduled must use `-profile discrete_gpus` to parallelise GPU work 191 | - A warning will be printed if the workflow detects it is running non-local execution but the discrete_gpus profile is not enabled 192 | - Additional guidance on GPU support is provided in our Quickstart 193 | - Bumped minimum required Nextflow version to 22.10.8 194 | 195 | ## [v0.7.1] 196 | ### Fixed 197 | - Command not found on `cram_cache` step 198 | - Typo in report that refers to the workflow as "wf-basecalling-report" 199 | 200 | ## [v0.7.0] 201 | ### Changed 202 | - Updated Dorado to v0.3.0 203 | - BAM may be output **instead** of CRAM by providing `--output_bam` 204 | - `--help` message will list basecalling and modbasecalling models available for use with the workflow 205 | 206 | ### Added 207 | - v4.2.0 models, which must be used for sequencing runs performed at new 5 kHz sampling rate 208 | - v4.1.0 models replace v4.0.0 models and must be used for sequencing runs performed at 4 kHz sampling rate 209 | 210 | ### Removed 211 | - v4.0.0 models 212 | 213 | ### Fixed 214 | - Custom models were previously rejected by the workflow as `basecaller_cfg` and `remora_cfg` are validated against a list of basecalling models installed in the Dorado container. 215 | - Users should now provide `--basecaller_cfg custom` and/or `--remora_cfg custom` to override models with `--basecaller_model_path` and/or `--remora_model_path` respectively. 216 | - Providing `--basecaller_cfg custom` or `--remora_cfg custom` without the corresponding `--basecaller_model_path` or `--remora_model_path` will result in an error. 217 | 218 | ## [v0.6.0] 219 | ### Added 220 | - Ability to watch the input path and process files as they become available in real time. 221 | 222 | ## [v0.5.2] 223 | ### Added 224 | - Configuration for running demo data in AWS 225 | 226 | ## [v0.5.1] 227 | ### Fixed 228 | - Missing models from list of valid models 229 | - "dna_r9.4.1_e8_hac@v3.4_5mCG@v0" is now correctly referred to as "dna_r9.4.1_e8_hac@v3.3_5mCG@v0", to match the simplex model version 230 | - "dna_r9.4.1_e8_sup@v3.4_5mCG@v0" is now correctly referred to as "dna_r9.4.1_e8_sup@v3.3_5mCG@v0", to match the simplex model version 231 | 232 | ## [v0.5.0] 233 | ### Changed 234 | - Updated Dorado to v0.2.4 235 | - Updated to Oxford Nanopore Technologies PLC. 
Public License 236 | 237 | ### Fixed 238 | - Dorado image correctly ships with CUDA runtime library 239 | 240 | ## [v0.4.1] 241 | ### Fixed 242 | - Input ref channel depleted after first alignment 243 | 244 | ## [v0.4.0] 245 | ### Changed 246 | - Reference is no longer required for basecalling 247 | - CRAM files with no alignments will be generated if `--ref` is not provided 248 | - FASTQ may be output **instead** of CRAM by providing `--fastq_only` 249 | - PG line for converting Dorado SAM output to uBAM is no longer written to output header 250 | - Work directory is automatically cleaned up on successful completion to remove large intermediate files 251 | - Override this by including `cleanup = false` in a custom Nextflow configuration file 252 | - Number of threads for merging is now configurable for advanced users 253 | 254 | ## [v0.3.0] 255 | ### Changed 256 | - Updated Dorado to v0.2.1 257 | - `--basecaller_cfg` and `--remora_cfg` are now validated against a list of models installed in the Dorado container 258 | 259 | ### Fixed 260 | - Workflow no longer prints a confusing error when Dorado fails 261 | 262 | ## [v0.2.0] 263 | ### Added 264 | - `--basecaller_args` may be used to provide custom arguments to the basecalling process 265 | 266 | ### Changed 267 | - Updated Dorado to v0.1.1 268 | - Latest models are now v4.0.0 269 | - Workflow prints a more helpful error when Dorado fails due to unknown model name 270 | 271 | ## [v0.1.2] 272 | ### Changed 273 | - Updated description in manifest 274 | 275 | ## [v0.1.1] 276 | ### Fixed 277 | - Default basecaller_basemod_threads value 278 | - Undefined `colors` variable 279 | 280 | ## [v0.1.0] 281 | ### Added 282 | - Workflow will now output pass and fail CRAM 283 | - Reads are separated into pass and fail based on their mean qscore as calculated by dorado 284 | - The threshold can be changed with `--qscore_filter` 285 | 286 | ### Changed 287 | - Improved `--help` documentation 288 | 289 | ### Fixed 290 | - Workflow will exit with "No files match pattern" if no suitable files are found to basecall 291 | - Ensure to set `--dorado_ext` to `fast5` or `pod5` as appropriate 292 | 293 | ## [v0.0.1] 294 | * Initial release of wf-basecalling supporting the Dorado basecaller 295 | 296 | ## [v0.0.0] 297 | * Initialised wf-basecalling from wf-template #30ff92d 298 | 299 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. 
"Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 
109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. 
You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. 
Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 
309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /base.config: -------------------------------------------------------------------------------- 1 | params { 2 | out_dir = "output" 3 | wf { 4 | basecaller_container = "ontresearch/dorado" 5 | container_sha_basecalling = "sha268dcb4cd02093e75cdc58821f8b93719c4255ed" 6 | bonito_container = "ontresearch/bonito" 7 | bonito_sha = "shaea43ca2333f91fa78a823f640ba158e4268f1f98" 8 | common_sha = "sha1c69fd30053aad5d516e9567b3944384325a0fee" 9 | } 10 | } 11 | 12 | 13 | // used by default for "standard" (docker) and singularity profiles, 14 | // other profiles may override. 15 | process { 16 | withLabel:wf_basecalling { 17 | container = "${params.wf.basecaller_container}:${params.wf.container_sha_basecalling}" 18 | } 19 | withLabel:wf_common { 20 | container = "ontresearch/wf-common:${params.wf.common_sha}" 21 | } 22 | 23 | shell = ['/bin/bash', '-euo', 'pipefail'] 24 | 25 | // by default GPU tasks will run in serial to avoid GPU management. 26 | // cluster and cloud users can remove this with -profile discrete_gpus. 27 | // we use profiles to handle this as maxForks cannot be set dynamically 28 | // see https://github.com/nextflow-io/nextflow/discussions/3806 and CW-1857 29 | withLabel:gpu { 30 | maxForks = 1 31 | } 32 | } 33 | 34 | profiles { 35 | // the "standard" profile is used implicitely by nextflow 36 | // if no other profile is given on the CLI 37 | standard { 38 | docker { 39 | enabled = true 40 | // this ensures container is run as host user and group, but 41 | // also adds host user to the within-container group 42 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 43 | } 44 | process."withLabel:gpu".containerOptions = "--gpus all" 45 | } 46 | 47 | // using singularity instead of docker 48 | singularity { 49 | singularity { 50 | enabled = true 51 | autoMounts = true 52 | //envWhitelist = "" // if your cluster sets a variable to indicate which GPU has been assigned you will want to allow it here 53 | } 54 | process."withLabel:gpu".containerOptions = "--nv" 55 | } 56 | 57 | 58 | // keep stub conda profile to prevent unknown profile warning so users get a better error 59 | conda { 60 | conda.enabled = true 61 | } 62 | 63 | 64 | // Using AWS batch. 65 | // May need to set aws.region and aws.batch.cliPath 66 | awsbatch { 67 | process { 68 | executor = 'awsbatch' 69 | queue = "${params.aws_queue}" 70 | memory = "16 GB" // likely not enough! 
71 | withLabel:wf_common { 72 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 73 | } 74 | shell = ['/bin/bash', '-euo', 'pipefail'] 75 | 76 | // lift limit on simultaneous gpu jobs for cloud 77 | // and ensure that the host mounts relevant driver bobbins inside the container 78 | withLabel:gpu { 79 | maxForks = null 80 | containerOptions = "-e NVIDIA_DRIVER_CAPABILITIES=compute,utility --gpus all" 81 | } 82 | withLabel:wf_basecalling { 83 | container = "${params.aws_image_prefix}-dorado:${params.wf.container_sha_basecalling}" 84 | } 85 | withLabel:wf_bonito { 86 | container = "${params.aws_image_prefix}-bonito:${params.wf.bonito_sha}" 87 | } 88 | } 89 | } 90 | 91 | // local profile for simplified development testing 92 | local { 93 | process.executor = 'local' 94 | } 95 | 96 | // lift limit on simultaneous gpu jobs 97 | discrete_gpus { 98 | process."withLabel:gpu".maxForks = null 99 | } 100 | } 101 | 102 | 103 | timeline { 104 | enabled = true 105 | file = "${params.out_dir}/execution/timeline.html" 106 | overwrite = true 107 | } 108 | report { 109 | enabled = true 110 | file = "${params.out_dir}/execution/report.html" 111 | overwrite = true 112 | } 113 | trace { 114 | enabled = true 115 | file = "${params.out_dir}/execution/trace.txt" 116 | overwrite = true 117 | } 118 | 119 | env { 120 | PYTHONNOUSERSITE = 1 121 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 122 | } 123 | 124 | cleanup = true 125 | -------------------------------------------------------------------------------- /bin/add_jsons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Combine two JSONS, sum values by matching json keys.""" 3 | 4 | import argparse 5 | import json 6 | import os 7 | 8 | 9 | def add_dicts(d1, d2): 10 | """Extend json, sum values.""" 11 | def sum_a(v1, v2): 12 | if v2 is None: 13 | return v1 14 | try: 15 | if isinstance(v1 + v2, int): 16 | return v1 + v2 17 | elif isinstance(v1 + v2, str): 18 | return v1 19 | except TypeError: 20 | return add_dicts(v1, v2) 21 | result = d2.copy() 22 | result.update({k: sum_a(v, d2.get(k)) for k, v in d1.items()}) 23 | return result 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | if os.stat(args.state).st_size == 0: 29 | state = {} 30 | else: 31 | with open(args.state) as json_file: 32 | state = json.load(json_file) 33 | with open(args.new_file) as json_file: 34 | new_file = json.load(json_file) 35 | combined = add_dicts(state, new_file) 36 | with open(args.output, "w") as outfile: 37 | json.dump(combined, outfile) 38 | 39 | 40 | def argparser(): 41 | """Create argument parser.""" 42 | parser = argparse.ArgumentParser( 43 | "add_jsons", 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 45 | add_help=False) 46 | parser.add_argument("new_file") 47 | parser.add_argument("state") 48 | parser.add_argument("output") 49 | return parser 50 | 51 | 52 | if __name__ == "__main__": 53 | args = argparser().parse_args() 54 | main(args) 55 | -------------------------------------------------------------------------------- /bin/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Script to check that sample sheet is well-formatted.""" 3 | import argparse 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | 9 | def main(): 10 | """Run entry point.""" 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('sample_sheet') 13 | parser.add_argument('output') 
14 | args = parser.parse_args() 15 | 16 | try: 17 | samples = pd.read_csv(args.sample_sheet, sep=None) 18 | if 'alias' in samples.columns: 19 | if 'sample_id' in samples.columns: 20 | sys.stderr.write( 21 | "Warning: sample sheet contains both 'alias' and " 22 | 'sample_id, using the former.') 23 | samples['sample_id'] = samples['alias'] 24 | if not set(['sample_id', 'barcode']).intersection(samples.columns): 25 | raise IOError() 26 | except Exception: 27 | raise IOError( 28 | "Could not parse sample sheet, it must contain two columns " 29 | "named 'barcode' and 'sample_id' or 'alias'.") 30 | # check duplicates 31 | dup_bc = samples['barcode'].duplicated() 32 | dup_sample = samples['sample_id'].duplicated() 33 | if any(dup_bc) or any(dup_sample): 34 | raise IOError( 35 | "Sample sheet contains duplicate values.") 36 | samples.to_csv(args.output, sep=",", index=False) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /bin/duplex_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Count duplex and simplex reads in xam file.""" 3 | import argparse 4 | 5 | import pysam 6 | 7 | 8 | def main(): 9 | """Run entry point.""" 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('xam') 12 | parser.add_argument('outname') 13 | args = parser.parse_args() 14 | 15 | # Prepare input files 16 | xam = pysam.AlignmentFile(args.xam, check_sq=False) 17 | 18 | # Count simplex and duplex reads in a xam 19 | sx = dx = 0 20 | for read in xam.fetch(until_eof=True): 21 | if read.get_tag('dx') == 1: 22 | dx += 1 23 | else: 24 | sx += 1 25 | 26 | # Save counts to output 27 | with open(f'{args.outname}', 'w') as out: 28 | out.write("Filename,Duplex,Paired,Simplex\n") 29 | out.write(f"{args.xam},{dx},{dx*2},{sx}\n") 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /bin/fastcat_histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Histogram-json.""" 3 | 4 | import argparse 5 | import json 6 | import sys 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | def histogram_counts(data, dmin=0, bin_width=100): 13 | """Histogram bins and counts.""" 14 | bins = np.arange(dmin, max(data) + bin_width, bin_width) 15 | counts, _ = np.histogram(data, bins=bins) 16 | # Note that there can be small differences with/without batch_size=1. 17 | # https://numpy.org/doc/stable/reference/generated/numpy.histogram.html 18 | # bins from =[1, 2, 3, 4] => First bin=[1, 2), last bin=[3, 4]. 19 | # i.e. in batch_size=1, the count will be in the last interval (both edges included) 20 | # With more sequences, there can be different intervals and edge value can be moved 21 | # to the next consecutive interval. 
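    # For example (illustrative numbers): data=[5, 100, 250] with dmin=0 and
    # bin_width=100 gives bins=[0, 100, 200, 300] and counts=[1, 1, 1]; only the
    # final interval includes its right edge, which is why counts can shift
    # slightly between batch sizes as described above.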
22 | return bins.tolist(), counts.tolist() 23 | 24 | 25 | def get_stats(seq_summary): 26 | """Get Stats Json.""" 27 | stats_json = { 28 | "total_reads": len(seq_summary)} 29 | if len(seq_summary) >= 1: 30 | len_data = seq_summary['read_length'] 31 | len_bins, len_counts = histogram_counts( 32 | len_data, dmin=0, bin_width=50) 33 | stats_json["len"] = dict(list(zip(len_bins, len_counts))) 34 | 35 | qual_data = seq_summary['mean_quality'] 36 | qual_bins, qual_counts = histogram_counts( 37 | qual_data, dmin=0, bin_width=0.2) 38 | stats_json["qual"] = dict(list(zip(qual_bins, qual_counts))) 39 | else: 40 | sys.stderr.write("WARNING: summary file was empty.\n") 41 | stats_json["len"] = dict() 42 | stats_json["qual"] = dict() 43 | return stats_json 44 | 45 | 46 | def main(args): 47 | """Run the entry point.""" 48 | df = pd.read_csv( 49 | args.input, sep="\t", 50 | usecols=['read_length', 'mean_quality'], 51 | dtype={'read_length': int, 'mean_quality': float}) 52 | final = {args.sample_id: get_stats(df)} 53 | with open(args.output, 'w') as fp: 54 | json.dump(final, fp) 55 | 56 | 57 | def argparser(): 58 | """Argument parser for entrypoint.""" 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument( 61 | "input", help="Read summary file.") 62 | parser.add_argument( 63 | "output", help="Output summary JSON.") 64 | parser.add_argument( 65 | "--sample_id", help="Sample name.") 66 | return parser 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparser() 71 | main(parser.parse_args()) 72 | -------------------------------------------------------------------------------- /bin/report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create workflow report.""" 3 | import argparse 4 | import json 5 | 6 | from bokeh.models import Title 7 | from dominate.tags import p 8 | import ezcharts as ezc 9 | from ezcharts.components.ezchart import EZChart 10 | from ezcharts.components.reports import labs 11 | from ezcharts.layout.snippets import DataTable 12 | from ezcharts.layout.snippets import Grid 13 | from ezcharts.layout.snippets import Tabs 14 | import pandas as pd 15 | from report_utils import read_length_plot, read_quality_plot 16 | from ezcharts.util import get_named_logger # noqa: ABS101 17 | 18 | 19 | THEME = 'epi2melabs' 20 | 21 | 22 | def main(args): 23 | """Run the entry point.""" 24 | logger = get_named_logger("Report") 25 | report = labs.LabsReport( 26 | "Basecalling report", "wf-basecalling", 27 | args.params, args.versions, 28 | args.workflow_version,) 29 | 30 | # Create statistics from the histogram json generated from 31 | # the workflow, summarising the results from bamstats. 
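    # The per-sample JSON (as written by fastcat_histogram.py in this repo)
    # looks roughly like this (illustrative values only):
    #   {"sample_1": {"total_reads": 1000, "len": {...}, "qual": {...}}}
    # i.e. one entry per sample keyed by sample ID/alias.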
32 | if args.stats: 33 | with report.add_section("Read summary", "Read summary"): 34 | with open(args.stats[0]) as f: 35 | datas = json.load(f) 36 | tabs = Tabs() 37 | total_reads = {} 38 | for sample_id, data in sorted(datas.items()): 39 | with tabs.add_tab(sample_id): 40 | total_reads[sample_id] = data['total_reads'] 41 | if data['total_reads'] == 0: 42 | p("No reads called.") 43 | continue 44 | with Grid(columns=2): 45 | EZChart( 46 | read_quality_plot(data), THEME) 47 | EZChart(read_length_plot(data), THEME) 48 | with tabs.add_tab('total'): 49 | with Grid(columns=1): # total read counts per sample 50 | df_stats = pd.DataFrame.from_dict(total_reads.items()) 51 | df_stats.columns = ['Sample_name', 'Number of reads'] 52 | plt = ezc.barplot( 53 | data=df_stats, x='Sample_name', y='Number of reads') 54 | plt._fig.add_layout( 55 | Title( 56 | text="Number of reads per sample.", 57 | text_font_size="1.5em" 58 | ), 59 | 'above' 60 | ) 61 | EZChart(plt, THEME) 62 | 63 | # If pairing rates are provided, show them. 64 | if args.pairings: 65 | with report.add_section("Pairing summary", "Pairing summary"): 66 | with open(args.pairings[0]) as f: 67 | # Load data 68 | data = pd.read_csv(f) 69 | if data.empty: 70 | p("No reads called.") 71 | else: 72 | # Make summary 73 | data_sum = data\ 74 | .drop(columns=['Filename'])\ 75 | .sum()\ 76 | .to_frame()\ 77 | .T 78 | data_sum['Pairing rate'] = data_sum['Paired'] / data_sum['Simplex'] 79 | data_sum['Pairing rate'] = data_sum['Pairing rate'].round(4) 80 | DataTable.from_pandas( 81 | data_sum, use_index=False, export=True, 82 | file_name=( 83 | f'{args.sample_name}-wf-basecalling-duplex-summary')) 84 | p( 85 | 'Simplex: the number of initial reads.', 86 | 'Paired: the number of simplex reads belonging to a pair.', 87 | 'Duplex: the number of duplex reads.', 88 | ) 89 | 90 | report.write(args.report) 91 | logger.info(f"Report written to {args.report}.") 92 | 93 | 94 | def argparser(): 95 | """Argument parser for entrypoint.""" 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("report", help="Report output file") 98 | parser.add_argument( 99 | "--stats", nargs='*', help="Fastcat per-read stats file(s).") 100 | parser.add_argument( 101 | "--pairings", nargs='*', help="Pairing per-chunk stats.", required=False) 102 | parser.add_argument( 103 | "--sample_name", required=True, help="Sample name.") 104 | parser.add_argument( 105 | "--versions", required=True, 106 | help="directory containing CSVs containing name,version.") 107 | parser.add_argument( 108 | "--params", default=None, required=True, 109 | help="A JSON file containing the workflow parameter key/values") 110 | parser.add_argument( 111 | "--revision", default='unknown', 112 | help="git branch/tag of the executed workflow") 113 | parser.add_argument( 114 | "--commit", default='unknown', 115 | help="git commit of the executed workflow") 116 | parser.add_argument( 117 | "--workflow_version", default='unknown', 118 | help="Workflow version") 119 | return parser 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = argparser() 124 | main(parser.parse_args()) 125 | -------------------------------------------------------------------------------- /bin/report_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create tables for the report.""" 3 | from bokeh.models import Title 4 | from ezcharts.plots.distribution import histplot 5 | import pandas as pd 6 | 7 | # PLOTS 8 | 9 | # The SeqSummary from 
ezcharts.components.fastcat cannot be used. 10 | # It groups data into bins, but from the real time analysis output 11 | # the input data is already grouped into bins. 12 | # Use weights of histplot for y axis. 13 | 14 | 15 | def read_quality_plot(seq_summary, min_qual=4, max_qual=30, title='Read quality'): 16 | """Create read quality summary plot.""" 17 | df = pd.DataFrame.from_dict(seq_summary['qual'].items()) 18 | df.columns = ['mean_quality', 'counts'] 19 | df['mean_quality'] = df['mean_quality'].astype('float') 20 | plt = histplot( 21 | data=df['mean_quality'], 22 | bins=len(df), 23 | weights=list(df['counts']) 24 | ) 25 | plt._fig.add_layout( 26 | Title(text=title, text_font_size="1.5em"), 27 | 'above' 28 | ) 29 | plt._fig.xaxis.axis_label = "Quality score" 30 | plt._fig.yaxis.axis_label = "Number of reads" 31 | plt._fig.x_range.start = min_qual 32 | plt._fig.x_range.end = max_qual 33 | return plt 34 | 35 | 36 | def read_length_plot(seq_summary, title='Read length'): 37 | """Create a read length plot.""" 38 | df = pd.DataFrame.from_dict(seq_summary['len'].items()) 39 | df.columns = ['read_length', 'counts'] 40 | df['read_length'] = df['read_length'].astype('uint64') 41 | df['read_length'] = df['read_length'] / 1000 42 | plt = histplot( 43 | data=df['read_length'], 44 | bins=len(df), 45 | weights=list(df['counts'])) 46 | plt._fig.add_layout( 47 | Title(text=title, text_font_size="1.5em"), 48 | 'above' 49 | ) 50 | plt._fig.x_range.start = 0 51 | plt._fig.xaxis.axis_label = "Read length / kb" 52 | plt._fig.yaxis.axis_label = "Number of reads" 53 | return plt 54 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint of pseudo-package for all the code used in the workflow.""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components = dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | except ModuleNotFoundError as e: 44 | # if imports cannot be satisifed, refuse to add the component 45 | # 
rather than exploding
46 |             logger.warning(f"Could not load {name} due to missing module {e.name}")
47 |             continue
48 | 
49 |         # if there's a main() and an argparser(), that's good enough for us.
50 |         try:
51 |             req = "main", "argparser"
52 |             if all(callable(getattr(mod, x)) for x in req):
53 |                 components[name] = mod
54 |         except Exception:
55 |             pass
56 |     return components
57 | 
58 | 
59 | def cli():
60 |     """Run workflow entry points."""
61 |     logger = get_main_logger(_package_name)
62 |     logger.info("Bootstrapping CLI.")
63 |     parser = argparse.ArgumentParser(
64 |         'wf-glue',
65 |         parents=[_log_level()],
66 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
67 | 
68 |     parser.add_argument(
69 |         '-v', '--version', action='version',
70 |         version='%(prog)s {}'.format(__version__))
71 | 
72 |     subparsers = parser.add_subparsers(
73 |         title='subcommands', description='valid commands',
74 |         help='additional help', dest='command')
75 |     subparsers.required = True
76 | 
77 |     # importing everything can take time, try to shortcut
78 |     if len(sys.argv) > 1:
79 |         components = get_components(allowed_components=[sys.argv[1]])
80 |         if sys.argv[1] not in components:
81 |             logger.warning("Importing all modules, this may take some time.")
82 |             components = get_components()
83 |     else:
84 |         components = get_components()
85 | 
86 |     # add all module parsers to main CLI
87 |     for name, module in components.items():
88 |         p = subparsers.add_parser(
89 |             name.split(".")[-1], parents=[module.argparser()])
90 |         p.set_defaults(func=module.main)
91 | 
92 |     args = parser.parse_args()
93 | 
94 |     logger.info("Starting entrypoint.")
95 |     args.func(args)
96 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/models/__init__.py:
--------------------------------------------------------------------------------
1 | """A collection of scripts for results models."""
2 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/models/common.py:
--------------------------------------------------------------------------------
1 | """Common model classes used across all workflows."""
2 | from dataclasses import asdict, dataclass, field
3 | from enum import Enum
4 | import json
5 | from pathlib import Path
6 | from typing import Any, Dict, List, Optional
7 | 
8 | from ..util import get_named_logger  # noqa: ABS101
9 | 
10 | logger = get_named_logger("Models")
11 | 
12 | 
13 | @dataclass
14 | class WorkflowBaseModel:
15 |     """Shared behaviour for workflow results models."""
16 | 
17 |     def get(
18 |             self,
19 |             field_name: str,
20 |             title: bool = True,
21 |             **kwargs
22 |     ):
23 |         """Get reportable field tuple."""
24 |         field_info = self.__dataclass_fields__.get(field_name)
25 |         # provide an empty string default title to minimise drama
26 |         field_title = field_info.metadata.get("title", "")
27 |         value = self.get_reportable_value(field_name=field_name, **kwargs)
28 |         if title:
29 |             return (field_title, value)
30 |         return value
31 | 
32 |     def get_reportable_value(
33 |             self,
34 |             field_name: str,
35 |             *,
36 |             decimal_places: int = None,
37 |             default_value: str = "N/A") -> Optional[str]:
38 |         """Get the value of a field and make it reportable."""
39 |         # Get the field info using the field name
40 |         field_info = self.__dataclass_fields__.get(field_name)
41 |         if field_info is None:
42 |             raise AttributeError(
43 |                 f"{field_name!r} is not a field on {self.__class__.__name__}"
44 |             )
45 | 
46 |         value = getattr(self, field_name)
47 | 
48 |         if value is None:
49 |             return default_value
50 | 
51 |         if 
isinstance(value, (int, float)): 52 | if decimal_places: 53 | value = round(value, decimal_places) 54 | if value < 0.0001 or value > 99999999: 55 | value = f"{value:.2E}" 56 | else: 57 | if decimal_places: 58 | raise TypeError( 59 | "decimal_places is not a supported argument for a non-numeric.") 60 | 61 | unit = field_info.metadata.get('unit') 62 | 63 | if unit: 64 | return f"{value} {unit}" 65 | 66 | return str(value) 67 | 68 | 69 | class SampleType(str, Enum): 70 | """The type of the sample.""" 71 | 72 | no_template_control = "no_template_control" 73 | positive_control = "positive_control" 74 | negative_control = "negative_control" 75 | test_sample = "test_sample" 76 | 77 | def friendly_name(self): 78 | """Convert sample type to string.""" 79 | return self.name.replace("_", " ").capitalize() 80 | 81 | 82 | @dataclass 83 | class SampleIdentifier: 84 | """Additional identifiers for a sample.""" 85 | 86 | name: str = field( 87 | metadata={ 88 | "title": "Identifier name", 89 | "Description": "The name of the sample identifier"}) 90 | value: str = field( 91 | metadata={ 92 | "title": "Identifier value", 93 | "Description": "The value of the sample identifier"}) 94 | 95 | 96 | @dataclass 97 | class CheckResult: 98 | """ 99 | A result of some check the workflow has performed. 100 | 101 | This can be at sample or workflow level. 102 | """ 103 | 104 | check_category: str = field( 105 | metadata={ 106 | "title": "Check category", 107 | "description": "The category of the check"}) 108 | check_name: str = field( 109 | metadata={ 110 | "title": "Check name", 111 | "description": "The name of the check"}) 112 | check_pass: bool = field( 113 | metadata={ 114 | "title": "Check pass", 115 | "description": "If true the check has passed"}) 116 | check_threshold: str | None = field( 117 | default=None, metadata={ 118 | "title": "Check threshold", 119 | "description": "The threshold for the check, useful for reporting later"}) 120 | 121 | categories = {} 122 | 123 | def friendly_check_category(self): 124 | """Convert category to string.""" 125 | if self.check_category not in self.categories: 126 | raise ValueError(f"{self.check_category} has no friendly name") 127 | return self.categories[self.check_category] 128 | 129 | def friendly_check_name(self): 130 | """Convert check name to string.""" 131 | return self.check_name.replace("_", " ").capitalize() 132 | 133 | 134 | @dataclass 135 | class ResultsContents: 136 | """Placeholder class for results contents.""" 137 | 138 | pass 139 | 140 | 141 | @dataclass 142 | class Sample: 143 | """A sample sheet entry and its corresponding checks and related results.""" 144 | 145 | alias: str = field( 146 | metadata={ 147 | "title": "Sample alias", 148 | "description": "The alias for the sample given by the user"}) 149 | sample_type: SampleType = field( 150 | metadata={ 151 | "title": "Sample type", 152 | "description": "The type of the sample"}) 153 | sample_pass: bool = field( 154 | metadata={ 155 | "title": "Sample pass", 156 | "description": "If true the sample has passed workflow checks"}) 157 | barcode: str | None = field( 158 | default=None, 159 | metadata={ 160 | "title": "Sample barcode", 161 | "description": "The physical barcode assigned to the sample"}) 162 | additional_identifiers: List[SampleIdentifier] = field( 163 | default_factory=list, metadata={ 164 | "title": "Additional sample identifiers", 165 | "description": "Additional identifiers for the sample"}) 166 | sample_checks: list[CheckResult] = field( 167 | default_factory=list, metadata={ 168 | 
"title": "Sample checks", 169 | "description": "An array of checks performed on the sample"}) 170 | results: ResultsContents | None = field( 171 | default=None, metadata={ 172 | "title": "Sample results", 173 | "description": "Further specific workflow results for this sample"}) 174 | config: Dict[str, Any] | None = field( 175 | default=None, metadata={ 176 | "title": "Sample configuration", 177 | "description": """Sample specific config parameters 178 | used for running analysis"""}) 179 | 180 | def __post_init__(self): 181 | """Determine overall status for a sample given the individual check results.""" 182 | self.sample_pass = all( 183 | check.check_pass for check in self.sample_checks) 184 | 185 | def get_sample_identifier(self, sample_identifier): 186 | """Get a sample identifier given the identifier name.""" 187 | for identifier in self.additional_identifiers: 188 | if identifier.name == sample_identifier: 189 | return identifier.value 190 | raise KeyError("Sample identifier not found") 191 | 192 | def set_sample_identifier(self, name, value): 193 | """Set a sample identifier.""" 194 | sample_identifier = SampleIdentifier( 195 | name=name, 196 | value=value) 197 | self.additional_identifiers.append(sample_identifier) 198 | return self.additional_identifiers 199 | 200 | def to_json(self, filename): 201 | """Save class as JSON.""" 202 | with open(filename, 'w') as f: 203 | json.dump(asdict(self), f, default=str, indent=2) 204 | 205 | def get_reportable_qc_status(self, max_criteria=4): 206 | """Store global status of the sample and list of QC criteria to show. 207 | 208 | :params max_criteria: Maximum number of criteria to be reported. 209 | """ 210 | # Store global status: pass/ failed 211 | qc_global_status = {"status": self.sample_pass, "scope": "QC status"} 212 | qc_criteria = [] 213 | if self.sample_pass: 214 | qc_criteria.append( 215 | {"status": self.sample_pass, "scope": "All acceptance criteria met"} 216 | ) 217 | else: 218 | # Report failed criteria until a maximum value 219 | for qc in self.sample_checks: 220 | if not qc.check_pass: # append criteria if failed 221 | qc_criteria.append( 222 | { 223 | "status": qc.check_pass, 224 | "category": qc.friendly_check_category(), 225 | "scope": qc.friendly_check_name(), 226 | } 227 | ) 228 | if len(qc_criteria) > max_criteria: 229 | # Replace all the failed criteria, with a sentence with the number 230 | # instead of listing all of them. 231 | # Set status to False as more than max_criteria are failed. 232 | qc_criteria = [ 233 | { 234 | "status": False, 235 | "scope": f"{len(qc_criteria)} acceptance criteria", 236 | }, 237 | ] 238 | return qc_global_status, qc_criteria 239 | 240 | 241 | @dataclass 242 | class RunStats: 243 | """Basic run statistics for the entire run.""" 244 | 245 | total_reads: int | None = field( 246 | default=None, metadata={ 247 | "title": "Total reads", 248 | "description": "Total number of reads on run"}) 249 | total_ambiguous_reads: int | None = field( 250 | default=None, metadata={ 251 | "title": "Total ambiguous reads", 252 | "description": "Number of reads of unknown provenance"}) 253 | total_unaligned_reads: int | None = field( 254 | default=None, metadata={ 255 | "title": "Total unaligned reads", 256 | "description": "Number of unaligned reads"}) 257 | 258 | 259 | @dataclass 260 | class WorkflowResult(WorkflowBaseModel): 261 | """ 262 | Definition for results that will be returned by this workflow. 
263 | 264 | This structure will be passed through by Gizmo speaking clients 265 | as WorkflowInstance.results. 266 | """ 267 | 268 | samples: list[Sample] = field( 269 | metadata={ 270 | "title": "Samples", 271 | "description": "Samples in this workflow instance"}) 272 | workflow_pass: bool | None = field( 273 | default=None, metadata={ 274 | "title": "Workflow pass", 275 | "description": "True if this workflow instance passes all checks"}) 276 | workflow_checks: list[CheckResult] = field( 277 | default_factory=list, metadata={ 278 | "title": "Workflow checks", 279 | "description": "An array of checks performed on the workflow instance"}) 280 | run_stats: RunStats | None = field( 281 | default=None, metadata={ 282 | "title": "Samples", 283 | "description": "Basic run statistics"}) 284 | client_fields: dict[str, Any] | None = field( 285 | default_factory=dict, metadata={ 286 | "title": "Client fields", 287 | "description": "Arbitrary key-value pairs provided by the client"}) 288 | versions: dict[str, Any] | None = field( 289 | default_factory=dict, metadata={ 290 | "title": "Analysis tool versions", 291 | "description": """Key-value pairs collecting the 292 | software used and the corresponding versions"""}) 293 | params: dict[str, Any] | None = field( 294 | default_factory=dict, metadata={ 295 | "title": "Pertinent parameters", 296 | "description": """Key-value pairs with the 297 | options chosen by the user"""}) 298 | 299 | def load_client_fields(self, filename): 300 | """Load client fields.""" 301 | with open(filename) as f: 302 | try: 303 | client_fields = json.loads(f.read()) 304 | # convert any lists into strings for display 305 | for key, value in client_fields.items(): 306 | if isinstance(value, list): 307 | client_fields[key] = ', '.join(value) 308 | except json.decoder.JSONDecodeError: 309 | client_fields = {"error": "Error parsing client fields file."} 310 | 311 | self.client_fields = client_fields 312 | return self.client_fields 313 | 314 | def load_params(self, params_json, keep=None): 315 | """Create a workflow params dict.""" 316 | params_json = Path(params_json) 317 | if keep is None: 318 | keep = [] 319 | if not params_json.is_file(): 320 | raise FileNotFoundError(f"No such file: {params_json}") 321 | with open(params_json, "r") as f: 322 | try: 323 | params_dict = json.loads(f.read()) 324 | self.params = { 325 | k: v for k, v in params_dict.items() if k in set(keep) 326 | } 327 | return self.params 328 | except ValueError: 329 | raise ValueError(f"Invalid JSON file: {params_json}") 330 | 331 | def load_versions(self, versions_path): 332 | """Create a version list of dict.""" 333 | versions_path = Path(versions_path) 334 | if not versions_path.exists(): 335 | raise FileNotFoundError(f"No such file: {versions_path}") 336 | 337 | if versions_path.is_dir(): 338 | version_files = [ 339 | vp for vp in versions_path.iterdir() if vp.is_file() 340 | ] 341 | elif versions_path.is_file(): 342 | version_files = [versions_path] 343 | else: 344 | raise IOError(f"{versions_path} should be either a directory or a file") 345 | for fname in version_files: 346 | versions = {} 347 | with open(fname, "r", encoding="utf-8") as fh: 348 | for line in fh.readlines(): 349 | name, version = line.strip().split(",") 350 | versions[name] = version 351 | self.versions = versions 352 | return self.versions 353 | 354 | def to_json(self, filename): 355 | """Save class as JSON.""" 356 | with open(filename, 'w') as f: 357 | json.dump(asdict(self), f, default=str, indent=2) 358 | 
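A minimal sketch (not part of the workflow itself) of how the models above compose, assuming `bin/` is on `PYTHONPATH` so that the `workflow_glue` package is importable; the check name, alias and output path are illustrative only:

from workflow_glue.models.common import (
    CheckResult, Sample, SampleType, WorkflowResult)

checks = [
    CheckResult(
        check_category="sample_sheet",   # illustrative category
        check_name="barcode_found",      # illustrative check name
        check_pass=True),
]
sample = Sample(
    alias="sample_1",                    # illustrative alias
    sample_type=SampleType.test_sample,
    sample_pass=True,                    # recomputed from sample_checks in __post_init__
    sample_checks=checks)
result = WorkflowResult(samples=[sample])
result.to_json("results.json")           # illustrative output path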
-------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 
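    # For example (illustrative): two BAMs whose headers both contain
    #   @SQ SN:chr20 LN:64444167
    # are treated as consistent even if their SQ.UR fields differ, while a
    # third file with no @SQ lines at all would flag `mixed_headers`.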
26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
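# For example, the UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF; read with the
# default codec, the first column name would come back as '\ufeffbarcode'
# rather than 'barcode'.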
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | from ..util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | # Common variables 11 | REF_EXTENSIONS = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 12 | DATA_TYPES_LISTS = { 13 | "bam": ["bam"], 14 | "bam_idx": ["bam.bai"], 15 | "cram": ["cram"], 16 | "cram_idx": ["cram.crai"], 17 | "vcf": ["vcf", "vcf.gz"], 18 | "vcf_idx": ["vcf.gz.tbi", "vcf.gz.csi"], 19 | "bcf": ["bcf"], 20 | "bcf_idx": ["bcf.csi"], 21 | "gtf": ["gtf", "gtf.gz"], 22 | "gtf_idx": ["gtf.gz.tbi"], 23 | "gff": ["gff", "gff.gz", "gff3", "gff3.gz"], 24 | "gff_idx": ["gff.gz.tbi", "gff3.gz.tbi"], 25 | "bed": ["bed", "bed.gz"], 26 | "bed_idx": ["bed.gz.tbi"], 27 | "bedmethyl": ["bedmethyl", "bedmethyl.gz"], 28 | "bedmethyl_idx": ["bedmethyl.gz.tbi"], 29 | "ref": REF_EXTENSIONS, 30 | } 31 | DATA_TYPES = { 32 | ext: ftype for ftype, extlist in DATA_TYPES_LISTS.items() for ext in extlist 33 | } 34 | 35 | # Data by idx 36 | DATA_INDEXES_FMT = { 37 | fmt: f"{fmt}_idx" for fmt, dtype in DATA_TYPES.items() if "_idx" not in dtype 38 | } 39 | 40 | # Assign each format to its index 41 | INDEX_PAIRS = { 42 | "bam": ("bai",), 43 | "cram": ("crai",), 44 | "vcf": ("tbi", "csi"), 45 | "bcf": ("csi",), 46 | "bed": ("tbi",), 47 | "bedmethyl": ("tbi",), 48 | "gff": ("tbi",), 49 | "gtf": ("tbi",), 50 | } 51 | 52 | 53 | class TrackBuilder: 54 | """Class that builds an IGV track.""" 55 | 56 | def __init__(self): 57 | """Initialize properties for interval track.""" 58 | # Reference properties 59 | self.ref = None 60 | self.fai = None 61 | self.gzi = None 62 | # Samples info 63 | self.samples = {} 64 | # Track properties 65 | self.igv_json = {"reference": {}, "tracks": []} 66 | self.track_type = { 67 | "bam": "alignment", 68 | "cram": "alignment", 69 | "bcf": "variant", 70 | "vcf": "variant", 71 | "bedmethyl": "annotation", 72 | "bed": "annotation", 73 | "gtf": "annotation", 74 | "gff": "annotation", 75 | } 76 | # Here we save aliases of file formats that IGV.js 77 | # wants and that do not match the input file extension. 78 | self.igv_fmt_alias = {"gff": "gff3"} 79 | # lookup of extra options for each data type 80 | self.extra_opts_lookups = { 81 | "bam": {}, 82 | "cram": {}, 83 | "bcf": {}, 84 | "vcf": {}, 85 | "bed": {}, 86 | "bedmethyl": {}, 87 | "gtf": {}, 88 | "gff": {}, 89 | } 90 | 91 | def add_ref(self, ref=None): 92 | """Add reference file, unless already defined.""" 93 | if self.ref: 94 | raise Exception( 95 | f"Reference genome has already been set to {self.ref}.\n" 96 | "Only one reference FASTA file is expected." 
97 | ) 98 | else: 99 | self.ref = ref 100 | 101 | def add_ref_index(self, ref_index=None): 102 | """Add reference index if valid.""" 103 | basename = Path(self.ref).name 104 | idx_basename = Path(ref_index).name 105 | if idx_basename == f"{basename}.fai": 106 | self.fai = ref_index 107 | if idx_basename == f"{basename}.gzi" and basename.endswith(".gz"): 108 | self.gzi = ref_index 109 | 110 | def parse_fnames(self, fofn): 111 | """Parse list with filenames and return them grouped. 112 | 113 | :param fofn: File with list of file names (one per line) 114 | """ 115 | tmp_samples = {} 116 | with open(fofn, "r") as f: 117 | for line in f: 118 | # If the line contains the sample name, prepare the data structure 119 | if "," in line: 120 | sample, fname = line.strip().split(",") 121 | if sample not in tmp_samples: 122 | tmp_samples[sample] = SampleBundle(sample=sample) 123 | tmp_samples[sample].append(fname) 124 | else: 125 | # Otherwise, assign everything to NO_SAMPLE 126 | # Files will still be displayed, but in no specific order. 127 | fname = line.strip() 128 | if any(fname.endswith(ext) for ext in REF_EXTENSIONS): 129 | self.add_ref(ref=fname) 130 | elif fname.endswith(".fai") or fname.endswith(".gzi"): 131 | self.add_ref_index(ref_index=fname) 132 | else: 133 | if "NO_SAMPLE" not in tmp_samples.keys(): 134 | tmp_samples["NO_SAMPLE"] = SampleBundle(sample="NO_SAMPLE") 135 | tmp_samples["NO_SAMPLE"].append(fname) 136 | # Re-order samples in dict and add them to the list, leaving 137 | # NO_SAMPLE as last 138 | sorted_samples = ( 139 | sorted([sample for sample in tmp_samples.keys() if sample != 'NO_SAMPLE']) 140 | ) 141 | if 'NO_SAMPLE' in tmp_samples.keys(): 142 | sorted_samples += ['NO_SAMPLE'] 143 | for sample in sorted_samples: 144 | self.samples[sample] = tmp_samples[sample] 145 | 146 | def build_igv_json(self): 147 | """Ensure there is a reference genome.""" 148 | if not self.ref: 149 | raise ValueError( 150 | "No reference file (i.e. file ending in one of " 151 | f"{REF_EXTENSIONS} was found)." 152 | ) 153 | # Evaluate that a bgzipped reference has the appropriate index. 154 | if self.ref.endswith(".gz") and not self.gzi: 155 | raise ValueError(f"GZI reference index for {self.ref} not found.") 156 | 157 | # Create the base track if there is a reference genome. 158 | self.igv_json["reference"] = { 159 | "id": "ref", 160 | "name": "ref", 161 | "wholeGenomeView": False, 162 | "fastaURL": self.ref, 163 | } 164 | if self.fai: 165 | self.igv_json["reference"]["indexURL"] = self.fai 166 | if self.gzi: 167 | self.igv_json["reference"]["compressedIndexURL"] = self.gzi 168 | 169 | # Add samples data now 170 | for sample, bundle in self.samples.items(): 171 | bundle.process_data() 172 | # Add the bundled data to the tracks 173 | for fname, index, file_fmt in bundle.data_bundles: 174 | self.add_track( 175 | fname, 176 | file_fmt, 177 | sample_name=sample if sample != "NO_SAMPLE" else None, 178 | index=index, 179 | extra_opts=self.extra_opts_lookups[file_fmt], 180 | ) 181 | 182 | def add_track(self, infile, file_fmt, sample_name=None, index=None, extra_opts={}): 183 | """Add a track to an IGV json. 184 | 185 | This function takes an input file, an optional index file, its 186 | file format and additional extra options for the track. 
187 | 188 | :param infile: input file to create a track for 189 | :param file_fmt: input file track type 190 | :param sample_name: Name of the sample to display in the track name 191 | :param index: index for the input file 192 | :param extra_opts: dict of extra options for the track 193 | :return: dict with track options 194 | """ 195 | # Define track name depending on whether the sample ID is provided 196 | track_name = Path(infile).name 197 | if sample_name: 198 | track_name = f"{sample_name}: {Path(infile).name}" 199 | track_dict = { 200 | "name": track_name, 201 | "type": self.track_type[file_fmt], 202 | "format": self.igv_fmt_alias.get(file_fmt, file_fmt), 203 | "url": infile, 204 | } 205 | # add the index, if present 206 | if index: 207 | track_dict["indexURL"] = index 208 | track_dict.update(extra_opts) 209 | self.igv_json["tracks"] += [track_dict] 210 | 211 | def add_locus(self, locus): 212 | """Add target locus to the json.""" 213 | self.igv_json["locus"] = locus 214 | 215 | def add_extra_opts( 216 | self, 217 | extra_alignment_opts=None, 218 | extra_variant_opts=None, 219 | extra_interval_opts=None, 220 | ): 221 | """Import extra options from json files.""" 222 | if extra_alignment_opts is not None: 223 | with open(extra_alignment_opts, "r") as f: 224 | extra_alignment_opts_json = json.load(f) 225 | for ftype in ["bam", "cram"]: 226 | self.extra_opts_lookups[ftype] = extra_alignment_opts_json 227 | if extra_variant_opts is not None: 228 | with open(extra_variant_opts, "r") as f: 229 | extra_variant_opts_json = json.load(f) 230 | for ftype in ["vcf", "bcf"]: 231 | self.extra_opts_lookups[ftype] = extra_variant_opts_json 232 | if extra_interval_opts is not None: 233 | with open(extra_interval_opts, "r") as f: 234 | extra_interval_opts_json = json.load(f) 235 | for ftype in ["bed", "bedmethyl", "gff", "gtf"]: 236 | self.extra_opts_lookups[ftype] = extra_interval_opts_json 237 | 238 | 239 | class SampleBundle: 240 | """Sample data class. 241 | 242 | This class stores the data for multiple tracks for a 243 | single sample, then is used to generate a collection of 244 | IGV.js tracks. 
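
    For example (illustrative file names), a bundle containing
    "sample_1.cram", "sample_1.cram.crai", "sample_1.vcf.gz" and
    "sample_1.vcf.gz.tbi" is resolved into an alignment entry and a
    variant entry, each carrying its index, which TrackBuilder then
    turns into IGV tracks with indexURL set.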
245 | """ 246 | 247 | def __init__(self, sample): 248 | """Initialize properties for a sample.""" 249 | self.sample = sample 250 | self.infiles = [] 251 | self.data_bundles = [] 252 | 253 | def append(self, fname): 254 | """Add a new raw file to the bundle.""" 255 | self.infiles.append(fname) 256 | 257 | def process_data(self): 258 | """Process input files.""" 259 | fbasenames = [Path(fname).name for fname in self.infiles] 260 | ftypes = [self.classify_files(bname) for bname in fbasenames] 261 | self.data_bundles = self.pair_file_with_index(self.infiles, fbasenames, ftypes) 262 | 263 | @staticmethod 264 | def classify_files(fname): 265 | """Classify inputs.""" 266 | for extension, ftype in DATA_TYPES.items(): 267 | if fname.endswith(f".{extension}"): 268 | return ftype 269 | 270 | @staticmethod 271 | def pair_file_with_index(infiles, fbasenames, ftypes): 272 | """Clump files with their indexes.""" 273 | # Collect data by group type 274 | groups = {ftype: {"basenames": [], "paths": []} for ftype in set(ftypes)} 275 | # Group each file by its type and base name 276 | for ftype, fbasename, fname in zip(ftypes, fbasenames, infiles): 277 | groups[ftype]["basenames"] += [fbasename] 278 | groups[ftype]["paths"] += [fname] 279 | 280 | # Output bundles 281 | outputs = [] 282 | # Start matching the variant files 283 | for ftype, itype in DATA_INDEXES_FMT.items(): 284 | # Ignore file formats that are not present in the bundle. 285 | if ftype not in groups: 286 | continue 287 | # Make pairs of files. 288 | for fbasename, fpath in zip( 289 | groups[ftype]["basenames"], groups[ftype]["paths"] 290 | ): 291 | # Construct potential index file names based on basename of input files 292 | idx_basenames = set( 293 | [f"{fbasename}.{idx}" for idx in INDEX_PAIRS[ftype]] 294 | ) 295 | # Find which indexes are available 296 | if itype in groups.keys(): 297 | idx_basenames = list( 298 | idx_basenames.intersection(set(groups[itype]["basenames"])) 299 | ) 300 | # Get the first index (if there are more than one, 301 | # it doesn't matter) 302 | bname = idx_basenames[0] 303 | idx_fn = groups[itype]["paths"][ 304 | groups[itype]["basenames"].index(bname) 305 | ] 306 | outputs.append([fpath, idx_fn, ftype]) 307 | # Otherwise, return only the simple file. 
308 | else: 309 | outputs.append([fpath, None, ftype]) 310 | return outputs 311 | 312 | 313 | def main(args): 314 | """Run the entry point.""" 315 | logger = get_named_logger("configIGV") 316 | 317 | # parse the FOFN 318 | igv_builder = TrackBuilder() 319 | 320 | # Add the additional track configurations 321 | igv_builder.add_extra_opts( 322 | extra_alignment_opts=args.extra_alignment_opts, 323 | extra_variant_opts=args.extra_variant_opts, 324 | extra_interval_opts=args.extra_interval_opts 325 | ) 326 | 327 | # Import files 328 | igv_builder.parse_fnames(args.fofn) 329 | 330 | # initialise the IGV options dict with the reference options 331 | igv_builder.build_igv_json() 332 | 333 | # Add locus information 334 | if args.locus is not None: 335 | igv_builder.add_locus(args.locus) 336 | 337 | json.dump(igv_builder.igv_json, sys.stdout, indent=4) 338 | 339 | logger.info("Printed IGV config JSON to STDOUT.") 340 | 341 | 342 | def argparser(): 343 | """Argument parser for entrypoint.""" 344 | parser = wf_parser("configure_igv") 345 | parser.add_argument( 346 | "--fofn", 347 | required=True, 348 | help=( 349 | "File with list of names of reference / XAM / VCF files and indices " 350 | "(one filename per line)" 351 | ), 352 | ) 353 | parser.add_argument( 354 | "--locus", 355 | help="Locus string to set initial genomic coordinates to display in IGV", 356 | ) 357 | parser.add_argument( 358 | "--extra-alignment-opts", 359 | help="JSON file with extra options for alignment tracks", 360 | ) 361 | parser.add_argument( 362 | "--extra-variant-opts", 363 | help="JSON file with extra options for variant tracks", 364 | ) 365 | parser.add_argument( 366 | "--extra_interval_opts", 367 | help="JSON file with extra options for interval tracks", 368 | ) 369 | return parser 370 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for 
entrypoint."""
52 |     parser = wf_parser("get_max_depth_locus")
53 |     parser.add_argument(
54 |         "depths_bed",
55 |         type=Path,
56 |         help="path to mosdepth regions depth file (can be compressed)",
57 |     )
58 |     parser.add_argument(
59 |         "locus_size", type=int, help="size of the locus in basepairs (e.g. '2000')"
60 |     )
61 |     return parser
62 | 
--------------------------------------------------------------------------------
/bin/workflow_glue/wfg_helpers/reheader_samstream.py:
--------------------------------------------------------------------------------
1 | """Reheader a SAM in a stream.
2 | 
3 | When using the bam2fq -> minimap2 pattern for (re)aligning BAM data, we
4 | lose any existing RG and PG headers. This is particularly egregious when
5 | handling basecalled data as lines related to dorado basecalling settings
6 | as well as dorado RG headers are lost; orphaning RG tags in the reads.
7 | This is problematic for downstream analyses that would like to read the
8 | XAM header to intelligently determine how to handle the reads based on
9 | the basecaller model and basecaller configuration.
10 | 
11 | This script handles:
12 |   - Inserting RG, PG and CO lines from an existing XAM header into the
13 |     header of the SAM emitted from minimap2's alignment stream
14 |   - Inserting a PG header to indicate that a call to bam2fq was made
15 |   - Updating the first streamed PG.PP parent tag with the last PG.ID
16 |     of the existing XAM header to maintain a chain of custody
17 |   - Updating any streamed PG.ID (and PG.PP) tags to avoid collisions
18 |     with inserted PG.ID
19 | 
20 | Handling collisions may seem like overkill, but it is anticipated that
21 | this script will be called immediately after minimap2; any previous
22 | attempt to use minimap2 will lead to ambiguity. This would be the
23 | expected case where users have used wf-basecalling or wf-alignment to
24 | align a set of reads, only to realign them to another reference (e.g.
25 | via wf-human-variation). Arguably, we should remove older references to
26 | minimap2 as they will have been invalidated by the call to bam2fq, but
27 | removing PG records and sticking the PG chain back together seems more
28 | fraught with annoying future bugs than simply resolving conflicts.
29 | 
30 | This script will explode on a stream that contains:
31 |   - PG lines in the original header where the last PG in the chain is
32 |     ambiguous, or where the parent PP IDs are not injective
33 |   - PG lines in the stream that do not appear in the order of their
34 |     chain (that is, if a PG.PP refers to a PG.ID that has not been
35 |     encountered yet)
36 | 
37 | SQ lines are retained after an HD line. That is to say, the most recent
38 | set of SQ lines observed after an HD will appear in the final output.
39 | SQ, RG, PG and CO lines are emitted as a group together, with elements
40 | written out in the order observed.
41 | 
42 | PG lines are naively appended to the last PG element in the chain. No
43 | attempt is made to keep multiple program chains intact as this can lead
44 | to bloated headers. Broken PG metadata is a known problem (see
45 | samtools/hts-specs#275) but one that is preferable to headers that
46 | become unwieldy to process: there IS an upper limit to a SAM
47 | header's size after all.
48 | 
49 | This script takes advantage of minimap2's SAM output to immediately
50 | reheader the stream before any downstream calls to other programs pollute
51 | the PG header.
This script is a little overkill but attempts to be robust 52 | with handling PG collisions and more obviously encapsulates reheadering 53 | behaviour, and leaves some room to do more clever things as necessary. 54 | """ 55 | from shutil import copyfileobj 56 | import sys 57 | 58 | from ..util import wf_parser # noqa: ABS101 59 | 60 | 61 | class SamHeader: 62 | """An overkill container to manage merging PG lines in SAM headers. 63 | 64 | Collision handling is simple. If a PG.ID is duplicated by the stream 65 | then we add a suffix to its name and keep an eye out for the 66 | corresponding PG.PP later. We assume that headers emitted by the 67 | stream are chronological because this script should not be called as 68 | part of any complicated pipework other than immediately following 69 | minimap2. 70 | """ 71 | 72 | def __init__(self): 73 | """Initialise a collision aware PG container.""" 74 | self.remapped_pgids = {} 75 | self.collision_suffix = 0 76 | 77 | # Default HD, in case the new stream does not provide one 78 | self.hd = "@HD\tVN:1.6\tSO:unknown" 79 | 80 | # We'll merge RG, CO and PG 81 | self.rg_records = [] 82 | self.co_records = [] 83 | self.pg_records = [] 84 | 85 | # We keep the most recently observed block of SQ records by 86 | # resetting SQ on the first SQ seen after non-SQ. We cannot 87 | # rely on HD being emitted (as minimap2 does not do this!) 88 | self.sq_records = [] 89 | self.reset_sq = False 90 | 91 | self.observed_rgids = set() 92 | self.observed_pgids = set() 93 | self.last_pgid = None 94 | 95 | @staticmethod 96 | def str_to_record(line): 97 | """Return an appropriate struct for a given string record.""" 98 | try: 99 | record_type, record_data = line.strip().split('\t', 1) 100 | except ValueError: 101 | raise Exception(f"Record type could not be determined: {line}") 102 | 103 | if len(record_type) > 3: 104 | raise Exception(f"Record type malformed: {record_type}") 105 | 106 | record = {} 107 | if record_type in ["@HD", "@CO", "@SQ"]: 108 | return record_type, record_data 109 | elif record_type in ["@RG", "@PG"]: 110 | for field in record_data.strip().split('\t'): 111 | k, v = field.split(':', 1) 112 | if len(k) == 2 and k[0].isalpha() and k[1].isalnum(): 113 | record[k] = v 114 | else: 115 | raise Exception(f"{record_type} with invalid tag: '{k}'") 116 | if "ID" not in record: 117 | raise Exception(f"{record_type} with no ID: {record_data}") 118 | return record_type, record 119 | else: 120 | raise Exception(f"Unknown record type: {line}") 121 | 122 | @staticmethod 123 | def record_to_str(record_type, record_data): 124 | """Form a string from a header record.""" 125 | if record_type in ["@PG", "@RG"]: 126 | tags = [f"{k}:{v}" for k, v in record_data.items()] 127 | return f"{record_type}\t" + '\t'.join(tags) 128 | elif record_type in ["@SQ", "@CO"]: 129 | return f"{record_type}\t{record_data}" 130 | 131 | @staticmethod 132 | def resolve_pg_chain(pg_dicts): 133 | """Check links between PG.ID and PP.ID, exploding if inconsistent.""" 134 | links = {} 135 | # Document links between all ID and their PP parent 136 | pgids_without_ppid = 0 137 | for pgd in pg_dicts: 138 | pgid = pgd["ID"] 139 | pgpp = pgd.get("PP") 140 | links[pgid] = pgpp 141 | if pgpp is None: 142 | pgids_without_ppid += 1 143 | if len(links) > 0: 144 | # If there are links, exactly one should have a None parent 145 | # to indicate the first PG in the chain. Explode if we see 146 | # no head or multiple heads. 
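            # As an illustrative sketch (program names are hypothetical):
            # PG records {ID: basecaller}, {ID: samtools, PP: basecaller} and
            # {ID: minimap2, PP: samtools} yield
            # links = {basecaller: None, samtools: basecaller, minimap2: samtools},
            # i.e. exactly one head (basecaller, the record with no PP) and no cycle.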
147 | if pgids_without_ppid == 0: 148 | raise Exception("PG chain does not have a head.") 149 | elif pgids_without_ppid > 1: 150 | raise Exception("PG chain has multiple heads.") 151 | for source in links: 152 | head = source 153 | path = [head] 154 | while True: 155 | head = links[head] 156 | if head is None: 157 | break 158 | if head in path: 159 | path.append(head) 160 | raise Exception(f"PG chain appears to contain cycle: {path}") 161 | path.append(head) 162 | # This function is only really called to catch any explosions 163 | # but we'll return the links here as it is useful for testing 164 | return links 165 | 166 | def _bump_pg_collider(self): 167 | """Alter the collision suffix after determining a collision.""" 168 | self.collision_suffix += 1 169 | 170 | def _uncollide_pgid(self, pgid): 171 | """Return an uncollided string for a given PG ID.""" 172 | new_pgid = f"{pgid}-{self.collision_suffix}" 173 | self.remapped_pgids[pgid] = new_pgid 174 | self._bump_pg_collider() 175 | return new_pgid 176 | 177 | def add_line(self, line): 178 | """Add a header line to the header.""" 179 | record_type, record = self.str_to_record(line) 180 | 181 | if record_type == "@HD": 182 | self.hd = f"@HD\t{record}" 183 | elif record_type == "@CO": 184 | self.co_records.append(record) 185 | elif record_type == "@SQ": 186 | if self.reset_sq: 187 | self.sq_records = [] 188 | self.reset_sq = False 189 | self.sq_records.append(record) 190 | elif record_type == "@RG": 191 | rgid = record["ID"] 192 | if rgid not in self.observed_rgids: 193 | self.observed_rgids.add(rgid) 194 | self.rg_records.append(record) 195 | elif record not in self.rg_records: 196 | # if rgid has been seen before, abort if this record is different 197 | raise Exception( 198 | f"Duplicate RG with ID '{rgid}' conflicts with previously seen RG with same ID." 
# noqa:E501 199 | ) 200 | elif record_type == "@PG": 201 | pgid = record["ID"] 202 | if pgid in self.observed_pgids: 203 | # collision, rewrite the pgid 204 | pgid = self._uncollide_pgid(pgid) 205 | record["ID"] = pgid 206 | else: 207 | self.observed_pgids.add(pgid) 208 | 209 | # maintain chain 210 | ppid = record.get("PP") 211 | if not ppid: 212 | # record has no parent, this is either 213 | # - the first record (last_pgid is None) so is the tail 214 | # - an inserted record that needs its parent to be the current tail 215 | if not self.last_pgid: 216 | self.last_pgid = pgid 217 | else: 218 | record["PP"] = self.last_pgid 219 | self.last_pgid = pgid 220 | else: 221 | if ppid not in self.observed_pgids: 222 | raise Exception( 223 | f"Encountered PG.PP '{ppid}' before observing corresponding PG.ID" # noqa:E501 224 | ) 225 | # remap parent id (if needed) 226 | record["PP"] = self.remapped_pgids.get(ppid, ppid) 227 | # set tail to this record 228 | self.last_pgid = pgid 229 | 230 | self.pg_records.append(record) 231 | 232 | if len(self.sq_records) > 0 and record_type != '@SQ': 233 | self.reset_sq = True 234 | 235 | return record 236 | 237 | def write_header(self, fh): 238 | """Write this header to a file handle.""" 239 | self.resolve_pg_chain(self.pg_records) # check PG header 240 | fh.write(f"{self.hd}\n") 241 | for sq in self.sq_records: 242 | fh.write(self.record_to_str("@SQ", sq) + '\n') 243 | for rg in self.rg_records: 244 | fh.write(self.record_to_str("@RG", rg) + '\n') 245 | for pg in self.pg_records: 246 | fh.write(self.record_to_str("@PG", pg) + '\n') 247 | for co in self.co_records: 248 | fh.write(self.record_to_str("@CO", co) + '\n') 249 | 250 | 251 | def reheader_samstream(header_in, stream_in, stream_out, args): 252 | """Run reheader_samstream.""" 253 | # read original header into container 254 | sh = SamHeader() 255 | for line in header_in: 256 | sh.add_line(line) 257 | 258 | # append user provided lines to container 259 | for line in args.insert: 260 | sh.add_line(line) 261 | 262 | # read the header portion of the minimap2 stream 263 | wrote_header = False 264 | for line in stream_in: 265 | if line[0] != '@': 266 | # write out header on first alignment 267 | sh.write_header(stream_out) 268 | wrote_header = True 269 | # and actually write the first alignment 270 | stream_out.write(line) 271 | break 272 | sh.add_line(line) 273 | 274 | # Pass through the rest of the alignments. 275 | # I toyed with a few ways of doing this: 276 | # - A trivial iter over the input file was slow. presumably as we incurred some 277 | # overhead calling read() and write() and decoding more than other methods. 278 | # - os.read/write avoids dealing with higher level python read/write but requires 279 | # file descriptors which rules out non-file-like objects. this made testing more 280 | # annoying as StringIO does not have a file descriptor. we could have mocked fds 281 | # but i was not happy with the discrepancy between real and test execution. 282 | # - copyfileobj with the stream_in.buffer would also avoid some of the higher 283 | # level text handling but would require all tests to provide inputs that have 284 | # an underlying binary buffer. 
it was also not possible to seek the buffer to 285 | # the position of the text stream as we've used next() to iterate over the 286 | # header lines, fixing this would have required rewriting of the header 287 | # handling or keeping track of the position in the stream ourselves which 288 | # just seemed unnecessary overkill given how we expect this program to be used. 289 | # copyfileobj on the text streams is more efficient than merely iterating the file 290 | # and dumping the lines out and seems to do the job. this keeps the code and tests 291 | # simple with minimal additional cost to performance. i anticipate any overhead of 292 | # this program will be dwarfed by that of minimap2/samtools sort anyway. 293 | # increasing the buffer size gave worse performance in my limited testing so we 294 | # leave it as the default here. 295 | copyfileobj(stream_in, stream_out) 296 | 297 | # If there were no alignments, we won't have hit the != '@' case while reading the stream, 298 | # and we won't have written the header out. Write a header if we haven't already. 299 | if not wrote_header: 300 | sh.write_header(stream_out) 301 | 302 | 303 | def argparser(): 304 | """Argument parser for entrypoint.""" 305 | parser = wf_parser("reheader_samstream") 306 | parser.add_argument("header_in") 307 | parser.add_argument("--insert", action="append", default=[]) 308 | return parser 309 | 310 | 311 | def main(args): 312 | """reheader_samstream default entry point.""" 313 | with open(args.header_in) as header_in: 314 | reheader_samstream(header_in, sys.stdin, sys.stdout, args) 315 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $extra_data" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE") 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Helper workflow for basecalling nanopore sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | In brief, this workflow can be used to perform: 2 | 3 | + Basecalling of a directory of pod5 or fast5 signal data 4 | + Basecalling in Duplex mode 5 | + Modified basecalling 6 | + Basecalling in real time 7 | + Output basecalled sequences in various formats: FASTQ, CRAM or unaligned BAM 8 | + If a reference is provided, basecalled reads will be aligned to it with 9 | `minimap2` and a sorted, indexed BAM or CRAM will be output instead of 10 | unaligned sequences.
11 | -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 256GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 64GB 10 | 11 | Approximate run time: Variable depending on coverage, genome size, model of choice and GPU model. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on the command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources; 8 | therefore, Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://docs.docker.com/get-started/) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository into the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-basecalling --help 33 | ``` 34 | To update a workflow to the latest version on the command line, use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-basecalling 38 | ``` 39 | 40 | A demo dataset is provided for testing the workflow. 41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/wf-basecalling-demo.tar.gz 44 | tar -xzvf wf-basecalling-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-basecalling \ 49 | --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' \ 50 | --dorado_ext 'pod5' \ 51 | --input 'wf-basecalling-demo/input' \ 52 | --ref 'wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta' \ 53 | --remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2' \ 54 | -profile standard 55 | ``` 56 | 57 | For further information about running a workflow on 58 | the command line, see https://labs.epi2me.io/wfquickstart/ 59 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow is designed to take signal data (FAST5 or POD5 files) produced by [Oxford Nanopore Technologies](https://nanoporetech.com/) devices.
4 | 5 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | This workflow accepts a folder containing FAST5 or POD5 files as input. 2 | The folder may contain other folders of FAST5 or POD5 files and all files will be processed by the workflow. 3 | 4 | -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | input | string | Directory containing FAST5 (or POD5) signal for basecalling. | This directory will be searched recursively. All FAST5 or POD5 files (depending on which extension you select in the Basecalling Options) in this directory or any subdirectory (no matter how deep) will be basecalled. | | 6 | | ref | string | Optional reference FASTA file to align basecalled reads to. | Without a reference, basecalls are output to unaligned CRAM. When using a reference, take care to retain this FASTA file as the output CRAM file cannot be read without the reference it was aligned to. | | 7 | 8 | 9 | ### Output Options 10 | 11 | | Nextflow parameter name | Type | Description | Help | Default | 12 | |--------------------------|------|-------------|------|---------| 13 | | out_dir | string | Directory for output of all files. | | output | 14 | | sample_name | string | Sample name to prefix file names of workflow outputs. | | SAMPLE | 15 | | output_fmt | string | Desired file format of files created by basecalling and alignment. | FASTQ can only be output when a reference has not been provided. Aligned output will always be written to CRAM unless BAM is selected. | cram | 16 | | igv | boolean | Visualize outputs in the EPI2ME IGV visualizer. | Enabling this option will visualize the output alignment files in the EPI2ME desktop app IGV visualizer. | False | 17 | 18 | 19 | ### Basecalling options 20 | 21 | | Nextflow parameter name | Type | Description | Help | Default | 22 | |--------------------------|------|-------------|------|---------| 23 | | basecaller_cfg | string | Name of the model to use for converting signal. | Required for basecalling. The model list only shows models that are compatible with this workflow. | | 24 | | duplex | boolean | Run the basecaller in duplex mode. | By default the workflow conducts simplex basecalling. If you used a chemistry and flowcell combination that supports duplex reads, you may switch this option on. This option is incompatible with the watch_path option due to the way the input files must be traversed in order to find duplex pairs. | False | 25 | | remora_cfg | string | Name of the model to use for calling modified bases. | Required for calling modified bases while basecalling. The model list only shows models that are compatible with this workflow. | | 26 | | dorado_ext | string | File extension for Dorado inputs. | Set this to fast5 if you have not converted your fast5 to pod5. It is recommended to [convert existing fast5 files to pod5 for use with Dorado](https://github.com/nanoporetech/pod5-file-format/blob/master/python/README.md#pod5-convert-from-fast5). 
| pod5 | 27 | | poly_a_config | string | Provide this TOML file to turn on and configure dorado poly(A) calling. | This TOML file allows you to turn on and configure poly(A) tail calling options in dorado. This feature is described [here](https://github.com/nanoporetech/dorado?tab=readme-ov-file#polya-tail-estimation). | | 28 | | barcode_kit | string | Name of the kit to use for barcoding. Demultiplex the data. | Providing a kit here will instruct the workflow to demultiplex your 'pass' data to BAM files, which can be found in your output directory under the folder 'demuxed' in a structure reminiscent of MinKNOW. | | 29 | 30 | 31 | ### Advanced basecalling options 32 | 33 | | Nextflow parameter name | Type | Description | Help | Default | 34 | |--------------------------|------|-------------|------|---------| 35 | | output_pod5 | boolean | Save the converted POD5 when running in duplex with FAST5 inputs. | Dorado duplex only supports POD5 input. The workflow will automatically convert FAST5 input to POD5 when duplex calling. By default, converted POD5 are deleted to save disk space. Enabling this option will make the workflow output converted POD5 files to a subfolder within the output directory. | False | 36 | | qscore_filter | number | Mean qscore by which to filter reads. Inclusive such that reads with score >= qscore_filter are kept. | The mean qscore of reads is calculated by dorado and rounded to an integer by dorado and stored as a tag in dorado's SAM output. The pipeline separates reads into pass and fail categories based on this SAM tag. | 10 | 37 | | cuda_device | string | GPU device to use for basecalling [cuda:all]. | For local execution this can be used to pin GPU tasks to one (or more) specific GPU devices. Use cuda:all to use all available GPU devices, or cuda:idx[,idx,...] where idx is an index number(s) of GPU device(s) to use. | cuda:all | 38 | | basecaller_model_path | string | Override the named basecalling model with a custom basecalling model. | For typical use, users should set --basecaller_cfg which will use a named model from inside the container. Experimental or custom basecallers will not be available in the container and can be loaded from the host with --basecaller_model_path. | | 39 | | remora_model_path | string | Override the named remora model with a custom remora model. | For typical use, users should set --remora_cfg which will use a named model from inside the container. Experimental or custom models will not be available in the container and can be loaded from the host with --remora_model_path. | | 40 | | basecaller_basemod_threads | number | Number of threads to use for base modification calling. | You must set this to > 0 when using a modbase aware model. Modbase calling does not require much additional CPU and should be set carefully when using GPU servers with a small number of CPUs per GPU. | 2 | 41 | | basecaller_args | string | Additional command line arguments to pass to the basecaller process. | | | 42 | | demux_args | string | Additional command line arguments to pass to the basecaller barcoding process. 
| | | 43 | 44 | 45 | ### Multiprocessing Options 46 | 47 | | Nextflow parameter name | Type | Description | Help | Default | 48 | |--------------------------|------|-------------|------|---------| 49 | | ubam_map_threads | integer | Set max number of threads to use for aligning reads from uBAM (limited by config executor cpus) | | 8 | 50 | | ubam_sort_threads | integer | Set max number of threads to use for sorting and indexing aligned reads from uBAM (limited by config executor cpus) | | 3 | 51 | | ubam_bam2fq_threads | integer | Set max number of threads to use for uncompressing uBAM and generating FASTQ for alignment (limited by config executor cpus) | | 1 | 52 | | merge_threads | integer | Set max number of threads to use for merging BAM files (limited by config executor cpus) | | 4 | 53 | | stats_threads | integer | Set max number of threads to use for getting stats from output files. (limited by config executor cpus) | | 4 | 54 | 55 | 56 | ### Real Time Analysis Options 57 | 58 | | Nextflow parameter name | Type | Description | Help | Default | 59 | |--------------------------|------|-------------|------|---------| 60 | | watch_path | boolean | Enable to continuously watch the input directory for new input files. Reads will be analysed as they appear. | This option enables the use of Nextflow's directory watching feature to constantly monitor input directories for new files. As soon as files are written by an external process Nextflow will begin analysing these files. The workflow will accumulate data over time to produce an updating report. Real time analysis of duplex data may lead to lower duplex rates than what would have been obtained by running basecalling after sequencing. | False | 61 | | read_limit | integer | Stop processing data when a particular number of reads have been analysed. | By default the workflow will run indefinitely when using the real time watch path option. This will set the upper bound on the number of reads that will be analysed before the workflow is automatically stopped and no more data is analysed. | | 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-basecalling-report.html | Report summarising the work done by the basecalling workflow | per-sample | 6 | | Simplex alignment file of passed reads | {{ alias }}.pass.simplex.{{ format }} | BAM or CRAM file of simplex reads for the sample that pass QC filtering. | per-sample | 7 | | Duplex alignment file of passed reads | {{ alias }}.pass.duplex.{{ format }} | BAM or CRAM file of duplex reads for the sample that pass QC filtering. Created if duplex basecalling is requested. | per-sample | 8 | | Simplex alignment file index of passed reads | {{ alias }}.pass.simplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the simplex reads that pass QC filtering. | per-sample | 9 | | Duplex alignment file index of passed reads | {{ alias }}.pass.duplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the duplex reads that pass QC filtering. 
Created if duplex basecalling is requested. | per-sample | 10 | | Simplex alignment file of failed reads | {{ alias }}.fail.simplex.{{ format }} | BAM or CRAM file of simplex reads for the sample that fail QC filtering. | per-sample | 11 | | Duplex alignment file of failed reads | {{ alias }}.fail.duplex.{{ format }} | BAM or CRAM file of duplex reads for the sample that fail QC filtering. Created if duplex basecalling is requested. | per-sample | 12 | | Simplex alignment file index of failed reads | {{ alias }}.fail.simplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the simplex reads that fail QC filtering. | per-sample | 13 | | Duplex alignment file index of failed reads | {{ alias }}.fail.duplex.{{ format }}.{{ index_format }} | The index of the resulting BAM or CRAM file with the duplex reads that fail QC filtering. Created if duplex basecalling is requested. | per-sample | 14 | | Index of the reference FASTA file | {{ ref }}.fai | Index of the reference FASTA file. | aggregated | 15 | | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reference genome. | aggregated | 16 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Prerequisites 2 | 3 | The workflow uses [Dorado](https://github.com/nanoporetech/dorado) for basecalling, which includes the use of [Remora](https://github.com/nanoporetech/remora) for modified basecalling. 4 | Basecalling with `Dorado` requires an NVIDIA GPU with [Pascal architecture or newer](https://www.nvidia.com/en-gb/technologies/) and at least 8 GB of vRAM. 5 | 6 | #### Windows 7 | 8 | Windows should not be considered a supported operating system for wf-basecalling as we do not directly support configuration of accelerated computing through WSL2 and Docker. 9 | Although we do not offer support, it is possible to set up Docker to use GPUs for most versions of Windows 11 and some versions of Windows 10, and we direct users to the [CUDA on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). 10 | Users should take note of the support constraints section to ensure their environment is suitable before following the guidance. **Do not install an NVIDIA driver into your WSL2 environment**. 11 | Users are encouraged to download Dorado for Windows from the [Dorado GitHub repository](https://github.com/nanoporetech/dorado#installation). 12 | 13 | #### MacOS 14 | 15 | MacOS should not be considered a supported operating system for wf-basecalling as we do not support accelerated computing through Docker on MacOS. 16 | On MacOS, GPU support through Docker remains in technical infancy. In addition, the containers we provide will not be able to leverage the M1 and M2 architecture and will not run as performantly as if Dorado had been run natively. 17 | Users are encouraged to download Dorado for MacOS directly from the [Dorado GitHub repository](https://github.com/nanoporetech/dorado#installation). 18 | 19 | #### Linux 20 | 21 | When using Docker for accelerated computing on Linux, you will need the `nvidia-container-toolkit` installed.
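A quick way to confirm that Docker can see the GPU is to run `nvidia-smi` inside a CUDA container (an illustrative check only; the image tag below is just an example and any CUDA base image available to you will do):

```
docker run --rm --gpus all nvidia/cuda:12.3.1-base-ubuntu22.04 nvidia-smi
```

If this prints the usual `nvidia-smi` device table, GPU passthrough is working.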
22 | If you observe the error "could not select device driver with capabilities gpu", you should follow the instructions to install `nvidia-container-toolkit` [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#setting-up-nvidia-container-toolkit). You will need to follow the steps to: 23 | 24 | - Set up the package repository and the GPG key (ignore the box about experimental releases) 25 | - Update package listings 26 | - Install nvidia-container-toolkit 27 | - Configure the Docker daemon to recognize the NVIDIA Container Runtime 28 | - Restart the Docker daemon to complete the installation after setting the default runtime 29 | 30 | By default, workflows are configured to run GPU tasks in serial. That is, only one basecalling task will be run at a time. This is to prevent the GPU from running out of memory on local execution. 31 | When running workflows on a cluster, or in a cloud where GPU resources are isolated from one another, users should specify `-profile discrete_gpus` as part of the command invocation. This will allow for parallel execution of GPU tasks. 32 | You should ask your system administrator if you need to configure any additional options to leverage GPUs on your cluster. For example, you may need to provide a special string to the workflow's `--cuda_device` option to ensure tasks use the GPU assigned to them by the job scheduler. 33 | 34 | ### 2. Choosing a model 35 | 36 | To select the relevant models for `--basecaller_cfg` and `--remora_cfg`, see the [table of available models](https://github.com/nanoporetech/dorado#available-basecalling-models) in the `dorado` repository. 37 | 38 | ### 3. Aligning to a reference 39 | 40 | The workflow can optionally align the basecalled data to a reference of choice (provided with the `--ref` option) using [minimap2](https://github.com/lh3/minimap2). 41 | Additionally, the workflow will generate an IGV configuration file. This file allows the user to view the filtered aligned BAM in the EPI2ME Desktop Application in the Viewer tab. 42 | 43 | ### 4. Duplex calling 44 | 45 | wf-basecalling supports [duplex calling](https://github.com/nanoporetech/dorado#duplex), which is enabled with the `--duplex` option. If you used a chemistry and flowcell combination that supported duplex reads, you should switch this option on. The resulting BAM/CRAM will be quality filtered and then automatically split into separate BAM/CRAM files for the simplex and duplex reads. 46 | Since `dorado duplex` requires the inputs to be in `pod5` format, the workflow will perform the conversion automatically using [pod5 convert fast5](https://github.com/nanoporetech/pod5-file-format/blob/master/python/pod5/README.md#pod5-convert-fast5). These files are normally deleted upon completion of the analysis, but can optionally be saved by the user by providing the `--output_pod5` option. 47 | 48 | ### 5. Real-time analysis 49 | 50 | wf-basecalling can perform basecalling as the pod5 files are generated. To enable this, provide the `--watch_path` option. The workflow will process the newly generated files as soon as they become available. 51 | 52 | ### 6. Barcode classification and demultiplexing 53 | 54 | wf-basecalling can demultiplex the data when the appropriate barcoding kit is provided with the `--barcode_kit` option. 55 | This will generate a new `{{ out_dir }}/demuxed` directory, with one subfolder for each barcode and one additional `unclassified` folder for reads that cannot be demultiplexed, as sketched below.
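For illustration only (the barcode folders present, and the file names within them, depend on your kit, data and sample name), the layout resembles:

```
{{ out_dir }}/demuxed/
├── barcode01/
├── barcode02/
├── ...
└── unclassified/
```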
This option is not available for `dorado duplex`. 56 | Please note that the demultiplexed reads will always be in BAM format, even when the user sets `--output_bam false`. -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | * Duplex mode with wf-basecalling is reliant on internal optimisations to organise input files for better duplex rates, which is not possible when using streaming basecalling; therefore duplex combined with the `--watch_path` option could lead to lower duplex rates than would be achieved by running the algorithm after sequencing is completed. 2 | * Renaming, moving or deleting the reference genome or the output directory from the location provided at runtime will prevent IGV from loading the data. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-basecalling/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 2 | -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args = [:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args = [:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters that have default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere.
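 *
 * As an illustrative sketch (the function and argument names here are
 * hypothetical, not taken from this repository), a library function
 * might then be written as:
 *
 *     def my_function(Map arguments) {
 *         def parser = new ArgumentParser(
 *             args: ["input_bam"] as Set,
 *             kwargs: ["threads": 4],
 *             name: "my_function")
 *         def opts = parser.parse_args(arguments)
 *         // opts.input_bam is required; opts.threads defaults to 4
 *     }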
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/NfcoreTemplate.groovy: -------------------------------------------------------------------------------- 1 | // 2 | // This file holds several functions used within the nf-core pipeline template. 3 | // 4 | 5 | // MIT License 6 | // 7 | // Copyright (c) 2018 nf-core 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | // copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 
18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | import org.yaml.snakeyaml.Yaml 29 | 30 | class NfcoreTemplate { 31 | 32 | // 33 | // Check AWS Batch related parameters have been specified correctly 34 | // 35 | public static void awsBatch(workflow, params) { 36 | if (workflow.profile.contains('awsbatch')) { 37 | // Check params.awsqueue and params.awsregion have been set if running on AWSBatch 38 | assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 39 | // Check outdir paths to be S3 buckets if running on AWSBatch 40 | assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" 41 | } 42 | } 43 | 44 | // 45 | // Check params.hostnames 46 | // 47 | public static void hostName(workflow, params, log) { 48 | Map colors = logColours(params.monochrome_logs) 49 | if (params.hostnames) { 50 | try { 51 | def hostname = "hostname".execute().text.trim() 52 | params.hostnames.each { prof, hnames -> 53 | hnames.each { hname -> 54 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 55 | log.info "=${colors.yellow}====================================================${colors.reset}=\n" + 56 | "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + 57 | " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + 58 | " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + 59 | "=${colors.yellow}====================================================${colors.reset}=" 60 | } 61 | } 62 | } 63 | } catch (Exception e) { 64 | log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 65 | } 66 | } 67 | } 68 | 69 | // 70 | // Generate version string 71 | // 72 | public static String version(workflow) { 73 | String version_string = "" 74 | 75 | if (workflow.manifest.version) { 76 | def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' 77 | version_string += "${prefix_v}${workflow.manifest.version}" 78 | } 79 | 80 | if (workflow.commitId) { 81 | def git_shortsha = workflow.commitId.substring(0, 7) 82 | version_string += "-g${git_shortsha}" 83 | } 84 | 85 | return version_string 86 | } 87 | 88 | // 89 | // Construct and send completion email 90 | // 91 | public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_mapped_reads=[:]) { 92 | 93 | // Set up the e-mail variables 94 | def subject = "[$workflow.manifest.name] Successful: $workflow.runName" 95 | if (fail_mapped_reads.size() > 0) { 96 | subject = "[$workflow.manifest.name] Partially successful (${fail_mapped_reads.size()} skipped): $workflow.runName" 97 | } 98 | if (!workflow.success) { 99 | subject = "[$workflow.manifest.name] FAILED: $workflow.runName" 100 | } 101 | 102 | def summary = [:] 103 | for (group in summary_params.keySet()) { 104 | summary << summary_params[group] 105 | } 106 | 107 | def misc_fields = [:] 108 | misc_fields['Date Started'] = workflow.start 109 | misc_fields['Date Completed'] = workflow.complete 110 | misc_fields['Pipeline script file path'] = workflow.scriptFile 111 | misc_fields['Pipeline script hash ID'] = workflow.scriptId 112 | if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository 113 | if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId 114 | if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision 115 | misc_fields['Nextflow Version'] = workflow.nextflow.version 116 | misc_fields['Nextflow Build'] = workflow.nextflow.build 117 | misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 118 | 119 | def email_fields = [:] 120 | email_fields['version'] = NfcoreTemplate.version(workflow) 121 | email_fields['runName'] = workflow.runName 122 | email_fields['success'] = workflow.success 123 | email_fields['dateComplete'] = workflow.complete 124 | email_fields['duration'] = workflow.duration 125 | email_fields['exitStatus'] = workflow.exitStatus 126 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 127 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 128 | email_fields['commandLine'] = workflow.commandLine 129 | email_fields['projectDir'] = workflow.projectDir 130 | email_fields['summary'] = summary << misc_fields 131 | email_fields['fail_mapped_reads'] = fail_mapped_reads.keySet() 132 | email_fields['min_mapped_reads'] = params.min_mapped_reads 133 | 134 | // On success try attach the multiqc report 135 | def mqc_report = null 136 | try { 137 | if (workflow.success && !params.skip_multiqc) { 138 | mqc_report = multiqc_report.getVal() 139 | if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { 140 | if (mqc_report.size() > 1) { 141 | log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" 142 | } 143 | mqc_report = mqc_report[0] 144 | } 145 | } 146 | } catch (all) { 147 | if (multiqc_report) { 148 | log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" 149 | } 150 | } 151 | 152 | // Check if we are only sending emails on failure 153 | def email_address = params.email 154 | if (!params.email && params.email_on_fail && !workflow.success) { 155 | email_address = params.email_on_fail 156 | } 157 | 158 | // Render the TXT template 159 | def engine = new groovy.text.GStringTemplateEngine() 160 | def tf = new File("$projectDir/assets/email_template.txt") 161 | def 
txt_template = engine.createTemplate(tf).make(email_fields) 162 | def email_txt = txt_template.toString() 163 | 164 | // Render the HTML template 165 | def hf = new File("$projectDir/assets/email_template.html") 166 | def html_template = engine.createTemplate(hf).make(email_fields) 167 | def email_html = html_template.toString() 168 | 169 | // Render the sendmail template 170 | def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit 171 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] 172 | def sf = new File("$projectDir/assets/sendmail_template.txt") 173 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 174 | def sendmail_html = sendmail_template.toString() 175 | 176 | // Send the HTML e-mail 177 | Map colors = logColours(params.monochrome_logs) 178 | if (email_address) { 179 | try { 180 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 181 | // Try to send HTML e-mail using sendmail 182 | [ 'sendmail', '-t' ].execute() << sendmail_html 183 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" 184 | } catch (all) { 185 | // Catch failures and try with plaintext 186 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 187 | if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { 188 | mail_cmd += [ '-A', mqc_report ] 189 | } 190 | mail_cmd.execute() << email_html 191 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" 192 | } 193 | } 194 | 195 | // Write summary e-mail HTML to a file 196 | def output_d = new File("${params.outdir}/pipeline_info/") 197 | if (!output_d.exists()) { 198 | output_d.mkdirs() 199 | } 200 | def output_hf = new File(output_d, "pipeline_report.html") 201 | output_hf.withWriter { w -> w << email_html } 202 | def output_tf = new File(output_d, "pipeline_report.txt") 203 | output_tf.withWriter { w -> w << email_txt } 204 | } 205 | 206 | // 207 | // Print pipeline summary on completion 208 | // 209 | public static void summary(workflow, params, log, fail_mapped_reads=[:], pass_mapped_reads=[:]) { 210 | Map colors = logColours(params.monochrome_logs) 211 | 212 | if (pass_mapped_reads.size() > 0) { 213 | def idx = 0 214 | def samp_aln = '' 215 | def total_aln_count = pass_mapped_reads.size() + fail_mapped_reads.size() 216 | for (samp in pass_mapped_reads) { 217 | samp_aln += " ${samp.value}: ${samp.key}\n" 218 | idx += 1 219 | if (idx > 5) { 220 | samp_aln += " ..see pipeline reports for full list\n" 221 | break; 222 | } 223 | } 224 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_mapped_reads.size()}/$total_aln_count samples passed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 225 | } 226 | if (fail_mapped_reads.size() > 0) { 227 | def samp_aln = '' 228 | for (samp in fail_mapped_reads) { 229 | samp_aln += " ${samp.value}: ${samp.key}\n" 230 | } 231 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_mapped_reads.size()} samples skipped since they failed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 232 | } 233 | 234 | if (workflow.success) { 235 | if (workflow.stats.ignoredCount == 0) { 236 | log.info 
"-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" 237 | } else { 238 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" 239 | } 240 | } else { 241 | hostName(workflow, params, log) 242 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" 243 | } 244 | } 245 | 246 | // 247 | // ANSII Colours used for terminal logging 248 | // 249 | public static Map logColours(Boolean monochrome_logs) { 250 | Map colorcodes = [:] 251 | 252 | // Reset / Meta 253 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 254 | colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" 255 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 256 | colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" 257 | colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" 258 | colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" 259 | colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" 260 | 261 | // Regular Colors 262 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 263 | colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" 264 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 265 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 266 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 267 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 268 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 269 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 270 | 271 | // Bold 272 | colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" 273 | colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" 274 | colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" 275 | colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" 276 | colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" 277 | colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" 278 | colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" 279 | colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" 280 | 281 | // Underline 282 | colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" 283 | colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" 284 | colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" 285 | colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" 286 | colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" 287 | colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" 288 | colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" 289 | colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" 290 | 291 | // High Intensity 292 | colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" 293 | colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" 294 | colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" 295 | colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" 296 | colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" 297 | colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" 298 | colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" 299 | colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" 300 | 301 | // Bold High Intensity 302 | colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" 303 | colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" 304 | colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" 305 | colorcodes['biyellow'] = monochrome_logs ? 
'' : "\033[1;93m" 306 | colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" 307 | colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" 308 | colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" 309 | colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" 310 | 311 | return colorcodes 312 | } 313 | 314 | // 315 | // Does what is says on the tin 316 | // 317 | public static String dashedLine(monochrome_logs) { 318 | Map colors = logColours(monochrome_logs) 319 | return "${colors.dim}--------------------------------------------------------------------------------${colors.reset}" 320 | } 321 | 322 | // epi2me-labs logo 323 | public static String logo(workflow, monochrome_logs) { 324 | Map colors = NfcoreTemplate.logColours(monochrome_logs) 325 | String workflow_name = workflow.manifest.name.split("/")[1] 326 | String workflow_version = version(workflow) 327 | String.format( 328 | """ 329 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}_____ ____ ___ ____ __ __ _____ 330 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}| ____| _ \\_ _|___ \\| \\/ | ____| 331 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| _| | |_) | | __) | |\\/| | _| 332 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| |___| __/| | / __/| | | | |__ 333 | ${colors.iblue}|||||||||| ${colors.reset}${colors.dim}|_____|_| |___|_____|_| |_|_____| 334 | ${colors.iblue}|||||||||| ${colors.reset}${colors.bold}${workflow_name} ${workflow_version}${colors.reset} 335 | ${NfcoreTemplate.dashedLine(monochrome_logs)} 336 | """.stripIndent() 337 | ) 338 | } 339 | } 340 | 341 | 342 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // OS 52 | // TODO check version on WSL 53 | def opsys = System.properties['os.name'].toLowerCase() 54 | def opver = System.properties['os.version'] 55 | if (opver.toLowerCase().contains("wsl")){ 56 | opsys = "wsl" 57 | } 58 | 59 | // placeholder for any future okta business 60 | // for now we'll use the guest_ sent to wf.epi2me_user 61 | def user = get_meta(params.wf, "epi2me_user") 62 | 63 | // drop cruft to save some precious bytes 64 | // affects the deep copy rather than original params 65 | clean_meta(params_data, [ 66 | "schema_ignore_params", 67 | ]) 68 | def ingress_ids = [] 69 | if (params_data.containsKey("wf")) { 70 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 71 | clean_meta(params_data.wf, [ 72 | "agent", // we send this later 73 | "epi2me_instance", // we send this later 74 | "epi2me_user", // we send this later 75 | "example_cmd", 76 | "ingress.run_ids", // we will send this elsewhere 77 | ]) 78 | } 79 | 80 | // try and get runtime information 81 | def cpus = null 82 | try { 83 | cpus = Runtime.getRuntime().availableProcessors() 84 | } 85 | catch(Exception e) {} 86 | 87 | def workflow_success = null 88 | def workflow_exitcode = null 89 | if (event != "start") { 90 | workflow_success = workflow.success 91 | workflow_exitcode = workflow.exitStatus 92 | } 93 | 94 | /// build message 95 | def body_json = new JsonBuilder() 96 | body_json \ 97 | "tracking_id": [ 98 | "msg_id": UUID.randomUUID().toString(), 99 | "version": "3.0.1" 100 | ], 101 | "source": "workflow", 102 | "event": event, 103 | "params": params_data, 104 | // data will be null on start events, as ingress has not run 105 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 106 | "workflow": [ 107 | "name": workflow.manifest.name, 108 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 109 | "run_name": workflow.runName, // required to disambiguate sessions 110 | "session": workflow.sessionId, 111 | "profile": workflow.profile, 112 | "resume": workflow.resume, 113 | "error": error_message, // null if no error 114 | "success": workflow_success, 115 | "exitcode": workflow_exitcode, 116 | ], 117 | "env": [ 118 | "user": user, // placeholder for any future okta 119 | "os": [ 120 | "name": opsys, 121 | "version": opver 122 | ], 123 | "resource": [ 124 | "cpus": cpus, 125 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 126 | ], 127 | "agent": get_meta(params.wf, "agent"), // access via original params 128 | "epi2me": [ 129 | "instance": get_meta(params.wf, "epi2me_instance"), 130 | "user": user, 131 | ], 132 | "nextflow": [ 133 | "version": nextflow.version.toString(), 134 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 135 | ] 136 | ] 137 | return body_json 138 | } 139 | 140 | // Send a JSON payload to a given endpoint 141 | private static String send_ping_post(endpoint, body_json) { 142 | // Attempt to send payload and absorb any possible Exception gracefully 143 | String postResult 144 | boolean raise_exception = false 145 | try { 146 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 147 | requestMethod = 'POST' 148 | doOutput = true 149 | setConnectTimeout(5000) 150 | setReadTimeout(10000) 151 | setRequestProperty('Content-Type', 'application/json') 152 | setRequestProperty('accept', 'application/json') 153 | outputStream.withPrintWriter({printWriter -> 154 | printWriter.write(body_json.toString()) 155 | }) 156 | 157 | // Rethrow exceptions that imply we're not using this endpoint properly 158 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 159 | raise_exception = true 160 | } 161 | // Accessing inputStream.text will raise an Exception for failed requests 162 | postResult = inputStream.text 163 | }) 164 | } 165 | catch(Exception e) { 166 | if(raise_exception) { throw e } 167 | } 168 | return (postResult) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? 
"--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-basecalling/84442a77c74646c8971342d2ef80f124b5aeaf31/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /lib/reference.nf: -------------------------------------------------------------------------------- 1 | // Argument parser 2 | Map parse_reference(Map arguments) { 3 | ArgumentParser parser = new ArgumentParser( 4 | args:[ 5 | "input_ref", 6 | ], 7 | kwargs:[ 8 | "output_cache": false, 9 | "output_mmi": false, 10 | ], 11 | name: "reference_ingress") 12 | return parser.parse_args(arguments) 13 | } 14 | 15 | // Process to generate the CRAM cache and 16 | // create the REF_PATH variable 17 | process cram_cache { 18 | label "wf_common" 19 | cpus 1 20 | memory 4.GB 21 | input: 22 | path reference 23 | output: 24 | tuple path("ref_cache/"), env(REF_PATH), emit: ref_cache 25 | shell: 26 | ''' 27 | # Invoke from binary installed to container PATH 28 | seq_cache_populate.pl -root ref_cache/ !{reference} 29 | REF_PATH="ref_cache/%2s/%2s/%s" 30 | ''' 31 | } 32 | 33 | // Process to create the faidx index 34 | process faidx { 35 | label "wf_common" 36 | cpus 1 37 | memory 4.GB 38 | input: 39 | path(ref) 40 | output: 41 | path("${ref}.fai") 42 | script: 43 | """ 44 | samtools faidx ${ref} 45 | """ 46 | } 47 | 48 | // Decompress the reference genome, if it is compressed 49 | // NOTE -f required to compress symlink 50 | process decompress_ref { 51 | label "wf_common" 52 | cpus 1 53 | memory 4.GB 54 | input: 55 | file compressed_ref 56 | output: 57 | path "${compressed_ref.baseName}", emit: decompressed_ref 58 | """ 59 | gzip -df ${compressed_ref} 60 | """ 61 | } 62 | 63 | // Prepare minimap2 .mmi index 64 | process make_mmi { 65 | // Minimap2 is not available in wf_common 66 | label "wf_basecalling" 67 | cpus 4 68 | memory 16.GB 69 | input: 70 | path(ref) 71 | output: 72 | path("ref.mmi") 73 | script: 74 | """ 75 | minimap2 -t ${task.cpus} -x lr:hq -d ref.mmi ${ref} 76 | """ 77 | } 78 | 79 | 80 | // Workflow to prepare the reference genome and its indexes. 81 | workflow prepare_reference { 82 | take: 83 | arguments 84 | main: 85 | Map margs = parse_reference(arguments) 86 | 87 | // Base ref channel 88 | ref = Channel.fromPath(margs.input_ref) 89 | 90 | // Check ref and decompress if needed 91 | // gzipped ref not supported by some downstream tools (e.g. 
cram_cache) 92 | // easier to just decompress and pass it around rather than confusing the user 93 | is_compressed = margs.input_ref.toLowerCase().endsWith("gz") 94 | if (is_compressed) { 95 | ref = decompress_ref(ref) 96 | } 97 | 98 | // Generate fai index if the file is either compressed, or if fai doesn't exists 99 | if (!is_compressed && file("${margs.input_ref}.fai").exists()){ 100 | ref_idx = Channel.fromPath("${margs.input_ref}.fai") 101 | } else { 102 | ref_idx = faidx(ref) 103 | } 104 | 105 | // Generate CRAM cache 106 | if (margs.output_cache){ 107 | cram_cache(ref) 108 | ref_cache = cram_cache.out.ref_cache 109 | } else { 110 | ref_cache = null 111 | } 112 | 113 | // Generate mmi index 114 | if (margs.output_mmi){ 115 | ref_mmi = make_mmi(ref) 116 | } else { 117 | ref_mmi = null 118 | } 119 | 120 | // Run collect on the outputs, allowing to treat them as value channels, and avoiding 121 | // conflicts with other queue channels downstream. 122 | emit: 123 | ref = ref | collect 124 | ref_idx = ref_idx | collect 125 | ref_cache = ref_cache | collect 126 | ref_mmi = ref_mmi | collect 127 | } 128 | -------------------------------------------------------------------------------- /lib/signal/merge.nf: -------------------------------------------------------------------------------- 1 | 2 | // this process is shared by both the uCRAM and CRAM arms of the basecalling workflow 3 | // for uCRAM the staged ref is the OPTIONAL_FILE, so we withhold the ref arg 4 | process merge_calls { 5 | label "wf_basecalling" 6 | cpus params.merge_threads 7 | memory 16.GB 8 | input: 9 | path(ref) 10 | path(crams, stageAs: "filtered_*.cram") 11 | val(filetag) 12 | tuple val(align_ext), val(index_ext) // either [bam, bai] or [cram, crai] 13 | output: 14 | tuple path("${params.sample_name}.${filetag}.${align_ext}"), path("${params.sample_name}.${filetag}.${align_ext}.${index_ext}") 15 | script: 16 | def ref_arg = ref.name != "OPTIONAL_FILE" ? 
"--reference ${ref}" : "" 17 | """ 18 | samtools merge -c -p "${params.sample_name}.${filetag}.${align_ext}##idx##${params.sample_name}.${filetag}.${align_ext}.${index_ext}" ${crams} --no-PG -O ${align_ext} --write-index ${ref_arg} --threads ${task.cpus} 19 | """ 20 | } 21 | 22 | process merge_calls_to_fastq { 23 | label "wf_basecalling" 24 | cpus { params.merge_threads + params.ubam_bam2fq_threads } 25 | memory 16.GB 26 | input: 27 | path(crams) 28 | val(filetag) 29 | output: 30 | path("${params.sample_name}.${filetag}.fq.gz") 31 | script: 32 | """ 33 | samtools merge -c -p ${crams} --no-PG -O CRAM -@ ${params.merge_threads} -o - | samtools bam2fq -T 1 -@ ${params.ubam_bam2fq_threads} -0 ${params.sample_name}.${filetag}.fq.gz - 34 | """ 35 | } 36 | -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | import nextflow.util.BlankSeparatedList 4 | 5 | nextflow.enable.dsl = 2 6 | 7 | include { wf_dorado } from './lib/signal/ingress' 8 | include { 9 | configure_igv; 10 | getParams } from './lib/common' 11 | include { prepare_reference } from './lib/reference' 12 | nextflow.preview.recursion=true 13 | 14 | process getVersions { 15 | label "wf_basecalling" 16 | cpus 1 17 | output: 18 | path "versions.txt" 19 | script: 20 | """ 21 | dorado --version 2>&1 | head -n1 | sed 's/^/dorado,/' >> versions.txt 22 | minimap2 --version | head -n 1 | sed 's/^/minimap2,/' >> versions.txt 23 | """ 24 | } 25 | 26 | 27 | process bamstats { 28 | label "wf_common" 29 | cpus params.stats_threads 30 | input: 31 | path "input.cram" // chunks are always CRAM 32 | tuple path(ref_cache), env(REF_PATH) 33 | 34 | output: 35 | path "bamstats.tsv", emit: stats 36 | path "stats.${task.index}.json", emit: json 37 | script: 38 | """ 39 | bamstats --threads=${task.cpus} -u input.cram > bamstats.tsv 40 | fastcat_histogram.py \ 41 | --sample_id "${params.sample_name}" \ 42 | bamstats.tsv "stats.${task.index}.json" 43 | """ 44 | } 45 | 46 | 47 | // Scan step for accumulating fastcat stats 48 | // 49 | // Nextflow scan does a silly thing where it feeds back the growing list of 50 | // historical outputs. We only ever need the most recent output (the "state"). 51 | process progressive_stats { 52 | label "wf_common" 53 | maxForks 1 54 | cpus 1 55 | input: 56 | path fastcat_stats 57 | output: 58 | path("all_stats.${task.index}") 59 | script: 60 | def new_input = fastcat_stats instanceof BlankSeparatedList ? fastcat_stats.first() : fastcat_stats 61 | def state = fastcat_stats instanceof BlankSeparatedList ? fastcat_stats.last() : "NOSTATE" 62 | def output = "all_stats.${task.index}" 63 | """ 64 | touch "${state}" 65 | add_jsons.py "${new_input}" "${state}" "${output}" 66 | """ 67 | } 68 | 69 | 70 | // Split simplex reads belonging to a pair 71 | process split_xam { 72 | label "wf_common" 73 | cpus 2 74 | input: 75 | tuple path(xam), path(xam_index) 76 | tuple val(align_ext), val(index_ext) 77 | path ref 78 | output: 79 | tuple path("${xam.baseName}.duplex.${align_ext}"), path("${xam.baseName}.duplex.${align_ext}.${index_ext}"), emit: xam_dx 80 | tuple path("${xam.baseName}.simplex.${align_ext}"), path("${xam.baseName}.simplex.${align_ext}.${index_ext}"), emit: xam_sx 81 | script: 82 | String reference = ref.name.startsWith('OPTIONAL_FILE') ? 
'' : "--reference ${ref}" 83 | """ 84 | samtools view ${reference} \ 85 | -@ ${task.cpus} \ 86 | -O ${align_ext} \ 87 | --tag dx:-1 \ 88 | --unoutput ${xam.baseName}.duplex.${align_ext} \ 89 | -o ${xam.baseName}.simplex.${align_ext} \ 90 | ${xam} 91 | samtools index ${xam.baseName}.simplex.${align_ext} 92 | samtools index ${xam.baseName}.duplex.${align_ext} 93 | """ 94 | } 95 | 96 | 97 | // Compute pairing statistics progressively, if duplex enabled 98 | process pair_stats { 99 | label "wf_common" 100 | cpus 1 101 | input: 102 | path cram // chunks are always CRAM 103 | tuple path(ref_cache), env(REF_PATH) 104 | output: 105 | path("pairs.${task.index}.csv"), emit: csv 106 | script: 107 | """ 108 | duplex_stats.py ${cram} pairs.${task.index}.csv 109 | """ 110 | } 111 | 112 | 113 | process progressive_pairings { 114 | label "wf_common" 115 | maxForks 1 116 | cpus 1 117 | input: 118 | path pairings 119 | output: 120 | path("pairing_stats.${task.index}") 121 | script: 122 | // By passing around a directory 123 | // The state file within it will be a symlink containing the latest cumulative data 124 | // eg. ls -l will look like this 125 | // pairing_stats.1 -> /work/ab/xyz/pairing_stats.1 126 | // pairing_stats.2 -> /work/ab/xyz/pairing_stats.1 127 | // pairing_stats.3 -> /work/ab/xyz/pairing_stats.1 128 | def new_input = pairings instanceof BlankSeparatedList ? pairings.first() : pairings 129 | def state = pairings instanceof BlankSeparatedList ? pairings.last() : "NOSTATE" 130 | def new_state = "pairing_stats.${task.index}" 131 | def new_file = "pairing_stats.new" 132 | // n.b where this is used below the files will have been moved, hence new_state 133 | def dynamic_input = "${new_state}/sample.pairings_stats" 134 | """ 135 | # If first iteration create empty directory 136 | if [[ "${task.index}" == "1" ]]; then 137 | mkdir "${state}" 138 | fi 139 | # cp to another new folder 140 | cp -r "${state}" "${new_state}" 141 | # Create a new file with headers 142 | echo "Filename,Duplex,Paired,Simplex" > ${new_file} 143 | # If dynamic_input exists, save it to new_file 144 | if [ -f $dynamic_input ]; then 145 | # append everything from the old state file in to the new file 146 | # skip header with 'FNR>1' as already added above 147 | awk 'FNR>1' "${dynamic_input}" >> ${new_file} 148 | fi 149 | # append everything from the latest input file in to the new file 150 | awk 'FNR>1' ${new_input} >> ${new_file} 151 | # the new file now becomes the next state to be output 152 | mv "${new_file}" "${dynamic_input}" 153 | """ 154 | } 155 | 156 | 157 | // Make reports 158 | process makeReport { 159 | label "wf_common" 160 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 161 | input: 162 | path per_read_stats 163 | path pairings 164 | path "versions/*" 165 | path "params.json" 166 | output: 167 | path "wf-basecalling-*.html" 168 | script: 169 | String report_name = "wf-basecalling-report.html" 170 | def report_pairings = params.duplex ? "--pairings ${pairings}/*" : "" 171 | """ 172 | report.py $report_name \ 173 | --sample_name $params.sample_name \ 174 | --versions versions \ 175 | --stats $per_read_stats \ 176 | --params params.json \ 177 | --workflow_version ${workflow.manifest.version} \ 178 | $report_pairings 179 | """ 180 | } 181 | 182 | 183 | // watch path stop condition, if params.read_limit is met will inject a stop file in to input folder. 
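// Note on how the pieces fit together: the cumulative JSON emitted by the
// progressive_stats process above (invoked with .scan in the workflow body further down)
// is what the process below receives. It sums the per-sample "total_reads" counts and,
// once the total crosses params.read_limit, drops a STOP file into the watched input
// directory so the watch-path ingress (presumably lib/signal/ingress.nf) knows to stop.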
184 | process stopCondition { 185 | label "wf_common" 186 | cpus 1 187 | publishDir params.input, mode: 'copy', pattern: "*" 188 | input: 189 | path json 190 | val (stop_filename) 191 | output: 192 | path "${stop_filename}", optional: true, emit: stop 193 | script: 194 | int threshold = params.read_limit 195 | """ 196 | #!/usr/bin/env python 197 | import json 198 | from pathlib import Path 199 | with open("$json") as json_file: 200 | state = json.load(json_file) 201 | total = 0 202 | for k,v in state.items(): 203 | total += v["total_reads"] 204 | if total >= $threshold: 205 | p = Path("$stop_filename") 206 | p.touch(exist_ok=False) 207 | """ 208 | } 209 | 210 | 211 | // See https://github.com/nextflow-io/nextflow/issues/1636 212 | // This is the only way to publish files from a workflow whilst 213 | // decoupling the publish from the process steps. 214 | process output_stream { 215 | // publish inputs to output directory 216 | label "wf_basecalling" 217 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 218 | input: 219 | path fname 220 | output: 221 | path fname 222 | """ 223 | echo "Writing output files." 224 | """ 225 | } 226 | 227 | // Output the last report, once each of them finish 228 | process output_last { 229 | // publish inputs to output directory 230 | label "wf_basecalling" 231 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 232 | input: 233 | path fname 234 | output: 235 | path fname 236 | """ 237 | echo "Writing output files." 238 | """ 239 | } 240 | 241 | // CW-2569: Emit pod5s if requested, in a new directory 242 | process output_pod5s { 243 | // publish inputs to output directory 244 | label "wf_basecalling" 245 | publishDir "${params.out_dir}/pod5s/", mode: 'copy', pattern: "*" 246 | input: 247 | path pod5s 248 | output: 249 | path pod5s 250 | """ 251 | echo "Writing output files." 252 | """ 253 | } 254 | 255 | 256 | // entrypoint workflow 257 | WorkflowMain.initialise(workflow, params, log) 258 | workflow { 259 | 260 | Map colors = NfcoreTemplate.logColours(params.monochrome_logs) 261 | 262 | Pinguscript.ping_start(nextflow, workflow, params) 263 | 264 | // Basecall 265 | // Ensure basecaller config is set 266 | if (!params.basecaller_cfg && !params.basecaller_model_path) { 267 | throw new Exception(colors.red + "You must provide a basecaller profile with --basecaller_cfg " + colors.reset) 268 | } 269 | if (params.duplex && params.output_fmt == "fastq") { 270 | throw new Exception(colors.red + "Duplex requires the outputs of Dorado to be in BAM format." + colors.reset) 271 | } 272 | if (params.ref && params.output_fmt == "fastq") { 273 | log.warn("Alignment will output data in BAM format and ignore `--output_fmt fastq`.") 274 | } 275 | if (params.basecaller_cfg && params.basecaller_model_path) { 276 | log.warn("--basecaller_cfg and --basecaller_model_path both provided. Custom remora model path (${params.basecaller_cfg}) will override enum choice (${params.basecaller_model_path}).") 277 | } 278 | if (params.remora_cfg && params.remora_model_path) { 279 | log.warn("--remora_cfg and --remora_model_path both provided. Custom remora model path (${params.remora_model_path}) will override enum choice (${params.remora_cfg}).") 280 | } 281 | if (params.duplex && params.dorado_ext != 'pod5') { 282 | log.warn("Duplex currently requires POD5 files and is not compatible with FAST5. 
The workflow will convert the FAST5 inputs to POD5 format automatically.") 283 | } 284 | if (params.duplex && params.barcode_kit) { 285 | throw new Exception(colors.red + "Duplex does not support barcoded data." + colors.reset) 286 | } 287 | if (params.igv && (!params.ref || params.output_fmt == 'fastq' )){ 288 | log.warn("IGV configuration works only for aligned BAM/CRAM outputs. Please provide a reference with `--ref`, and request either cram or bam output with `--output_fmt`.") 289 | } 290 | 291 | // Ensure modbase threads are set if calling them 292 | if (params.remora_cfg || params.remora_model_path) { 293 | if (params.basecaller_basemod_threads == 0) { 294 | throw new Exception(colors.red + "--remora_cfg modbase aware config requires setting --basecaller_basemod_threads > 0" + colors.reset) 295 | } 296 | } 297 | 298 | // 299 | if (params.use_bonito) { 300 | log.warn("Using bonito for basecalling, bonito is an experimental feature for which no support is entertained.") 301 | if (!params.experimental) { 302 | error "Use of bonito is locked behind the `--experimental` option." 303 | } 304 | } 305 | 306 | // Prepare the reference genome 307 | Boolean run_alignment = false 308 | if (params.ref) { 309 | prepare_reference([ 310 | "input_ref": params.ref, 311 | "output_mmi": true, 312 | "output_cache": true 313 | ]) 314 | ref = prepare_reference.out.ref 315 | ref_cache = prepare_reference.out.ref_cache 316 | ref_fai = prepare_reference.out.ref_idx 317 | ref_mmi = prepare_reference.out.ref_mmi 318 | run_alignment = true 319 | } else { 320 | ref = Channel.fromPath("${projectDir}/data/OPTIONAL_FILE") | collect 321 | ref_cache = Channel.of([file("${projectDir}/data/OPTIONAL_FILE"), null]) | collect 322 | ref_fai = Channel.empty() 323 | ref_mmi = Channel.empty() 324 | } 325 | 326 | // ring ring it's for you 327 | basecaller_out = wf_dorado([ 328 | "input_path": params.input, 329 | "input_ref": ref, 330 | "input_mmi": ref_mmi, 331 | "input_cache": ref_cache, 332 | "run_alignment": run_alignment, 333 | "basecaller_model_name": params.use_bonito ? params.bonito_cfg : params.basecaller_cfg, 334 | "remora_model_name": params.remora_cfg, 335 | "basecaller_model_path": params.basecaller_model_path, 336 | "remora_model_path": params.remora_model_path, 337 | "watch_path": params.watch_path, 338 | "output_fmt": params.output_fmt, 339 | "dorado_ext": params.dorado_ext, 340 | "poly_a_config": params.poly_a_config, 341 | "qscore_filter": params.qscore_filter 342 | ]) 343 | software_versions = getVersions() 344 | workflow_params = getParams() 345 | 346 | // stream stats for report 347 | stat = bamstats(basecaller_out.chunked_pass_crams, ref_cache) 348 | stats = progressive_stats.scan(stat.json) 349 | 350 | // stream pair stats for report 351 | // use first() to coerce this to a value channel 352 | pairings = Channel.fromPath("${projectDir}/data/OPTIONAL_FILE", checkIfExists: true).first() 353 | if (params.duplex){ 354 | // Separate the simplex reads belonging to a pair from the 355 | // duplex and simplex reads. 356 | // Save the simplex reads in a duplex in a separate xam file. 
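        // For context: dorado's duplex caller tags reads with a `dx` BAM tag — by
        // convention dx:1 for duplex reads, dx:0 for ordinary simplex reads, and dx:-1
        // for simplex reads that were consumed as parents of a duplex pair (these values
        // are an assumption from dorado's behaviour; they are not defined anywhere in
        // this repository). split_xam, defined above, splits each pass/fail file on
        // `samtools view --tag dx:-1`, so the paired simplex parents land in a separate
        // XAM from everything else.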
357 | split_xam( 358 | basecaller_out.pass.concat( 359 | basecaller_out.fail 360 | ), 361 | basecaller_out.output_exts, 362 | ref 363 | ) 364 | 365 | // Create emission channel 366 | emit_xam = split_xam.out.xam_dx.flatten() 367 | .concat(split_xam.out.xam_sx.flatten()) 368 | .concat(basecaller_out.summary) 369 | 370 | // Then, compute the stats on the duplex 371 | pairs = pair_stats(basecaller_out.chunked_pass_crams, ref_cache) 372 | pairings = progressive_pairings.scan(pairs.csv) 373 | } else { 374 | emit_xam = basecaller_out.pass.flatten() 375 | .concat(basecaller_out.fail.flatten()) 376 | } 377 | // Make the report 378 | report = makeReport(stats, pairings, software_versions, workflow_params) | last | collect | output_last 379 | 380 | // Create IGV if the reference genome is passed 381 | if (params.ref && params.igv && params.output_fmt!='fastq'){ 382 | // Create temporary channel of FASTA + FAI 383 | ref_ch = ref 384 | | combine( 385 | ref_fai 386 | ) 387 | 388 | igv_files = ref_ch 389 | // Use full path of the input reference, allowing to not emit the reference 390 | | map{ 391 | fna, fai -> 392 | // If the FASTA is compressed, then it should start with the work dir path, and therefore is emitted 393 | String fna_path = fna.startsWith("${workflow.workDir}") ? "${fna.name}" : "${fna.toUriString()}" 394 | // Same for the FAIDX 395 | String fai_path = fai.startsWith("${workflow.workDir}") ? "${fai.name}" : "${fai.toUriString()}" 396 | [fna_path, fai_path] 397 | } 398 | // We only show the pass BAM files as tracks. 399 | | concat ( 400 | basecaller_out.pass | map{ it -> "${it.Name}" } 401 | ) 402 | | flatten 403 | | collectFile(name: "file-names.txt", newLine: true, sort: false) 404 | igv_conf = configure_igv(igv_files, Channel.of(null), Channel.of(null), Channel.of(null)) 405 | // If the input reference is compressed, or the input fasta does not exists, emit faidx 406 | if (params.ref.toLowerCase().endsWith("gz") || !file("${params.ref}.fai").exists()){ 407 | igv_conf = igv_conf 408 | | concat( 409 | // If either the FASTA or the FAI have been modified in any way, emit them 410 | ref_ch 411 | | flatten 412 | | filter{it.startsWith("${workflow.workDir}")} 413 | ) 414 | } 415 | } else { 416 | igv_conf = Channel.empty() 417 | } 418 | 419 | // dump out artifacts thanks for calling 420 | output_stream( 421 | emit_xam 422 | | concat( 423 | pairings.last(), 424 | software_versions, 425 | workflow_params, 426 | igv_conf 427 | ) 428 | | filter{ it -> it.Name != "OPTIONAL_FILE"} 429 | ) 430 | 431 | // dump pod5s if requested 432 | if (params.duplex && params.dorado_ext == 'fast5' && params.output_pod5){ 433 | output_pod5s(basecaller_out.converted_pod5s) 434 | } 435 | 436 | // Stop file to input folder when read_limit stop condition is met. 
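    // When --watch_path is set the input directory is watched for new pod5/fast5 files;
    // the STOP.* file produced by the stopCondition process (called just below) is what
    // terminates that stream. The watching side presumably lives in lib/signal/ingress.nf;
    // as a minimal, hypothetical sketch, a watched channel can be closed on the stop file
    // with something like:
    //
    //     Channel
    //         .watchPath("${params.input}/*.${params.dorado_ext}", 'create')
    //         .until { it.name.startsWith("STOP.") }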
437 | String stop_filename = "STOP.${workflow.sessionId}.${params.dorado_ext}" 438 | if (params.watch_path && params.read_limit){ 439 | stopCondition(stats, stop_filename).first().subscribe { 440 | log.info "Creating STOP file: '$stop_filename'" 441 | } 442 | } 443 | 444 | } 445 | 446 | workflow.onComplete { 447 | Pinguscript.ping_complete(nextflow, workflow, params) 448 | } 449 | workflow.onError { 450 | Pinguscript.ping_error(nextflow, workflow, params) 451 | } 452 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // import profiles and workflow SHA from core 2 | includeConfig "base.config" 3 | 4 | 5 | // define workflow params 6 | params { 7 | help = false 8 | version = false 9 | aws_image_prefix = null 10 | aws_queue = null 11 | disable_ping = false 12 | 13 | monochrome_logs = false 14 | validate_params = true 15 | show_hidden_params = false 16 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 17 | 18 | // I/O 19 | input = null 20 | ref = null 21 | sample_name = "SAMPLE" 22 | store_dir = null 23 | 24 | // basecalling 25 | experimental = false 26 | /// common 27 | basecaller_chunk_size = 25 28 | basecaller_cfg = null 29 | basecaller_args = null 30 | basecaller_basemod_threads = 2 31 | duplex = false 32 | cuda_device = "cuda:all" 33 | ubam_map_threads = 8 34 | ubam_sort_threads = 3 35 | ubam_bam2fq_threads = 1 36 | merge_threads = 4 37 | stats_threads = 4 38 | basecaller_model_path = null 39 | remora_model_path = null 40 | qscore_filter = 10 41 | /// dorado 42 | remora_cfg = null 43 | dorado_ext = "pod5" 44 | poly_a_config = null 45 | /// bonito 46 | use_bonito = false 47 | bonito_cfg = 'dna_r10.4.1_e8.2_400bps_trns@v5.0.alpha' 48 | /// wf-basecalling 49 | output_fmt = "cram" 50 | output_pod5 = false 51 | // demuxing 52 | barcode_kit = null 53 | demux_args = null 54 | /// Stream input 55 | watch_path = false 56 | read_limit = null 57 | // Create IGV configuration 58 | igv = false 59 | 60 | wf { 61 | example_cmd = [ 62 | "--basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'", 63 | "--dorado_ext 'pod5'", 64 | "--input 'wf-basecalling-demo/input'", 65 | "--ref 'wf-basecalling-demo/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta'", 66 | "--remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2'", 67 | ] 68 | agent = null 69 | } 70 | } 71 | 72 | manifest { 73 | name = 'epi2me-labs/wf-basecalling' 74 | author = 'Oxford Nanopore Technologies' 75 | homePage = 'https://github.com/epi2me-labs/wf-basecalling' 76 | description = 'Helper workflow for basecalling ONT reads.' 
77 | mainScript = 'main.nf' 78 | nextflowVersion = '>=23.04.2' 79 | version = '1.5.1' 80 | } 81 | 82 | epi2melabs { 83 | tags = "wf-basecalling,dorado,basecalling,utility" 84 | icon = "faTty" 85 | } 86 | -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "wf-basecalling-report.html", 5 | "title": "workflow report", 6 | "description": "Report summarising the work done by the basecalling workflow", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "per-sample" 10 | }, 11 | "simplex-alignment-pass": { 12 | "filepath": "{{ alias }}.pass.simplex.{{ format }}", 13 | "title": "Simplex alignment file of passed reads", 14 | "description": "BAM or CRAM file of simplex reads for the sample that pass QC filtering.", 15 | "mime-type": "application/x-gzip", 16 | "optional": false, 17 | "type": "per-sample" 18 | }, 19 | "duplex-alignment-pass": { 20 | "filepath": "{{ alias }}.pass.duplex.{{ format }}", 21 | "title": "Duplex alignment file of passed reads", 22 | "description": "BAM or CRAM file of duplex reads for the sample that pass QC filtering. Created if duplex basecalling is requested.", 23 | "mime-type": "application/x-gzip", 24 | "optional": true, 25 | "type": "per-sample" 26 | }, 27 | "simplex-alignment-pass-index": { 28 | "filepath": "{{ alias }}.pass.simplex.{{ format }}.{{ index_format }}", 29 | "title": "Simplex alignment file index of passed reads", 30 | "description": "The index of the resulting BAM or CRAM file with the simplex reads that pass QC filtering.", 31 | "mime-type": "application/octet-stream", 32 | "optional": false, 33 | "type": "per-sample" 34 | }, 35 | "duplex-alignment-pass-index": { 36 | "filepath": "{{ alias }}.pass.duplex.{{ format }}.{{ index_format }}", 37 | "title": "Duplex alignment file index of passed reads", 38 | "description": "The index of the resulting BAM or CRAM file with the duplex reads that pass QC filtering. Created if duplex basecalling is requested.", 39 | "mime-type": "application/octet-stream", 40 | "optional": true, 41 | "type": "per-sample" 42 | }, 43 | "simplex-alignment-fail": { 44 | "filepath": "{{ alias }}.fail.simplex.{{ format }}", 45 | "title": "Simplex alignment file of failed reads", 46 | "description": "BAM or CRAM file of simplex reads for the sample that fail QC filtering.", 47 | "mime-type": "application/x-gzip", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "duplex-alignment-fail": { 52 | "filepath": "{{ alias }}.fail.duplex.{{ format }}", 53 | "title": "Duplex alignment file of failed reads", 54 | "description": "BAM or CRAM file of duplex reads for the sample that fail QC filtering. 
Created if duplex basecalling is requested.", 55 | "mime-type": "application/x-gzip", 56 | "optional": true, 57 | "type": "per-sample" 58 | }, 59 | "simplex-alignment-fail-index": { 60 | "filepath": "{{ alias }}.fail.simplex.{{ format }}.{{ index_format }}", 61 | "title": "Simplex alignment file index of failed reads", 62 | "description": "The index of the resulting BAM or CRAM file with the simplex reads that fail QC filtering.", 63 | "mime-type": "application/octet-stream", 64 | "optional": false, 65 | "type": "per-sample" 66 | }, 67 | "duplex-alignment-fail-index": { 68 | "filepath": "{{ alias }}.fail.duplex.{{ format }}.{{ index_format }}", 69 | "title": "Duplex alignment file index of failed reads", 70 | "description": "The index of the resulting BAM or CRAM file with the duplex reads that fail QC filtering. Created if duplex basecalling is requested.", 71 | "mime-type": "application/octet-stream", 72 | "optional": true, 73 | "type": "per-sample" 74 | }, 75 | "reference-index": { 76 | "filepath": "{{ ref }}.fai", 77 | "title": "Index of the reference FASTA file", 78 | "description": "Index of the reference FASTA file.", 79 | "mime-type": "text/tab-separated-values", 80 | "optional": true, 81 | "type": "aggregated" 82 | }, 83 | "igv-config": { 84 | "filepath": "igv.json", 85 | "title": "JSON configuration file for IGV browser", 86 | "description": "JSON configuration file to be loaded in IGV for visualising alignments against the reference genome.", 87 | "mime-type": "text/json", 88 | "optional": true, 89 | "type": "aggregated" 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /util/update_models_schema.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate nextflow_schema with updated basecaller enumerations 3 | # 4 | # This script uses `nextflow config` to obtain the basecaller container, 5 | # creates JSON arrays of the models using the container's list-models script 6 | # and injects them with jq to create nextflow_schema.json.new. 7 | set -euo pipefail 8 | 9 | TARGET=$1 10 | ENGINE=$2 11 | 12 | if ! command -v nextflow &> /dev/null 13 | then 14 | # we should be in CI, nextflow is installed right here 15 | NEXTFLOW="./nextflow" 16 | else 17 | NEXTFLOW=`which nextflow` 18 | fi 19 | 20 | # work out how to inspect the container contents 21 | DORADO_CONTAINER=$(${NEXTFLOW} config -flat | grep "process.'withLabel:wf_basecalling'.container" | awk -F'= ' '{print $2}' | sed "s,',,g") 22 | echo "# DORADO_CONTAINER=${DORADO_CONTAINER}" 23 | if [ "$ENGINE" = "simg" ]; then 24 | CMD_PREFIX="singularity exec docker://${DORADO_CONTAINER}" 25 | else 26 | CMD_PREFIX="docker run ${DORADO_CONTAINER}" 27 | fi 28 | 29 | # Convert model lists to JSON arrays 30 | SIMPLEX_MODELS=$(${CMD_PREFIX} list-models --simplex --only-names | jq -Rn '[inputs]') 31 | MODBASE_MODELS=$(${CMD_PREFIX} list-models --modbase --only-names | jq -Rn '[inputs]') 32 | 33 | # Inject JSON arrays to relevant schema enum 34 | jq \ 35 | -j \ 36 | --indent 4 \ 37 | --argjson simplex_models "${SIMPLEX_MODELS}" \ 38 | --argjson modbase_models "${MODBASE_MODELS}" \ 39 | '(.definitions.basecalling_options.properties.basecaller_cfg.enum) = $simplex_models | 40 | (.definitions.basecalling_options.properties.remora_cfg.enum) = $modbase_models' \ 41 | ${TARGET}/nextflow_schema.json > ${TARGET}/nextflow_schema.json.new 42 | 43 | echo "# Updated schema generated, you should inspect it before adopting it!" 
44 | echo "diff ${TARGET}/nextflow_schema.json ${TARGET}/nextflow_schema.json.new" 45 | echo "mv ${TARGET}/nextflow_schema.json.new ${TARGET}/nextflow_schema.json" 46 | --------------------------------------------------------------------------------
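Usage sketch for the helper script above — the argument meanings are inferred from the script body rather than documented here: the first argument is the directory containing nextflow_schema.json, the second selects the container engine ("simg" for Singularity, anything else falls through to Docker).

    # run from the repository root
    util/update_models_schema.sh . docker    # or: util/update_models_schema.sh . simg
    # then inspect and adopt the regenerated schema, as the script itself suggests
    diff nextflow_schema.json nextflow_schema.json.new
    mv nextflow_schema.json.new nextflow_schema.json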