├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── de_analysis.R ├── plot_dtu_results.R ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── check_sample_sheet_condition.py │ ├── de_plots.py │ ├── generate_pychopper_stats.py │ ├── merge_count_tsvs.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── parse_gffcompare.py │ ├── report.py │ ├── summarise_gff.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_check_sample_sheet_condition.py │ └── test_de_plots.py │ ├── util.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── evaluation └── tests.sh ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── main.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows ├── differential_expression.nf └── reference_assembly.nf └── test_data ├── SIRV_150601a.fasta ├── SIRV_150601a.fasta.fai ├── SIRV_isoforms.gtf ├── demultiplexed_fastq ├── barcode01 │ └── SIRV_E0_PCS109_51.fq.gz └── barcode02 │ └── SIRV_E0_PCS109_25.fq.gz ├── fastq └── SIRV_E0_PCS109_50.fq.gz ├── sample_sheet.csv └── workflow_glue ├── MSTRG.11088.gff3 ├── MSTRG.11088.gtf └── check_sample_sheet_condition ├── sample_sheet_1.csv ├── sample_sheet_2.csv ├── sample_sheet_3.csv └── sample_sheet_4.csv /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. 
Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? 
For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/.gitmodules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.57 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | args: [ 28 | "bin", 29 | "--import-order-style=google", 30 | "--statistics", 31 | "--max-line-length=88", 32 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 33 | ] 34 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.7.0] 8 | ### Changed 9 | - `split_bam` and `build_minimap_index_transcriptome` process memory allocation increased. 10 | - Updated recommended memory requirement. 11 | - Updated project description. 12 | - A common user issue is providing a ref_annotation and ref_genome parameter that have mismatched reference IDs, which causes the DE_analysis to fail. The workflow will now do an upfront check and give an error message if no overlap is found or a warning if some IDs are present in one file but not in the other. 13 | - Reconciled workflow with wf-template v5.5.0. 14 | - Sort the columns and rows of the gene and transcript count files. 15 | - DE_analysis alignment summary stats table no longer includes MAPQ or quality scores. MAPQ is not relevant for transcript alignment and quality scores are already available in the read summary section of the report. 16 | ### Fixed 17 | - `all_gene_counts.tsv` contained the DE counts results. 18 | - Reduced memory usage of the report workflow process. 19 | - Output BAM alignments in all cases unless the workflow is run with `transcriptome_source` set to `precomputed`. 20 | - Corrected the demo command in the `README.md`. 
21 | - The merged transcriptome generated for differential expression analysis now only contains the exons and not the full genomic sequence. 22 | - Output the gene name annotated differential expression analysis count files only. 23 | - Only use full length reads in the differential expression analysis. 24 | 25 | ## [v1.6.1] 26 | ### Fixed 27 | - `merge_gff_compare` failing with empty GFF files. 28 | 29 | ## [v1.6.0] 30 | ### Fixed 31 | - v1.5.0 bug; access to undefined channel output bug when using precomputed transcriptome. 32 | - Bug where incorrect gene_id assigned in the DE tables. 33 | 34 | ## [v1.5.0] 35 | ### Updated 36 | - Workflow report updated to use `ezcharts`. 37 | ### Fixed 38 | - Exons per isoforms histogram reporting incorrect numbers. 39 | - Output the `results_dexseq.tsv` file when `--de_analysis` enabled. 40 | ### Removed 41 | - per-class gffcompare tracking files as there exists a combine tracking file. 42 | 43 | ## [v1.4.0] 44 | ## Added 45 | - `--igv` parameter (default: false) for outputting IGV config allowing visualisation of read alignments in the EPI2ME App. 46 | - If required for IGV, reference indexes are output in to a `igv_reference` directory 47 | ### Changed 48 | - BAMS are output in to a BAMS directory. 49 | - Reconcile with template 5.2.6. 50 | 51 | ## [v1.3.0] 52 | ### Removed 53 | - Fusion detection subworkflow, as the functionality is not robust enough for general use at this time. 54 | ### Changed 55 | - Updated pychopper to 2.7.10 56 | ## Added 57 | - new `cdna_kit` options: PCS114 and PCB111/114 58 | 59 | ## [v1.2.1] 60 | ### Changed 61 | - Increase some memory and CPU allocations. 62 | 63 | ## [v1.2.0] 64 | ### Added 65 | - Workflow now accepts BAM or FASTQ files as input (using the --bam or --fastq parameters, respectively). 66 | ### Changed 67 | - MA plot in the `results_dge.pdf` has been updated to match the MA plot in the report. 68 | ### Added 69 | - Error message when running in `de_analysis` mode and `ref_annotation` input file contains unstranded annotations. 70 | 71 | ## [v1.1.1] 72 | ### Changed 73 | - Improved handling of different annotation file types (eg. `.gtf/.gff/.gff3`) in `de_analysis` mode. 74 | - Improved handling of annotation files that do not contain version numbers in transcript_id (such as gtf's from Ensembl). 75 | ### Fixed 76 | - Differential expression failing with 10 or more samples. 77 | - Regression causing the DE analysis numeric parameters to not be evaluated correctly. 78 | 79 | ## [v1.1.0] 80 | ### Changed 81 | - Improve documentation around filtering of transcripts done before DTU analysis. 82 | - Renamed files: 83 | - `de_analysis/all_counts_filtered.tsv` to `de_analysis/filtered_transcript_counts_with_genes.tsv` 84 | - `de_analysis/de_tpm_transcript_counts.tsv` to `de_analysis/unfiltered_tpm_transcript_counts.tsv` 85 | - Minimum memory requirements to `32 GB`. 86 | ### Added 87 | - Published isoforms table to output directory. 88 | - Output additional `de_analysis/cpm_gene_counts.tsv` with counts per million gene counts. 89 | - Output additional `de_analysis/unfiltered_transcript_counts_with_genes.tsv` with unfiltered transcript counts with associated gene IDs. 90 | - Add gene name column to the de_analysis counts TSV files. 91 | ### Fixed 92 | - Mapping stage using a single thread only. 93 | ### Changed 94 | - More memory assigned to the fusion detection process. 95 | - When no `--ref_annotation` is provided the workflow will still run but the output transcripts will not be annotated. 
However `--de_analysis` mode still requires a `--ref_annotation`. 96 | 97 | ## [v1.0.0] 98 | ### Added 99 | - Published minimap2 and pychopper results to output directory. 100 | - Two extra pychopper parameters `--cdna_kit` and `--pychopper_backend`. `--pychopper_options` is still available to define any other options. 101 | - Memory requirements for each process. 102 | ### Changed 103 | - Documentation. 104 | ### Fixed 105 | - When Jaffa is run only output one report. 106 | 107 | ## [v0.4.2] 108 | ### Changed 109 | - Sample sheet must include a `control` type to indicate which samples are the reference for the differential expression pipeline. 110 | ### Removed 111 | - Default local executor CPU and RAM limits. 112 | 113 | ## [v0.4.1] 114 | ### Changed 115 | - Updated docker container with Pychopper to support LSK114. 116 | 117 | ## [v0.4.0] 118 | ### Fixed 119 | - Remove dead links from README 120 | ### Removed 121 | - Denovo `--transcriptome_source` option. 122 | 123 | ## [v0.3.1] 124 | ### Added 125 | - Handling for input reference transcriptome headers that contain `|` 126 | 127 | ## [v0.3.0] 128 | ### Changed 129 | - Improve differential expression outputs. 130 | - Include transcript and gene count tables in DE_final folder. 131 | - If differential expression subworkflow is used a non redundant transcriptome will be output which includes novel transcripts. 132 | - Added wording to the report about how to identify novel transcripts in the DE tables. 133 | - Nextflow minimum required version to 23.04.2 134 | - `--minimap_index_opts` parameter has been changed to `minimap2_index_opts` for consistency. 135 | 136 | ### Added 137 | - An additional gene name column to the differential gene expression results. This is especially handy for transcriptomes where the gene ID is not the same as gene name (e.g. Ensembl). 138 | - Wording to the report about how to identify novel transcripts in the DE tables. 139 | 140 | ## [v0.2.1] 141 | ### Changed 142 | - Any sample aliases that contain spaces will be replaced with underscores. 143 | - Updated documentation to explain we only support Ensembl, NCBI and ENCODE annotation file types. 144 | 145 | ### Fixed 146 | - Documentation parameter examples corrected. 147 | - Handling for annotation files that use gene as gene_id attribute. 148 | - Handling for Ensembl annotation files. 149 | 150 | ## [v0.2.0] 151 | ### Changed 152 | - GitHub issue templates 153 | - Condition sheet is no longer required. The sample sheet is now used to indicate condition instead. 154 | - For differential expression, the sample sheet must have a `condition` column to indicate which condition group each sample in the sample sheet belongs to. 155 | - Values for the condition may be any two distinct strings, for example: treated/untreated; sample/control etc. 156 | 157 | ### Fixed 158 | - Remove default of null for `--ref_transcriptome`. 159 | - Read mapping summary table in the report has correct sample_ids. 160 | 161 | ## [v0.1.13] 162 | ### Added 163 | - Handling for GFF3 reference_annotation file type. 164 | - Warning for the `--transcriptome_source` denovo pipeline option. 
165 | 166 | ### Changed 167 | - Enum choices are enumerated in the `--help` output 168 | - Enum choices are enumerated as part of the error message when a user has selected an invalid choice 169 | - Bumped minimum required Nextflow version to 22.10.8 170 | 171 | ### Fixed 172 | - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads` 173 | - Fix for the `--transcriptome_source` denovo pipeline option. 174 | 175 | ## [v0.1.12] 176 | ### Added 177 | - Handling for GFF3 reference_annotation file type. 178 | - Handling gzip input reference and annotation parameters. 179 | - Handling for NCBI gtfs that contain some empty transcript ID fields. 180 | 181 | ## [v0.1.11] 182 | ### Changed 183 | - LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0. 184 | 185 | ### Added 186 | - Configuration for running demo data in AWS 187 | 188 | ## [v0.1.10] 189 | ### Changed 190 | - Condition sheet parameter description fixed to CSV 191 | - Update fastqingress 192 | 193 | ## [v0.1.9] 194 | ### Changed 195 | - Simplify JAFFAL docs 196 | 197 | ## [v0.1.8] 198 | ### Changed 199 | - Description in manifest 200 | 201 | ## [v0.1.7] 202 | ### Changed 203 | - `-profile conda` is no longer supported, users should use `-profile standard` (Docker) or `-profile singularity` instead 204 | - `nextflow run epi2me-labs/wf-transcriptomes --version` will now print the workflow version number and exit 205 | - Use parameter `--transcriptome-source` to define precalculated, reference-based or denovo 206 | 207 | ## [v0.1.6] 208 | ### Changed 209 | - Removed sanitize option 210 | - Reduce size of differential expression data. 211 | 212 | ### Added 213 | - Improved DE explanation in docs 214 | - Option to turn off transcript assembly steps with param transcript_assembly 215 | 216 | ### Fixed 217 | - Fix JAFFAL terminating workflow when no fusions found. 218 | - Error if condition sheet and sample sheet don't match. 219 | - Failed to plot DE graphs when one of data sets is 0 length. 220 | 221 | ## [v0.1.5] 222 | ### Added 223 | - Differential transcript and gene expression subworkflow 224 | 225 | ## [v0.1.4] 226 | ### Added 227 | - JAFFAL fusion detection subworkflow 228 | 229 | ### Changed 230 | - Args parser for fastqingress 231 | - Set out_dir option type to ensure output is written to correct directory on Windows 232 | - Skip unnecessary conversion to fasta from fastq 233 | - Fastqingress metadata map 234 | - Changed workflow name to wf-transcriptomes 235 | 236 | ## [v0.1.3] 237 | ### Changed 238 | - Better help text on cli 239 | - Use EPI2ME Labs-maintained version of pychopper 240 | 241 | ## [v0.1.2] 242 | ### Added 243 | - direct_rna option 244 | - Some extra error handling 245 | - Minor report display improvements 246 | 247 | ## [v0.1.1] 248 | ### Fixed 249 | - Incorrect numbers and of transcripts caused by merging gff files with same gene and transcript ids 250 | - Error handling in de novo pipeline. 
Skip clusters in build_backbones that cause an isONclust2 error 251 | - Several small fixes in report plotting 252 | 253 | ## [v0.1.0] 254 | ### Added 255 | - Added the denovo pipeline 256 | 257 | ### Changed 258 | - Updates to the report plots 259 | 260 | ## [v0.0.1] 261 | ### Added 262 | - First release 263 | - Initial port of Snakemake WF from https://github.com/nanoporetech/pipeline-nanopore-ref-isoforms 264 | 265 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. 
License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. 
Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /bin/de_analysis.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(argparser)) 4 | 5 | parser <- arg_parser("Run differential expression analysis") 6 | parser <- add_argument(parser, "--annotation", help="Reference annotation.") 7 | parser <- add_argument(parser, "--min_samps_gene_expr", help="Minimum number of samples a gene must be expressed in to be included in differential gene expression.", type="numeric") 8 | parser <- add_argument(parser, "--min_samps_feature_expr", help="Minimum number of samples for differential transcript usage.", type="numeric") 9 | parser <- add_argument(parser, "--min_gene_expr", help="Minimum counts per gene required for differential gene expression.", type="numeric") 10 | parser <- add_argument(parser, "--min_feature_expr", help="Minimum counts per transcript required for differential transcript usage.", type="numeric") 11 | parser <- add_argument(parser, "--sample_sheet", help="Sample sheet.") 12 | parser <- add_argument(parser, "--all_counts", help="All transcript counts CSV file.") 13 | parser <- add_argument(parser, "--de_out_dir", help="Directory where differential expression out files will be saved. 
Directory will be created if it does not exist", default="de_analysis") 14 | parser <- add_argument(parser, "--merged_out_dir", help="Directory where merged count files will be saved. Directory will be created if it does not exist", default="merged") 15 | argv <- parse_args(parser) 16 | 17 | suppressMessages(library("DRIMSeq")) 18 | suppressMessages(library("GenomicFeatures")) 19 | suppressMessages(library("edgeR")) 20 | 21 | # Create output directories 22 | if (!dir.exists(argv$de_out_dir)){ 23 | dir.create(argv$de_out_dir, recursive=TRUE) 24 | } 25 | if (!dir.exists(argv$merged_out_dir)){ 26 | dir.create(argv$merged_out_dir, recursive=TRUE) 27 | } 28 | 29 | cat("Loading counts, conditions and parameters.\n") 30 | cts <- as.matrix(read.csv(argv$all_counts, sep="\t", row.names="Reference", stringsAsFactors=FALSE)) 31 | 32 | # Set up sample data frame: 33 | #changed this to sample_id 34 | coldata <- read.csv(argv$sample_sheet, row.names="alias", sep=",", stringsAsFactors=TRUE) 35 | 36 | coldata$sample_id <- rownames(coldata) 37 | # check if control condition exists, sets as reference 38 | if(!"control" %in% coldata$condition) 39 | stop("sample_sheet.csv does not contain 'control' 40 | condition - unable to set reference.") 41 | coldata$condition <- relevel(coldata$condition, ref = "control") 42 | 43 | # a .gff annotation file extension may be gff2(gtf) or gff3 so check in files for use of = in the attribute field 44 | # if '=' present it is gff3 if not it is gtf. 45 | # see https://www.ensembl.org/info/website/upload/gff.html 46 | # and http://gmod.org/wiki/GFF2#Converting_GFF2_to_GFF3 47 | cat("Checking annotation file type.\n") 48 | lines <- readLines(file(argv$annotation), n=10000) 49 | # If transcript_id containing '=' (format eg. transcript_id=xxx) 50 | # annotation type is gff3 51 | check_file_type <- sum(grepl("transcript_id=", lines)) 52 | if (check_file_type != 0){ 53 | cat("Annotation file type is gff3.\n") 54 | annotation_type <- "gff3" 55 | } else { 56 | # otherwise gtf 57 | cat("Annotation file type is gtf.\n") 58 | annotation_type <- "gtf" 59 | } 60 | 61 | # Transcript_id versions (eg. ENTXXX.1, eg. ENTXXX.2) represent how many times that transcript reference has been changed 62 | # during its time in the database. 63 | # Not all annotation files include it as part of the transcript_id - notably Ensembl 64 | # The following handles this. 65 | cat("Checking annotation file for presence of transcript_id versions.\n") 66 | # Get the first transcript_id from the annotation file by parsing 67 | lines <- readLines(file(argv$annotation), n=100000) 68 | # Find transcript_ids in first 1000 lines and check if they contain dot (format eg. 
ENTXXX.1) 69 | check_version <- sum(grepl("transcript_id[^;]+\\.", lines)) 70 | if (check_version != 0){ 71 | # we do not need to strip the count file rows if ref_annotation includes versions 72 | cat("Annotation file transcript_ids include versions.\n") 73 | } else { 74 | # otherwise remove the versions 75 | rownames(cts) <- lapply(rownames(cts), sub, pattern = "\\.\\d+$", replacement = "") 76 | cat("Annotation file transcript_ids do not include versions so also strip versions from the counts df.\n") 77 | } 78 | 79 | cat("Loading annotation database.\n") 80 | txdb <- makeTxDbFromGFF(argv$annotation, format = annotation_type) 81 | txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID") 82 | tab <- table(txdf$GENEID) 83 | txdf$ntx<- tab[match(txdf$GENEID, names(tab))] 84 | 85 | 86 | cts <- cts[rownames(cts) %in% txdf$TXNAME, ] # FIXME: filter for transcripts which are in the annotation. Why they are not all there? 87 | 88 | # Reorder transcript/gene database to match input counts: 89 | txdf <- txdf[match(rownames(cts), txdf$TXNAME), ] 90 | rownames(txdf) <- NULL 91 | 92 | # Create counts data frame: 93 | counts<-data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts) 94 | 95 | # output unfiltered version of the counts table now we have paired transcripts with gene ids 96 | write.table(counts, file=file.path(argv$de_out_dir, "unfiltered_transcript_counts_with_genes.tsv"), sep="\t", row.names = FALSE, quote=FALSE) 97 | 98 | cat("Filtering counts using DRIMSeq.\n") 99 | 100 | d <- dmDSdata(counts=counts, samples=coldata) 101 | trs_cts_unfiltered <- counts(d) 102 | 103 | d <- dmFilter(d, min_samps_gene_expr=argv$min_samps_gene_expr, min_samps_feature_expr=argv$min_samps_feature_expr, 104 | min_gene_expr=argv$min_gene_expr, min_feature_expr=argv$min_feature_expr) 105 | 106 | cat("Building model matrix.\n") 107 | design <- model.matrix(~condition, data=DRIMSeq::samples(d)) 108 | 109 | 110 | 111 | suppressMessages(library("dplyr")) 112 | 113 | # Sum transcript counts into gene counts: 114 | cat("Sum transcript counts into gene counts.\n") 115 | trs_cts <- counts(d) 116 | write.table(trs_cts, file=file.path(argv$merged_out_dir, "filtered_transcript_counts_with_genes.tsv"), sep="\t", row.names = FALSE, quote=FALSE) 117 | 118 | gene_cts <- trs_cts_unfiltered %>% dplyr::select(c(1, 3:ncol(trs_cts))) %>% group_by(gene_id) %>% summarise_all(tibble::lst(sum)) %>% data.frame() 119 | rownames(gene_cts) <- gene_cts$gene_id 120 | gene_cts$gene_id <- NULL 121 | write.table(gene_cts, file=file.path(argv$merged_out_dir, "all_gene_counts.tsv"), sep="\t", quote=FALSE) 122 | 123 | # Output count per million of the gene counts using edgeR CPM 124 | cpm_gene_counts <- cpm(gene_cts) 125 | # Add gene_id as index column header 126 | cpm_gene_counts <- cbind(var_name = rownames(cpm_gene_counts), cpm_gene_counts) 127 | rownames(cpm_gene_counts) <- NULL 128 | colnames(cpm_gene_counts)[1] <- "gene_id" 129 | write.table(cpm_gene_counts, file=file.path(argv$de_out_dir, "cpm_gene_counts.tsv"), sep="\t", quote=FALSE, row.names = FALSE) 130 | 131 | # Differential gene expression using edgeR: 132 | cat("Running differential gene expression analysis using edgeR.\n") 133 | 134 | y <- DGEList(gene_cts) 135 | y <- calcNormFactors(y) 136 | y <- estimateDisp(y,design) 137 | fit <- glmQLFit(y,design) 138 | qlf <- glmQLFTest(fit) 139 | edger_res <- topTags(qlf, n=nrow(y), sort.by="PValue")[[1]] 140 | 141 | pdf("de_analysis/results_dge.pdf") 142 | 143 | # create status vector 144 | status <- ifelse( 145 | qlf$PValue<0.01 & 
qlf$logFC>0, 146 | 'up', 147 | ifelse( 148 | qlf$PValue<0.01 & qlf$logFC<=0, 149 | 'down', 150 | 'notsig' 151 | ) 152 | ) 153 | plotMD(qlf, status=status, values=c("up","down","notsig"), hl.col=c("red","blue","black")) 154 | abline(h=c(-1,1), col="blue") 155 | plotQLDisp(fit) 156 | 157 | write.table(as.data.frame(edger_res), file=file.path(argv$de_out_dir, "results_dge.tsv"), sep="\t") 158 | 159 | # Differential transcript usage using DEXSeq: 160 | suppressMessages(library("DEXSeq")) 161 | cat("Running differential transcript usage analysis using DEXSeq.\n") 162 | 163 | sample.data<-DRIMSeq::samples(d) 164 | count.data <- round(as.matrix(counts(d)[,-c(1:2)])) 165 | dxd <- DEXSeqDataSet(countData=count.data, sampleData=sample.data, design=~sample + exon + condition:exon, featureID=trs_cts$feature_id, groupID=trs_cts$gene_id) 166 | dxd <- estimateSizeFactors(dxd) 167 | dxd <- estimateDispersions(dxd) 168 | dxd <- testForDEU(dxd, reducedModel=~sample + exon) 169 | dxd <- estimateExonFoldChanges( dxd, fitExpToVar="condition") 170 | dxr <- DEXSeqResults(dxd, independentFiltering=FALSE) 171 | 172 | dev.off() 173 | pdf("de_analysis/results_dtu.pdf") 174 | plotMA(dxr, cex=0.8, alpha=0.05) 175 | plotDispEsts(dxd) 176 | 177 | qval <- perGeneQValue(dxr) 178 | dxr.g<-data.frame(gene=names(qval), qval) 179 | dxr.g <- dxr.g[order(dxr.g$qval),] 180 | 181 | dxr_out <- as.data.frame(dxr[,c("featureID", "groupID", "pvalue")]) 182 | dxr_out <- dxr_out[order(dxr$pvalue),] 183 | 184 | write.table(dxr.g, file=file.path(argv$de_out_dir, "results_dtu_gene.tsv"), sep="\t") 185 | write.table(dxr_out, file=file.path(argv$de_out_dir, "results_dtu_transcript.tsv"), sep="\t") 186 | 187 | # and writing out some of the DEXSeq metrics to accompany EPI2ME Labs tutorial 188 | colnames(dxr)[grep("log2fold", colnames(dxr))] <- "log2fold" 189 | MADTUdata <- data.frame(dxr)[order(dxr$padj),c("exonBaseMean", "log2fold", "pvalue", "padj")] 190 | MADTUdata$exonBaseMean <- log2(MADTUdata$exonBaseMean) 191 | colnames(MADTUdata)[which(colnames(MADTUdata)=="exonBaseMean")] <- "Log2MeanExon" 192 | colnames(MADTUdata)[which(colnames(MADTUdata)=="log2fold")] <- "Log2FC" 193 | write.table(MADTUdata, file=file.path(argv$de_out_dir, "results_dexseq.tsv"), sep="\t") 194 | 195 | # stageR analysis of DEXSeq results: 196 | cat("stageR analysis\n") 197 | library(stageR) 198 | 199 | cat("Running stageR analysis on the differential transcript usage results.\n") 200 | pConfirmation <- matrix(dxr$pvalue, ncol=1) 201 | 202 | dimnames(pConfirmation) <- list(dxr$featureID, "transcript") 203 | pScreen <- qval 204 | tx2gene <- as.data.frame(dxr[,c("featureID", "groupID")]) 205 | 206 | stageRObj <- stageRTx(pScreen=pScreen, pConfirmation=pConfirmation, pScreenAdjusted=TRUE, tx2gene=tx2gene) 207 | # note: the choice of 0.05 here means you can *only* threshold at 5% OFDR later 208 | stageRObj <- stageWiseAdjustment(stageRObj, method="dtu", alpha=0.10) 209 | suppressWarnings({dex.padj <- getAdjustedPValues(stageRObj, order=FALSE, onlySignificantGenes=FALSE)}) 210 | 211 | # dex.padj <- dex.padj[,-1] 212 | write.table(dex.padj, file=file.path(argv$de_out_dir, "results_dtu_stageR.tsv"), sep="\t") 213 | -------------------------------------------------------------------------------- /bin/plot_dtu_results.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(argparser)) 4 | 5 | parser <- arg_parser("Plot results") 6 | parser <- add_argument(parser, "--counts", help="Filtered 
transcript counts with genes.") 7 | parser <- add_argument(parser, "--results_dtu", help="stageR results.") 8 | parser <- add_argument(parser, "--sample_sheet", help="Sample sheet.") 9 | parser <- add_argument(parser, "--pdf_out", help="PDF file name.") 10 | argv <- parse_args(parser) 11 | 12 | suppressMessages(library(dplyr)) 13 | suppressMessages(library(ggplot2)) 14 | suppressMessages(library(tidyr)) 15 | 16 | # Set up sample data frame: 17 | coldata <- read.csv(argv$sample_sheet, row.names="alias", sep=",") 18 | coldata$condition <- factor(coldata$condition, levels=rev(levels(coldata$condition))) 19 | coldata$type <-NULL 20 | coldata$patient <-NULL 21 | 22 | # Read stageR results: 23 | stageR <- read.csv(argv$results_dtu, sep="\t") 24 | names(stageR) <- c("gene_id", "transcript_id", "p_gene", "p_transcript"); 25 | 26 | # Read filtered counts: 27 | counts <- read.csv(argv$counts, sep="\t"); 28 | names(counts)[2]<-"transcript_id" 29 | 30 | # Join counts and stageR results: 31 | df <- counts %>% left_join(stageR, by = c("gene_id", "transcript_id")) 32 | df <- df[order(df$p_gene),] 33 | 34 | scols <- setdiff(names(df),c("gene_id", "transcript_id", "p_gene", "p_transcript")) 35 | 36 | # Normalise counts: 37 | for(sc in scols){ 38 | df[sc] <- df[sc] / sum(df[sc]) 39 | } 40 | 41 | # Melt data frame: 42 | tdf <- df %>% gather(key='sample', value='norm_count',-gene_id, -transcript_id, -p_gene, -p_transcript) 43 | 44 | # Add sample group column: 45 | sampleToGroup<-function(x){ 46 | return(coldata[x,]$condition) 47 | } 48 | 49 | tdf$group <- sampleToGroup(tdf$sample) 50 | 51 | # Filter for significant genes: 52 | sig_level <- 0.05 53 | genes <- as.character(tdf[which(tdf$p_gene < sig_level),]$gene_id) 54 | genes <- unique(genes) 55 | 56 | pdf(argv$pdf_out) 57 | 58 | for(gene in genes){ 59 | gdf<-tdf[which(tdf$gene_id==gene),] 60 | p_gene <- unique(gdf$p_gene) 61 | dtu_plot <- ggplot(gdf, aes(x=transcript_id, y=norm_count)) + geom_bar(stat="identity", aes(fill=sample), position="dodge") 62 | dtu_plot <- dtu_plot + facet_wrap(~ group) + coord_flip() 63 | dtu_plot <- dtu_plot + ggtitle(paste(gene," : p_value=",p_gene,sep="")) 64 | print(dtu_plot) 65 | } 66 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Entry point for sc_tools (single_cell_tools).""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == '__main__': 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components 
= dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | except ModuleNotFoundError as e: 44 | # if imports cannot be satisifed, refuse to add the component 45 | # rather than exploding 46 | logger.warn(f"Could not load {name} due to missing module {e.name}") 47 | continue 48 | 49 | # if theres a main() and and argparser() thats good enough for us. 50 | try: 51 | req = "main", "argparser" 52 | if all(callable(getattr(mod, x)) for x in req): 53 | components[name] = mod 54 | except Exception: 55 | pass 56 | return components 57 | 58 | 59 | def cli(): 60 | """Run workflow entry points.""" 61 | logger = get_main_logger(_package_name) 62 | logger.info("Bootstrapping CLI.") 63 | parser = argparse.ArgumentParser( 64 | 'wf-glue', 65 | parents=[_log_level()], 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | 68 | parser.add_argument( 69 | '-v', '--version', action='version', 70 | version='%(prog)s {}'.format(__version__)) 71 | 72 | subparsers = parser.add_subparsers( 73 | title='subcommands', description='valid commands', 74 | help='additional help', dest='command') 75 | subparsers.required = True 76 | 77 | # importing everything can take time, try to shortcut 78 | if len(sys.argv) > 1: 79 | components = get_components(allowed_components=[sys.argv[1]]) 80 | if not sys.argv[1] in components: 81 | logger.warn("Importing all modules, this may take some time.") 82 | components = get_components() 83 | else: 84 | components = get_components() 85 | 86 | # add all module parsers to main CLI 87 | for name, module in components.items(): 88 | p = subparsers.add_parser( 89 | name.split(".")[-1], parents=[module.argparser()]) 90 | p.set_defaults(func=module.main) 91 | 92 | args = parser.parse_args() 93 | 94 | logger.info("Starting entrypoint.") 95 | args.func(args) 96 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_sample_sheet_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Check if a sample sheet is valid.""" 3 | from collections import Counter 4 | import csv 5 | import sys 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def main(args): 11 | """Run the entry point.""" 12 | logger = get_named_logger("checkSheetCondition") 13 | with open(args.sample_sheet, "r") as f: 14 | csv_reader = csv.DictReader(f) 15 | conditions_count = Counter() 16 | for row in csv_reader: 17 | if "condition" in row: 18 | conditions_count[row['condition']] += 1 19 | else: 20 | sys.exit( 21 | "Sample sheet has no condition column " 22 | "which is required for the " 23 | "differential expression subworkflow.") 24 | if len(conditions_count.keys()) != 2: 25 | sys.exit( 26 | "There must be only two unique conditions " 27 | "in the condition column of the sample sheet.") 28 | if "control" not in conditions_count: 29 | sys.exit( 30 | "One of the condition types must be control, " 31 | "to indicate which samples to use as the reference.") 32 | if any(v < 2 for v in conditions_count.values()): 33 | sys.exit( 34 | "There must be at least 2 repeats for each " 35 
| "condition indicated in the sample sheet.") 36 | logger.info(f"Checked sample sheet for condition column {args.sample_sheet}.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_sample_sheet_condition") 42 | parser.add_argument("sample_sheet", help="Sample sheet to check") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/de_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create de report section.""" 3 | import json 4 | import os 5 | 6 | from dominate.tags import h5, p 7 | from dominate.util import raw 8 | from ezcharts import scatterplot 9 | from ezcharts.components.ezchart import EZChart 10 | from ezcharts.layout.snippets import DataTable 11 | from natsort import natsorted 12 | import numpy as np 13 | import pandas as pd 14 | 15 | 16 | def flagstats_df(flagstats_reports): 17 | """Flag stats alignment dataframe.""" 18 | flagstats_dic = {} 19 | for flagstat in flagstats_reports.iterdir(): 20 | with open(flagstat, "r") as f: 21 | data = json.load(f) 22 | data = data["QC-passed reads"] 23 | flagstats = [ 24 | 'mapped', 'primary mapped', 'secondary', 'supplementary'] 25 | per_sample_flagstats = {key: data.get(key) for key in flagstats} 26 | sample = os.path.basename(flagstat).split(".")[0] 27 | flagstats_dic[sample] = per_sample_flagstats 28 | alignment_summary_df = pd.DataFrame(flagstats_dic) 29 | alignment_summary_df = alignment_summary_df[ 30 | natsorted(alignment_summary_df.columns)] 31 | alignment_summary_df.index = [ 32 | "Total Read Mappings", 33 | "Primary", "Secondary", 34 | "Supplementary"] 35 | alignment_summary_df.index.name = "Statistic" 36 | return alignment_summary_df 37 | 38 | 39 | def dexseq_section(dexseq_file, tr_id_to_gene_name, tr_id_to_gene_id, pval_thresh): 40 | """Add gene isoforms table and plot.""" 41 | h5("Differential Isoform usage") 42 | p("""Table showing gene isoforms, ranked by adjusted 43 | p-value, from the DEXSeq analysis. Information shown includes the log2 fold 44 | change between experimental conditions, the log-scaled transcript 45 | abundance and the false discovery corrected p-value (FDR - Benjamini-Hochberg) . 46 | This table has not been filtered 47 | for genes that satisfy statistical or magnitudinal thresholds""") 48 | 49 | dexseq_results = pd.read_csv(dexseq_file, sep='\t') 50 | dexseq_results.index.name = "gene_id:transcript_id" 51 | 52 | # Replace any occurrences of stringtie-generated MSTRG gene ids with 53 | # reference gene_ids. 54 | dexseq_results.index = dexseq_results.index.map( 55 | lambda ge_tr: str( # lookup gene_id from transcript_id [1] 56 | f"{tr_id_to_gene_id.get(ge_tr.split(':')[1])}: {str(ge_tr.split(':')[1])}") 57 | ) 58 | 59 | # Add gene name column. 60 | dexseq_results.insert(0, "gene_name", dexseq_results.index.map( 61 | lambda x: tr_id_to_gene_name.get(x.split(':')[1]))) 62 | 63 | DataTable.from_pandas( 64 | dexseq_results.sort_values(by='pvalue', ascending=True), use_index=True) 65 | 66 | p( 67 | """The figure below presents the MA plot from the DEXSeq analysis. 68 | M is the log2 ratio of isoform transcript abundance between conditions. 69 | A is the log2 transformed mean abundance value. 
70 | Transcripts that satisfy the logFC and FDR-corrected 71 | (False discovery rate - Benjamini-Hochberg) p-value 72 | thresholds defined are shaded as 'Up-' or 'Down-' regulated.""") 73 | 74 | dexseq_results['direction'] = 'not_sig' 75 | 76 | dexseq_results.loc[ 77 | (dexseq_results["Log2FC"] > 0) & (dexseq_results['pvalue'] < pval_thresh), 78 | 'direction'] = 'up' 79 | 80 | dexseq_results.loc[ 81 | (dexseq_results["Log2FC"] <= 0) & (dexseq_results['pvalue'] < pval_thresh), 82 | 'direction'] = 'down' 83 | 84 | plot = scatterplot( 85 | data=dexseq_results, x='Log2MeanExon', y='Log2FC', hue='direction', 86 | palette=['#E32636', '#7E8896', '#0A22DE'], 87 | hue_order=['up', 'down', 'not_sig'], marker='circle') 88 | plot._fig.xaxis.axis_label = "A (log2 transformed mean exon read counts)" 89 | plot._fig.yaxis.axis_label = "M (log2 transformed differential abundance)" 90 | plot.legend = dict(orient='horizontal', top=30) 91 | plot._fig.title = "Average copy per million (CPM) vs Log-fold change (LFC)" 92 | EZChart(plot) 93 | 94 | 95 | def dtu_section(dtu_file, txid_to_gene_name): 96 | """Plot dtu section.""" 97 | dtu_results = pd.read_csv(dtu_file, sep='\t') 98 | dtu_results["gene_name"] = dtu_results["txID"].apply( 99 | lambda x: txid_to_gene_name.get(x)) 100 | 101 | dtu_pvals = dtu_results.sort_values(by='gene', ascending=True) 102 | raw("""Table showing gene and transcript identifiers 103 | and their FDR-corrected (False discovery rate - Benjamini-Hochberg) probabilities 104 | for the genes and their isoforms that have been 105 | identified as showing DTU using the R packages DEXSeq and StageR. 106 | This list has been shortened requiring that both gene and transcript 107 | must satisfy the p-value 108 | threshold""") 109 | DataTable.from_pandas(dtu_results.loc[dtu_pvals.index], use_index=False) 110 | 111 | raw("""View dtu_plots.pdf file to see plots of differential isoform usage""") 112 | 113 | 114 | def dge_section(df, pval_thresh): 115 | """Create DGE table and MA plot.""" 116 | h5("Differential gene expression") 117 | df[['logFC', 'logCPM', 'F']] = df[ 118 | ['logFC', 'logCPM', 'F']].round(2) 119 | 120 | p("""Table showing the genes from the edgeR analysis. 121 | Information shown includes the log2 fold change between 122 | experimental conditions, the log-scaled counts per million measure of abundance 123 | and the FDR-corrected p-value (False discovery rate - Benjamini-Hochberg). 124 | This table has not been 125 | filtered for genes that satisfy statistical or magnitudinal thresholds""") 126 | 127 | df = df.sort_values('FDR', ascending=True) 128 | df.index.name = 'gene_id' 129 | DataTable.from_pandas(df, use_index=True) 130 | 131 | h5("Results of the edgeR Analysis.") 132 | 133 | p("""This plot visualises differences in measurements between the 134 | two experimental conditions. M is the log2 ratio of gene expression 135 | calculated between the conditions. 136 | A is a log2 transformed mean expression value. 137 | The figure below presents the MA figure from this edgeR analysis. 138 | Genes that satisfy the logFC and FDR-corrected 139 | (False discovery rate - Benjamini-Hochberg) p-value thresholds 140 | defined are shaded as 'Up-' or 'Down-' regulated. 
141 | """) 142 | df['sig'] = None 143 | df.loc[(df["logFC"] > 0) & (df['PValue'] < pval_thresh), 'sig'] = 'up' 144 | df.loc[(df["logFC"] <= 0) & (df['PValue'] < pval_thresh), 'sig'] = 'down' 145 | df.loc[(df["PValue"] >= pval_thresh), 'sig'] = 'not_sig' 146 | 147 | plot = scatterplot( 148 | data=df, x='logCPM', y='logFC', hue='sig', 149 | palette=['#E32636', '#7E8896', '#0A22DE'], 150 | hue_order=['up', 'not_sig', 'down'], marker='circle') 151 | plot._fig.x_range.start = 10 152 | plot._fig.xaxis.axis_label = "Average log CPM" 153 | plot._fig.yaxis.axis_label = "Log-fold change" 154 | plot.legend = dict(orient='horizontal', top=30) 155 | # Should opacity of the symbols be lowered? 156 | plot._fig.title = "Average copy per million (CPM) vs Log-fold change (LFC)" 157 | EZChart(plot) 158 | 159 | 160 | def salmon_table(salmon_counts): 161 | """Create salmon counts summary table.""" 162 | salmon_counts = pd.read_csv(salmon_counts, sep='\t') 163 | salmon_counts.set_index("Reference", drop=True, append=False, inplace=True) 164 | salmon_size_top = salmon_counts.sum(axis=1).sort_values(ascending=False) 165 | salmon_counts = salmon_counts.applymap(np.int64) 166 | h5("Transcripts Per Million") 167 | p("""Table showing the annotated Transcripts Per Million 168 | identified by Minimap2 mapping and Salmon transcript 169 | detection. Displaying the top 100 transcripts with the highest 170 | number of mapped reads""") 171 | 172 | salmon_counts = salmon_counts[sorted(salmon_counts.columns)] 173 | DataTable.from_pandas( 174 | salmon_counts.loc[salmon_size_top.index].head(n=100), use_index=True) 175 | 176 | 177 | def get_translations(gtf): 178 | """Create gene_and transcript id mappings. 179 | 180 | Annotation can be stringtie-generated (GTF) or from the input 181 | reference annotation (GTF or GFF3) and the various attributes can differ 182 | """ 183 | with open(gtf) as fh: 184 | txid_to_gene_name = {} 185 | gid_to_gene_name = {} 186 | tx_id_to_gene_id = {} 187 | 188 | def get_feature(row, feature): 189 | return row.split(feature)[1].split( 190 | ";")[0].replace('=', '').replace("\"", "").strip() 191 | 192 | for gff_entry in fh: 193 | # Process transcripts features only 194 | if gff_entry.startswith("#") or gff_entry.split('\t')[2] != 'transcript': 195 | continue 196 | # Different gtf/gff formats contain different attributes 197 | # and different formating (eg. gene_name="xyz" or gene_name "xyz") 198 | gene_name = gene_id = transcript_id = 'unknown' 199 | 200 | if 'ref_gene_id' in gff_entry: 201 | # Favour ref_gene_id over gene_id. 
The latter can be multi-locus merged 202 | # genes from stringtie 203 | gene_id = get_feature(gff_entry, 'ref_gene_id') 204 | elif 'gene_id' in gff_entry: 205 | gene_id = get_feature(gff_entry, 'gene_id') 206 | else: 207 | gene_id = get_feature(gff_entry, 'gene') 208 | 209 | if 'transcript_id' in gff_entry: 210 | transcript_id = get_feature(gff_entry, 'transcript_id') 211 | 212 | if 'gene_name' in gff_entry: 213 | gene_name = get_feature(gff_entry, 'gene_name') 214 | else: 215 | # Fallback to gene_id if gene_name is not present 216 | gene_name = gene_id 217 | 218 | txid_to_gene_name[transcript_id] = gene_name 219 | tx_id_to_gene_id[transcript_id] = gene_id 220 | gid_to_gene_name[gene_id] = gene_name 221 | return txid_to_gene_name, tx_id_to_gene_id, gid_to_gene_name 222 | 223 | 224 | def de_section( 225 | annotation, dge, dexseq, dtu, 226 | tpm, report, filtered, unfiltered, 227 | gene_counts, flagstats_dir, pval_threshold=0.01): 228 | """Differential expression sections.""" 229 | with (report.add_section("Differential expression", "DE")): 230 | 231 | p("""This section shows differential gene expression 232 | and differential isoform usage. Salmon was used to 233 | assign reads to individual annotated isoforms defined by 234 | the GTF-format annotation. 235 | These counts were used to perform a statistical analysis to identify 236 | the genes and isoforms that show differences in abundance between 237 | the experimental conditions. 238 | Any novel genes or transcripts that do not have relevant gene or transcript IDs 239 | are prefixed with MSTRG for use in differential expression analysis. 240 | Find the full sequences of any transcripts in the 241 | final_non_redundant_transcriptome.fasta file. 242 | """) 243 | alignment_summary_df = flagstats_df(flagstats_dir) 244 | h5("Alignment summary stats") 245 | DataTable.from_pandas(alignment_summary_df, use_index=True) 246 | salmon_table(tpm) 247 | 248 | # Get translations for adding gene names to tables 249 | ( 250 | txid_to_gene_name, txid_to_gene_id, gid_to_gene_name 251 | ) = get_translations(annotation) 252 | 253 | # Add gene names columns to counts files and write out 254 | # for publishing to user dir. 
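        # The blocks below emit, in order: results_dge.tsv (per-gene edgeR results),
        # all_gene_counts.tsv, filtered/unfiltered_transcript_counts_with_genes.tsv
        # and unfiltered_tpm_transcript_counts.tsv, each with a gene_name column
        # added via the translation dicts built above.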
255 | df_dge = pd.read_csv(dge, sep='\t') 256 | df_dge.insert(0, 'gene_name', df_dge.index.map( 257 | lambda x: gid_to_gene_name.get(x))) 258 | df_dge.to_csv('results_dge.tsv', index=True, index_label="gene_id", sep="\t") 259 | 260 | df_gene_counts = pd.read_csv(gene_counts, sep='\t') 261 | df_gene_counts.insert( 262 | 0, 'gene_name', df_gene_counts.index.map( 263 | lambda x: gid_to_gene_name.get(x))) 264 | df_gene_counts.to_csv( 265 | 'all_gene_counts.tsv', index=True, index_label="gene_id", sep="\t") 266 | 267 | df_filtered = pd.read_csv(filtered, sep='\t') 268 | df_filtered.insert(1, "gene_name", df_filtered.gene_id.map( 269 | lambda x: gid_to_gene_name.get(x))) 270 | df_filtered.to_csv( 271 | 'filtered_transcript_counts_with_genes.tsv', index=False, sep='\t') 272 | 273 | df_unfiltered = pd.read_csv(unfiltered, sep='\t') 274 | df_unfiltered.insert(1, "gene_name", df_unfiltered.gene_id.map( 275 | lambda x: gid_to_gene_name.get(x))) 276 | df_unfiltered.to_csv( 277 | 'unfiltered_transcript_counts_with_genes.tsv', index=False, sep='\t') 278 | 279 | df_tpm = pd.read_csv(tpm, sep='\t') 280 | df_tpm.insert(1, "gene_name", df_tpm.Reference.map( 281 | lambda x: txid_to_gene_name.get(x))) 282 | df_tpm.to_csv("unfiltered_tpm_transcript_counts.tsv", index=False, sep='\t') 283 | 284 | # Add tables to report 285 | dge_section(df_dge, pval_threshold) 286 | dexseq_section(dexseq, txid_to_gene_name, txid_to_gene_id, pval_threshold) 287 | dtu_section(dtu, txid_to_gene_name) 288 | -------------------------------------------------------------------------------- /bin/workflow_glue/generate_pychopper_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Generate CSV of pychopper stats.""" 3 | 4 | # -*- coding: utf-8 -*- 5 | 6 | import os 7 | 8 | import pandas as pd 9 | 10 | from .util import wf_parser # noqa: ABS101 11 | 12 | 13 | def argparser(): 14 | """Argument parser for entrypoint.""" 15 | parser = wf_parser("generate_pychopper_stats") 16 | parser.add_argument("--data", required=True, help="") 17 | parser.add_argument("--output_dir", required=True, help="") 18 | 19 | return parser 20 | 21 | 22 | def generate_pychopper_stats(tsv, output): 23 | """Make CSV of pychopper stats.""" 24 | classified_path = os.path.join(output, "pychopper_stats.csv") 25 | df = pd.read_csv(tsv, sep="\t", index_col="Name") 26 | classified = df.loc[df["Category"] == "Classification"]\ 27 | .copy().reset_index().rename(columns={'Name': 'Classification'}) 28 | classified["Percentage"] = \ 29 | 100 * classified["Value"] / classified["Value"].sum() 30 | tuning = df.loc[df["Category"] == "AutotuneSample"]\ 31 | .copy().reset_index().rename(columns={'Name': 'Filter'}) 32 | tuning.to_csv(classified_path) 33 | 34 | 35 | def main(args): 36 | """Run entry point.""" 37 | assert os.path.isfile(args.data) 38 | assert os.path.isdir(args.output_dir) 39 | generate_pychopper_stats(tsv=args.data, output=args.output_dir) 40 | -------------------------------------------------------------------------------- /bin/workflow_glue/merge_count_tsvs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Merge salmon output count files.""" 3 | 4 | from functools import reduce 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from .util import wf_parser # noqa: ABS101 10 | 11 | 12 | def argparser(): 13 | """Argument parser for entrypoint.""" 14 | parser = wf_parser("merge_count_tsvs") 15 | parser.add_argument( 16 | '-j', 
metavar='join', help="Join type (outer).", default="outer") 17 | parser.add_argument( 18 | '-f', metavar='field', 19 | help="Join on this field (Reference).", default="Reference") 20 | parser.add_argument( 21 | '-o', metavar='out_tsv', 22 | help="Output tsv (merge_tsvs.tsv).", default="merge_tsvs.tsv") 23 | parser.add_argument( 24 | '-z', action="store_true", 25 | help="Fill NA values with zero.", default=False) 26 | parser.add_argument( 27 | '-tpm', type=bool, default=False, 28 | help="TPM instead of counts") 29 | parser.add_argument( 30 | '-tsvs', metavar='input_tsvs', nargs='*', 31 | help="Input tab separated files.") 32 | 33 | return parser 34 | 35 | 36 | def main(args): 37 | """Run entry point.""" 38 | dfs = {x: pd.read_csv(x, sep="\t") for x in args.tsvs} 39 | 40 | ndfs = [] 41 | for x, df in dfs.items(): 42 | # Transform counts to integers: 43 | if args.tpm: 44 | df = df.rename(columns={'TPM': 'Count', 'Name': 'Reference'}) 45 | else: 46 | df = df.rename(columns={'NumReads': 'Count', 'Name': 'Reference'}) 47 | df.Count = np.array(df.Count, dtype=int) 48 | # Take only non-zero counts: 49 | df = df[df.Count > 0] 50 | df = df[["Reference", "Count"]] 51 | df = df.sort_values(by=["Count"], ascending=False) 52 | name = x.split('.')[0] 53 | df = df.rename(columns={'Count': name}) 54 | ndfs.append(df) 55 | dfs = ndfs 56 | 57 | df_merged = reduce(lambda left, right: pd.merge( 58 | left, right, on=args.f, how=args.j), dfs) 59 | if args.z: 60 | df_merged = df_merged.fillna(0) 61 | df_merged = df_merged.sort_index(axis=1) 62 | df_merged = df_merged.sort_index(axis=0) 63 | 64 | df_merged.to_csv(args.o, sep="\t", index=False) 65 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of scripts for results models.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/common.py: -------------------------------------------------------------------------------- 1 | """Common model classes used across all workflows.""" 2 | from dataclasses import asdict, dataclass, field 3 | from decimal import Decimal 4 | from enum import Enum 5 | import json 6 | from typing import Any, Dict, List 7 | 8 | 9 | class SampleType(str, Enum): 10 | """The type of the sample.""" 11 | 12 | no_template_control = "no_template_control" 13 | positive_control = "positive_control" 14 | negative_control = "negative_control" 15 | test_sample = "test_sample" 16 | 17 | def friendly_name(self): 18 | """Convert sample type to string.""" 19 | return self.name.replace("_", " ").capitalize() 20 | 21 | 22 | @dataclass 23 | class SampleIdentifier: 24 | """Additional identifiers for a sample.""" 25 | 26 | name: str = field( 27 | metadata={ 28 | "title": "Identifier name", 29 | "Description": "The name of the sample identifier"}) 30 | value: str = field( 31 | metadata={ 32 | "title": "Identifier value", 33 | "Description": "The value of the sample identifier"}) 34 | 35 | 36 | @dataclass 37 | class CheckResult: 38 | """ 39 | A result of some check the workflow has performed. 40 | 41 | This can be at sample or workflow level. 
42 | """ 43 | 44 | check_category: str = field( 45 | metadata={ 46 | "title": "Check category", 47 | "description": "The category of the check"}) 48 | check_name: str = field( 49 | metadata={ 50 | "title": "Check name", 51 | "description": "The name of the check"}) 52 | check_pass: bool = field( 53 | metadata={ 54 | "title": "Check pass", 55 | "description": "If true the check has passed"}) 56 | check_threshold: str | None = field( 57 | default=None, metadata={ 58 | "title": "Check threshold", 59 | "description": "The threshold for the check, useful for reporting later"}) 60 | 61 | categories = {} 62 | 63 | def friendly_check_category(self): 64 | """Convert category to string.""" 65 | if self.check_category not in self.categories: 66 | raise ValueError(f"{self.check_category} has no friendly name") 67 | return self.categories[self.check_category] 68 | 69 | def friendly_check_name(self): 70 | """Convert check name to string.""" 71 | return self.check_name.replace("_", " ").capitalize() 72 | 73 | 74 | @dataclass 75 | class ResultsContents: 76 | """Placeholder class for results contents.""" 77 | 78 | pass 79 | 80 | 81 | @dataclass 82 | class Sample: 83 | """A sample sheet entry and its corresponding checks and related results.""" 84 | 85 | alias: str = field( 86 | metadata={ 87 | "title": "Sample alias", 88 | "description": "The alias for the sample given by the user"}) 89 | barcode: str = field( 90 | metadata={ 91 | "title": "Sample barcode", 92 | "description": "The physical barcode assigned to the sample"}) 93 | sample_type: SampleType = field( 94 | metadata={ 95 | "title": "Sample type", 96 | "description": "The type of the sample"}) 97 | sample_pass: bool = field( 98 | metadata={ 99 | "title": "Sample pass", 100 | "description": "If true the sample has passed workflow checks"}) 101 | additional_identifiers: List[SampleIdentifier] = field( 102 | default_factory=list, metadata={ 103 | "title": "Additional sample identifiers", 104 | "description": "Addition identifiers for the sample"}) 105 | sample_checks: list[CheckResult] = field( 106 | default_factory=list, metadata={ 107 | "title": "Sample checks", 108 | "description": "An array of checks performed on the sample"}) 109 | results: ResultsContents | None = field( 110 | default=None, metadata={ 111 | "title": "Sample results", 112 | "description": "Further specific workflow results for this sample"}) 113 | config: Dict[str, Any] | None = field( 114 | default=None, metadata={ 115 | "title": "Sample configuration", 116 | "description": """Sample specific config parameters 117 | used for running analysis"""}) 118 | 119 | def __post_init__(self): 120 | """Determine overall status for a sample given the individual check results.""" 121 | self.sample_pass = all( 122 | check.check_pass for check in self.sample_checks) 123 | 124 | def get_sample_identifier(self, sample_identifier): 125 | """Get a sample identifier given the identifier name.""" 126 | for indentifier in self.additional_identifiers: 127 | if indentifier.name == sample_identifier: 128 | return indentifier.value 129 | raise KeyError("Sample identifier not found") 130 | 131 | def set_sample_identifier(self, name, value): 132 | """Set a sample identifier.""" 133 | sample_identifier = SampleIdentifier( 134 | name=name, 135 | value=value) 136 | self.additional_identifiers.append(sample_identifier) 137 | return self.additional_identifiers 138 | 139 | def to_json(self, filename): 140 | """Save class as JSON.""" 141 | with open(filename, 'w') as f: 142 | json.dump(asdict(self), f, 
default=str, indent=2, cls=DecimalEncoder) 143 | 144 | 145 | @dataclass 146 | class RunStats: 147 | """Basic run statistics for the entire run.""" 148 | 149 | total_reads: int | None = field( 150 | default=None, metadata={ 151 | "title": "Total reads", 152 | "description": "Total number of reads on run"}) 153 | total_ambiguous_reads: int | None = field( 154 | default=None, metadata={ 155 | "title": "Total ambiguous reads", 156 | "description": "Number of reads of unknown provenance"}) 157 | total_unaligned_reads: int | None = field( 158 | default=None, metadata={ 159 | "title": "Total unaligned reads", 160 | "description": "Number of unaligned reads"}) 161 | 162 | 163 | @dataclass 164 | class WorkflowResult(): 165 | """ 166 | Definition for results that will be returned by this workflow. 167 | 168 | This structure will be passed through by Gizmo speaking clients 169 | as WorkflowInstance.results. 170 | """ 171 | 172 | samples: list[Sample] = field( 173 | metadata={ 174 | "title": "Samples", 175 | "description": "Samples in this workflow instance"}) 176 | workflow_pass: bool | None = field( 177 | default=None, metadata={ 178 | "title": "Workflow pass", 179 | "description": "True if this workflow instance passes all checks"}) 180 | workflow_checks: list[CheckResult] = field( 181 | default_factory=list, metadata={ 182 | "title": "Workflow checks", 183 | "description": "An array of checks performed on the workflow instance"}) 184 | run_stats: RunStats | None = field( 185 | default=None, metadata={ 186 | "title": "Samples", 187 | "description": "Basic run statistics"}) 188 | client_fields: dict[str, Any] | None = field( 189 | default_factory=dict, metadata={ 190 | "title": "Client fields", 191 | "description": "Arbitrary key-value pairs provided by the client"}) 192 | 193 | def to_json(self, filename): 194 | """Save class as JSON.""" 195 | with open(filename, 'w') as f: 196 | json.dump(asdict(self), f, default=str, indent=2, cls=DecimalEncoder) 197 | 198 | 199 | class DecimalEncoder(json.JSONEncoder): 200 | """This should probably be moved.""" 201 | 202 | def default(self, obj): 203 | """Override the default method to handle Decimal objects.""" 204 | if isinstance(obj, Decimal): 205 | return float(obj) 206 | return super().default(obj) 207 | -------------------------------------------------------------------------------- /bin/workflow_glue/parse_gffcompare.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Make report tables and data for plotting.""" 3 | import os 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def argparser(): 12 | """Argument parser for entrypoint.""" 13 | parser = wf_parser("Parse gffcompare") 14 | parser.add_argument( 15 | '--sample_id', help="Sample ID", required=True) 16 | parser.add_argument( 17 | '--gffcompare_dir', 18 | help="The gffcompare output directory", 19 | required=False, 20 | type=Path) 21 | parser.add_argument( 22 | '--isoform_table_out', 23 | help="Output path for per-isoform table", 24 | type=Path) 25 | parser.add_argument( 26 | '--tracking', 27 | help="gffcompare tracking file", 28 | type=Path) 29 | parser.add_argument( 30 | "--annotation", 31 | required=False, 32 | default=None, help="Reference annotation GFF file") 33 | 34 | return parser 35 | 36 | 37 | def _parse_stat_line(sl): 38 | """Parse a stats line.""" 39 | res = {} 40 | tmp = sl.split(':')[1].split('|') 41 | 
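    # A stats line looks roughly like "Base level:   89.5   |   95.2   |",
    # so after splitting on ':' and then '|' the first field is sensitivity
    # and the second is precision.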
res['sensitivity'] = float(tmp[0].strip()) 42 | res['precision'] = float(tmp[1].strip()) 43 | return res 44 | 45 | 46 | def _parse_matching_line(line): 47 | """Parse a matching line.""" 48 | tmp = line.split(':')[1].strip() 49 | return int(tmp) 50 | 51 | 52 | def _parse_mn_line(line): 53 | """Parse a miss or novel line.""" 54 | res = {} 55 | tmp = line.split(':')[1].strip() 56 | tmp = tmp.split('/') 57 | res['value'] = int(tmp[0]) 58 | tmp = tmp[1].split('(') 59 | res['value_total'] = int(tmp[0].strip()) 60 | res['percent'] = float(tmp[1].split('%)')[0]) 61 | return res 62 | 63 | 64 | def _parse_total_line(line): 65 | """Parse a total line.""" 66 | res = {} 67 | tmp = line.split(':')[1].strip() 68 | tmp = tmp.split('in') 69 | res['transcripts'] = int(tmp[0].strip()) 70 | tmp = tmp[1].split('loci') 71 | res['loci'] = int(tmp[0].strip()) 72 | tmp = int(tmp[1].split('(')[1].split(' ')[0]) 73 | res['me_transcripts'] = tmp 74 | return res 75 | 76 | 77 | def parse_gffcmp_stats(gffcompare_stats, sample_id, outpath): 78 | """Parse a gffcompare stats file. 79 | 80 | Gffcompare stats file 81 | :param gffcompare_stats: Path to the gffcompare stats file. 82 | :returns: Return as tuple of dataframes containing: 83 | perfromance statistics, match statistics, miss statistics, 84 | novel statistics, total statistics. 85 | :rtype: tuple 86 | """ 87 | performance = [] 88 | missed = [] 89 | novel = [] 90 | total = [] 91 | 92 | with open(gffcompare_stats, 'r') as fh: 93 | for line in fh: 94 | line = line.strip() 95 | if len(line) == 0: 96 | continue 97 | 98 | # Parse totals: 99 | if line.startswith('# Query mRNAs'): 100 | r = _parse_total_line(line) 101 | total.append([r['loci'], 'loci', 'query']) 102 | total.append([r['transcripts'], 'transcripts', 'query']) 103 | total.append([r['me_transcripts'], 'multexonic', 'query']) 104 | if line.startswith('# Reference mRNAs '): 105 | r = _parse_total_line(line) 106 | total.append([r['loci'], 'loci', 'reference']) 107 | total.append([r['transcripts'], 'transcripts', 'reference']) 108 | total.append([r['me_transcripts'], 'multexonic', 'reference']) 109 | 110 | # Parse basic statistics: 111 | if line.startswith('Base level'): 112 | st = _parse_stat_line(line) 113 | performance.append((st['sensitivity'], 'Sensitivity', 'Base')) 114 | performance.append((st['precision'], 'Precision', 'Base')) 115 | if line.startswith('Exon level'): 116 | st = _parse_stat_line(line) 117 | performance.append((st['sensitivity'], 'Sensitivity', 'Exon')) 118 | performance.append((st['precision'], 'Precision', 'Exon')) 119 | if line.startswith('Intron level'): 120 | st = _parse_stat_line(line) 121 | performance.append((st['sensitivity'], 'Sensitivity', 'Intron')) 122 | performance.append((st['precision'], 'Precision', 'Intron')) 123 | if line.startswith('Intron chain level'): 124 | st = _parse_stat_line(line) 125 | performance.append((st['sensitivity'], 'Sensitivity', 'Intron_chain')) 126 | performance.append((st['precision'], 'Precision', 'Intron_chain')) 127 | if line.startswith('Transcript level'): 128 | st = _parse_stat_line(line) 129 | performance.append((st['sensitivity'], 'Sensitivity', 'Transcript')) 130 | performance.append((st['precision'], 'Precision', 'Transcript')) 131 | if line.startswith('Locus level'): 132 | st = _parse_stat_line(line) 133 | performance.append((st['sensitivity'], 'Sensitivity', 'Locus')) 134 | performance.append((st['precision'], 'Precision', 'Locus')) 135 | 136 | # Parse missing statistics: 137 | if line.startswith('Missed exons'): 138 | r = 
_parse_mn_line(line) 139 | missed.append((r['value'], 'Missed', 'Exons')) 140 | missed.append((r['value_total'], 'total', 'Exons')) 141 | missed.append((r['percent'], 'Percent', 'Exons')) 142 | if line.startswith('Missed introns'): 143 | r = _parse_mn_line(line) 144 | missed.append((r['value'], 'Missed', 'Introns')) 145 | missed.append((r['value_total'], 'total', 'Introns')) 146 | missed.append((r['percent'], 'Percent', 'Introns')) 147 | if line.startswith('Missed loci'): 148 | r = _parse_mn_line(line) 149 | missed.append((r['value'], 'Missed', 'Loci')) 150 | missed.append((r['value_total'], 'total', 'Loci')) 151 | missed.append((r['percent'], 'Percent', 'Loci')) 152 | 153 | # Parse novel statistics: 154 | if line.startswith('Novel exons'): 155 | r = _parse_mn_line(line) 156 | novel.append((r['value'], 'Novel', 'Exons')) 157 | novel.append((r['value_total'], 'Total', 'Exons')) 158 | novel.append((r['percent'], 'Percent_novel', 'Exons')) 159 | if line.startswith('Novel introns'): 160 | r = _parse_mn_line(line) 161 | novel.append((r['value'], 'Novel', 'Introns')) 162 | novel.append((r['value_total'], 'Total', 'Introns')) 163 | novel.append((r['percent'], 'Percent_novel', 'Introns')) 164 | if line.startswith('Novel loci'): 165 | r = _parse_mn_line(line) 166 | novel.append((r['value'], 'Novel', 'Loci')) 167 | novel.append((r['value_total'], 'Total', 'Loci')) 168 | novel.append((r['percent'], 'Percent_novel', 'Loci')) 169 | 170 | def write_records(records, fn): 171 | pd.DataFrame.from_records(records, columns=['counts', 'type', 'source']) \ 172 | .to_csv(outpath / fn, sep='\t') 173 | 174 | write_records(total, 'Totals.tsv') 175 | write_records(missed, 'Missed.tsv') 176 | write_records(performance, 'Performance.tsv') 177 | write_records(novel, 'Novel.tsv') 178 | 179 | 180 | def tracking_summary(tracking_file, output_dir, annotations=None): 181 | """Write per transcript class gffcompare tracking files.""" 182 | tracking_headings = [ 183 | "query_transfrag_id", "query_locus_id", "ref_gene_id", 184 | "class", "details"] 185 | nice_names = { 186 | '=': 'complete', 'c': 'contained', 'k': 'containment', 187 | 'm': 'retained', 'n': 'retained (partial)', 'j': 'multi', 188 | 'e': 'single', 'o': 'overlap', 's': 'opposite', 189 | 'x': 'exonic', 'i': 'intron', 'y': 'contains', 'p': 'runon', 190 | 'r': 'repeat', 'u': 'unknown'} 191 | 192 | if os.path.exists(annotations): 193 | tracking = pd.read_csv( 194 | tracking_file, sep="\t", names=tracking_headings[1:], 195 | index_col=0) 196 | 197 | df = ( 198 | pd.DataFrame(tracking['class'].value_counts()) 199 | .reset_index() 200 | .rename(columns={'index': 'class', 'class': 'Count'}) 201 | ) 202 | 203 | df['Percent'] = round(df['Count'] * 100 / df['Count'].sum(), 2) 204 | df['description'] = [nice_names[x] for x in df['class']] 205 | 206 | df = df.sort_values('Count', ascending=True) 207 | df.to_csv(output_dir / 'tracking_summary.tsv', sep='\t') 208 | 209 | else: 210 | logger = get_named_logger('trackingSum') 211 | logger.info("Skipping classification summary as no annotation provided.") 212 | 213 | 214 | def make_isoform_table(gffcompare_dir, sample_id, outpath): 215 | """Make an isoform summary table.""" 216 | try: 217 | tmap_file = next(gffcompare_dir.glob('*.tmap')) 218 | except StopIteration: 219 | raise ValueError("Cannot find .tmap file in {}".format(gffcompare_dir)) 220 | dtypes = { 221 | 'ref_gene_id': str, 222 | 'ref_id': str, 223 | 'class_code': str, 224 | 'qry_id': str, 225 | 'num_exons': np.uint16, 226 | 'cov': np.uint32, 227 | 'len': np.uint32 228 | 
} 229 | df = pd.read_csv( 230 | tmap_file, sep='\t+', 231 | index_col=None, 232 | usecols=list(dtypes.keys()), 233 | dtype=dtypes) 234 | 235 | if df.empty: # No transcripts. Write a header only result file 236 | df = pd.DataFrame( 237 | columns=list(dtypes.keys()) + ['sample_id', 'parent gene iso num']) 238 | df.to_csv(f'{sample_id}_transcripts_table.tsv', sep='\t', index=False) 239 | else: 240 | df = df.assign(sample_id=sample_id) 241 | 242 | # Make a column of number of isoforms in parent gene 243 | gb = df.groupby(['ref_gene_id']).count() 244 | gb.rename(columns={'ref_id': 'num_isoforms'}, inplace=True) 245 | 246 | df['parent gene iso num'] = df.apply( 247 | lambda x: gb.loc[(x.ref_gene_id), 'num_isoforms'], axis=1) 248 | 249 | # Unclassified transcripts should not be lumped together 250 | df.loc[df.class_code == 'u', 'parent gene iso num'] = None 251 | 252 | df.to_csv(outpath, sep='\t', index=False) 253 | 254 | 255 | def main(args): 256 | """Entry point.""" 257 | if args.gffcompare_dir: # TODO: should this every be optional? 258 | stats = args.gffcompare_dir / 'str_merged.stats' 259 | parse_gffcmp_stats(stats, args.sample_id, args.gffcompare_dir) 260 | make_isoform_table(args.gffcompare_dir, args.sample_id, args.isoform_table_out) 261 | tracking_summary( 262 | args.tracking, args.gffcompare_dir, args.annotation) 263 | -------------------------------------------------------------------------------- /bin/workflow_glue/summarise_gff.py: -------------------------------------------------------------------------------- 1 | """Get summary statistics from GFF file.""" 2 | from collections import Counter 3 | from pathlib import Path 4 | import pickle 5 | 6 | import gffutils 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Argument parser for entrypoint.""" 12 | parser = wf_parser("summ_gff") 13 | parser.add_argument( 14 | "gff", 15 | help="Report output file", 16 | type=Path) 17 | parser.add_argument( 18 | "sample_id", 19 | help="Output TSV file path") 20 | parser.add_argument( 21 | "out", 22 | default="gff_summary.tsv", 23 | help="Output TSV file path", 24 | type=Path) 25 | 26 | return parser 27 | 28 | 29 | def main(args): 30 | """Entry point.""" 31 | db = gffutils.create_db( 32 | str(args.gff), dbfn=':memory:', force=True, keep_order=True, 33 | merge_strategy='merge', sort_attribute_values=True 34 | ) 35 | 36 | num_transcripts = db.count_features_of_type('transcript') 37 | num_genes = db.count_features_of_type('gene') 38 | 39 | transcript_lens = [] 40 | exons_per_transcript = Counter() 41 | isoforms_per_gene = Counter() 42 | 43 | for gene in db.features_of_type('gene'): 44 | 45 | n_isos = len(list(db.children(gene, featuretype='transcript'))) 46 | isoforms_per_gene[n_isos] += 1 47 | 48 | for transcript in db.children( 49 | gene, featuretype='transcript', order_by='start'): 50 | tr_len = 0 51 | exons = list(db.children(transcript, featuretype='exon')) 52 | if len(exons) == 0: 53 | continue 54 | exons_per_transcript[len(exons)] += 1 55 | for ex in exons: 56 | tr_len += abs(ex.end - ex.start) 57 | 58 | transcript_lens.append(tr_len) 59 | 60 | results = { 61 | 'sample_id': args.sample_id, 62 | 'summaries': { 63 | 'Total genes': [num_genes], 64 | 'Total transcripts': [num_transcripts], 65 | 'Max trans. len': max(transcript_lens), 66 | 'Min trans. 
len': min(transcript_lens) 67 | }, 68 | 'transcript_lengths': transcript_lens, 69 | 'exons_per_transcript': exons_per_transcript, 70 | 'isoforms_per_gene': isoforms_per_gene 71 | } 72 | 73 | with open(args.out, 'wb') as fh: 74 | pickle.dump(results, fh) 75 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Pytests argument definitions.""" 3 | 4 | 5 | def pytest_addoption(parser): 6 | """Define command line arguments for pytest.""" 7 | parser.addoption( 8 | "--test_data", 9 | action="store", 10 | default="/host/test_data" 11 | ) 12 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_check_sample_sheet_condition.py: -------------------------------------------------------------------------------- 1 | """Test check_sample_sheet.py.""" 2 | import os 3 | 4 | import pytest 5 | from workflow_glue import check_sample_sheet_condition 6 | 7 | 8 | # define a list of error messages 9 | ERROR_MESSAGES = [ 10 | ("sample_sheet_1.csv", "There must be only two unique conditions in the condition column of the sample sheet."), # noqa: E501 11 | ("sample_sheet_2.csv", "Sample sheet has no condition column which is required for the differential expression subworkflow."), # noqa: E501 12 | ("sample_sheet_3.csv", "There must be at least 2 repeats for each condition indicated in the sample sheet."), # noqa: E501 13 | ("sample_sheet_4.csv", "One of the condition types must be control, to indicate which samples to use as the reference."), # noqa: E501 14 | ] 15 | 16 | 17 | @pytest.fixture 18 | def test_data(request): 19 | """Define data location fixture.""" 20 | return os.path.join( 21 | request.config.getoption("--test_data"), 22 | "workflow_glue", 23 | "check_sample_sheet_condition") 24 | 25 | 26 | @pytest.mark.parametrize("sample_sheet_name,error_msg", ERROR_MESSAGES) 27 | def test_check_sample_sheet( 28 | test_data, sample_sheet_name, error_msg): 29 | """Test the sample sheets.""" 30 | expected_error_message = error_msg 31 | sample_sheet_path = f"{test_data}/{sample_sheet_name}" 32 | args = check_sample_sheet_condition.argparser().parse_args( 33 | [sample_sheet_path] 34 | ) 35 | try: 36 | check_sample_sheet_condition.main(args) 37 | except SystemExit as e: 38 | assert str(e) == expected_error_message 39 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_de_plots.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | from pathlib import Path 3 | 4 | import pytest 5 | from workflow_glue.de_plots import get_translations 6 | 7 | 8 | @pytest.fixture 9 | def test_data(request): 10 | """Define data location fixture.""" 11 | return Path(request.config.getoption("--test_data")) / "workflow_glue" 12 | 13 | 14 | @pytest.mark.parametrize( 15 | 'annotation_file,expected', 16 | [ 17 | [ 18 | 'MSTRG.11088.gtf', 19 | dict(gid_to_gene_name={ 20 | 'ENSG00000236051.7': 'MYCBP2-AS1', 21 | 'ENSG00000283208.2': 'ENSG00000283208', 22 | 'ENSG00000102805.16': 'CLN5', 23 | 'MSTRG.11088': 'MSTRG.11088' 24 | }, 25 | txid_to_gene_name={ 
26 | 'ENST00000636183.2': 'CLN5', 27 | 'ENST00000636780.2': 'CLN5', 28 | 'ENST00000638147.2': 'ENSG00000283208', 29 | 'ENST00000637192.1': 'ENSG00000283208', 30 | 'ENST00000636737.1': 'MYCBP2-AS1', 31 | 'ENST00000450627.6': 'MYCBP2-AS1', 32 | 'MSTRG.11088.2': 'MSTRG.11088' 33 | }, 34 | txid_to_gene_id={ 35 | 'ENST00000636183.2': 'ENSG00000102805.16', 36 | 'MSTRG.11088.2': 'MSTRG.11088', 37 | 'ENST00000636780.2': 'ENSG00000102805.16', 38 | 'ENST00000638147.2': 'ENSG00000283208.2', 39 | 'ENST00000637192.1': 'ENSG00000283208.2', 40 | 'ENST00000636737.1': 'ENSG00000236051.7', 41 | 'ENST00000450627.6': 'ENSG00000236051.7' 42 | }) 43 | 44 | ], 45 | # Small test to check that GFF3 works 46 | [ 47 | 'MSTRG.11088.gff3', 48 | dict(gid_to_gene_name={ 49 | "ENSG00000290825.1": "DDX11L2", 50 | "ENSG00000236397.3": "DDX11L2" 51 | }, 52 | txid_to_gene_name={ 53 | "ENST00000456328.2": "DDX11L2", 54 | "ENST00000437401.1": "DDX11L2" 55 | }, 56 | txid_to_gene_id={ 57 | 'ENST00000437401.1': 'ENSG00000236397.3', 58 | 'ENST00000456328.2': 'ENSG00000290825.1' 59 | }) 60 | ] 61 | ] 62 | ) 63 | def test_get_translations(test_data, annotation_file, expected): 64 | """Test that correct feature identifiers are extracted from the annotation. 65 | 66 | `stringtie --merge` can sometimes generate gene models that may span multiple 67 | reference genes. Possibly related issue: 68 | https://github.com/gpertea/stringtie/issues/217 69 | This can lead to the original genes and transcripts being assigned to that 70 | incorrectly-merged gene model. The test data contains such a gene model generated 71 | from `stringtie --merge` but actually consists of multiple different genes. 72 | 73 | 74 | """ 75 | input_gtf = test_data / annotation_file 76 | txid_to_gene_name, txid_to_gene_id, gid_to_gene_name = get_translations(input_gtf) 77 | 78 | assert expected['gid_to_gene_name'] == gid_to_gene_name 79 | assert expected['txid_to_gene_name'] == txid_to_gene_name 80 | assert expected['txid_to_gene_id'] == txid_to_gene_id 81 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 
26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 
43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
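    # variables; the script prints e.g. "HAS_VALID_INDEX=0" or "HAS_VALID_INDEX=1"
    # for the caller to parse.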
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | from ..util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | # Common variables 11 | REF_EXTENSIONS = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 12 | DATA_TYPES_LISTS = { 13 | "bam": ["bam"], 14 | "bam_idx": ["bam.bai"], 15 | "cram": ["cram"], 16 | "cram_idx": ["cram.crai"], 17 | "vcf": ["vcf", "vcf.gz"], 18 | "vcf_idx": ["vcf.gz.tbi", "vcf.gz.csi"], 19 | "bcf": ["bcf"], 20 | "bcf_idx": ["bcf.csi"], 21 | "gtf": ["gtf", "gtf.gz"], 22 | "gtf_idx": ["gtf.gz.tbi"], 23 | "gff": ["gff", "gff.gz", "gff3", "gff3.gz"], 24 | "gff_idx": ["gff.gz.tbi", "gff3.gz.tbi"], 25 | "bed": ["bed", "bed.gz"], 26 | "bed_idx": ["bed.gz.tbi"], 27 | "bedmethyl": ["bedmethyl", "bedmethyl.gz"], 28 | "bedmethyl_idx": ["bedmethyl.gz.tbi"], 29 | "ref": REF_EXTENSIONS, 30 | } 31 | DATA_TYPES = { 32 | ext: ftype for ftype, extlist in DATA_TYPES_LISTS.items() for ext in extlist 33 | } 34 | 35 | # Data by idx 36 | DATA_INDEXES_FMT = { 37 | fmt: f"{fmt}_idx" for fmt, dtype in DATA_TYPES.items() if "_idx" not in dtype 38 | } 39 | 40 | # Assign each format to its index 41 | INDEX_PAIRS = { 42 | "bam": ("bai",), 43 | "cram": ("crai",), 44 | "vcf": ("tbi", "csi"), 45 | "bcf": ("csi",), 46 | "bed": ("tbi",), 47 | "bedmethyl": ("tbi",), 48 | "gff": ("tbi",), 49 | "gtf": ("tbi",), 50 | } 51 | 52 | 53 | class TrackBuilder: 54 | """Class that builds an IGV track.""" 55 | 56 | def __init__(self): 57 | """Initialize properties for interval track.""" 58 | # Reference properties 59 | self.ref = None 60 | self.fai = None 61 | self.gzi = None 62 | # Samples info 63 | self.samples = {} 64 | # Track properties 65 | self.igv_json = {"reference": {}, "tracks": []} 66 | self.track_type = { 67 | "bam": "alignment", 68 | "cram": "alignment", 69 | "bcf": "variant", 70 | "vcf": "variant", 71 | "bedmethyl": "annotation", 72 | "bed": "annotation", 73 | "gtf": "annotation", 74 | "gff": "annotation", 75 | } 76 | # Here we save aliases of file formats that IGV.js 77 | # wants and that do not match the input file extension. 78 | self.igv_fmt_alias = {"gff": "gff3"} 79 | # lookup of extra options for each data type 80 | self.extra_opts_lookups = { 81 | "bam": {}, 82 | "cram": {}, 83 | "bcf": {}, 84 | "vcf": {}, 85 | "bed": {}, 86 | "bedmethyl": {}, 87 | "gtf": {}, 88 | "gff": {}, 89 | } 90 | 91 | def add_ref(self, ref=None): 92 | """Add reference file, unless already defined.""" 93 | if self.ref: 94 | raise Exception( 95 | f"Reference genome has already been set to {self.ref}.\n" 96 | "Only one reference FASTA file is expected." 
97 | ) 98 | else: 99 | self.ref = ref 100 | 101 | def add_ref_index(self, ref_index=None): 102 | """Add reference index if valid.""" 103 | basename = Path(self.ref).name 104 | idx_basename = Path(ref_index).name 105 | if idx_basename == f"{basename}.fai": 106 | self.fai = ref_index 107 | if idx_basename == f"{basename}.gzi" and basename.endswith(".gz"): 108 | self.gzi = ref_index 109 | 110 | def parse_fnames(self, fofn): 111 | """Parse list with filenames and return them grouped. 112 | 113 | :param fofn: File with list of file names (one per line) 114 | """ 115 | tmp_samples = {} 116 | with open(fofn, "r") as f: 117 | for line in f: 118 | # If the line contains the sample name, prepare the data structure 119 | if "," in line: 120 | sample, fname = line.strip().split(",") 121 | if sample not in tmp_samples: 122 | tmp_samples[sample] = SampleBundle(sample=sample) 123 | tmp_samples[sample].append(fname) 124 | else: 125 | # Otherwise, assign everything to NO_SAMPLE 126 | # Files will still be displayed, but in no specific order. 127 | fname = line.strip() 128 | if any(fname.endswith(ext) for ext in REF_EXTENSIONS): 129 | self.add_ref(ref=fname) 130 | elif fname.endswith(".fai") or fname.endswith(".gzi"): 131 | self.add_ref_index(ref_index=fname) 132 | else: 133 | if "NO_SAMPLE" not in tmp_samples.keys(): 134 | tmp_samples["NO_SAMPLE"] = SampleBundle(sample="NO_SAMPLE") 135 | tmp_samples["NO_SAMPLE"].append(fname) 136 | # Re-order samples in dict and add them to the list, leaving 137 | # NO_SAMPLE as last 138 | sorted_samples = ( 139 | sorted([sample for sample in tmp_samples.keys() if sample != 'NO_SAMPLE']) 140 | ) 141 | if 'NO_SAMPLE' in tmp_samples.keys(): 142 | sorted_samples += ['NO_SAMPLE'] 143 | for sample in sorted_samples: 144 | self.samples[sample] = tmp_samples[sample] 145 | 146 | def build_igv_json(self): 147 | """Ensure there is a reference genome.""" 148 | if not self.ref: 149 | raise ValueError( 150 | "No reference file (i.e. file ending in one of " 151 | f"{REF_EXTENSIONS} was found)." 152 | ) 153 | # Evaluate that a bgzipped reference has the appropriate index. 154 | if self.ref.endswith(".gz") and not self.gzi: 155 | raise ValueError(f"GZI reference index for {self.ref} not found.") 156 | 157 | # Create the base track if there is a reference genome. 158 | self.igv_json["reference"] = { 159 | "id": "ref", 160 | "name": "ref", 161 | "wholeGenomeView": False, 162 | "fastaURL": self.ref, 163 | } 164 | if self.fai: 165 | self.igv_json["reference"]["indexURL"] = self.fai 166 | if self.gzi: 167 | self.igv_json["reference"]["compressedIndexURL"] = self.gzi 168 | 169 | # Add samples data now 170 | for sample, bundle in self.samples.items(): 171 | bundle.process_data() 172 | # Add the bundled data to the tracks 173 | for fname, index, file_fmt in bundle.data_bundles: 174 | self.add_track( 175 | fname, 176 | file_fmt, 177 | sample_name=sample if sample != "NO_SAMPLE" else None, 178 | index=index, 179 | extra_opts=self.extra_opts_lookups[file_fmt], 180 | ) 181 | 182 | def add_track(self, infile, file_fmt, sample_name=None, index=None, extra_opts={}): 183 | """Add a track to an IGV json. 184 | 185 | This function takes an input file, an optional index file, its 186 | file format and additional extra options for the track. 
187 | 188 | :param infile: input file to create a track for 189 | :param file_fmt: input file track type 190 | :param sample_name: Name of the sample to display in the track name 191 | :param index: index for the input file 192 | :param extra_opts: dict of extra options for the track 193 | :return: dict with track options 194 | """ 195 | # Define track name depending on whether the sample ID is provided 196 | track_name = Path(infile).name 197 | if sample_name: 198 | track_name = f"{sample_name}: {Path(infile).name}" 199 | track_dict = { 200 | "name": track_name, 201 | "type": self.track_type[file_fmt], 202 | "format": self.igv_fmt_alias.get(file_fmt, file_fmt), 203 | "url": infile, 204 | } 205 | # add the index, if present 206 | if index: 207 | track_dict["indexURL"] = index 208 | track_dict.update(extra_opts) 209 | self.igv_json["tracks"] += [track_dict] 210 | 211 | def add_locus(self, locus): 212 | """Add target locus to the json.""" 213 | self.igv_json["locus"] = locus 214 | 215 | def add_extra_opts( 216 | self, 217 | extra_alignment_opts=None, 218 | extra_variant_opts=None, 219 | extra_interval_opts=None, 220 | ): 221 | """Import extra options from json files.""" 222 | if extra_alignment_opts is not None: 223 | with open(extra_alignment_opts, "r") as f: 224 | extra_alignment_opts_json = json.load(f) 225 | for ftype in ["bam", "cram"]: 226 | self.extra_opts_lookups[ftype] = extra_alignment_opts_json 227 | if extra_variant_opts is not None: 228 | with open(extra_variant_opts, "r") as f: 229 | extra_variant_opts_json = json.load(f) 230 | for ftype in ["vcf", "bcf"]: 231 | self.extra_opts_lookups[ftype] = extra_variant_opts_json 232 | if extra_interval_opts is not None: 233 | with open(extra_interval_opts, "r") as f: 234 | extra_interval_opts_json = json.load(f) 235 | for ftype in ["bed", "bedmethyl", "gff", "gtf"]: 236 | self.extra_opts_lookups[ftype] = extra_interval_opts_json 237 | 238 | 239 | class SampleBundle: 240 | """Sample data class. 241 | 242 | This class stores the data for multiple tracks for a 243 | single sample, then is used to generate a collection of 244 | IGV.js tracks. 
245 | """ 246 | 247 | def __init__(self, sample): 248 | """Initialize properties for a sample.""" 249 | self.sample = sample 250 | self.infiles = [] 251 | self.data_bundles = [] 252 | 253 | def append(self, fname): 254 | """Add a new raw file to the bundle.""" 255 | self.infiles.append(fname) 256 | 257 | def process_data(self): 258 | """Process input files.""" 259 | fbasenames = [Path(fname).name for fname in self.infiles] 260 | ftypes = [self.classify_files(bname) for bname in fbasenames] 261 | self.data_bundles = self.pair_file_with_index(self.infiles, fbasenames, ftypes) 262 | 263 | @staticmethod 264 | def classify_files(fname): 265 | """Classify inputs.""" 266 | for extension, ftype in DATA_TYPES.items(): 267 | if fname.endswith(f".{extension}"): 268 | return ftype 269 | 270 | @staticmethod 271 | def pair_file_with_index(infiles, fbasenames, ftypes): 272 | """Clump files with their indexes.""" 273 | # Collect data by group type 274 | groups = {ftype: {"basenames": [], "paths": []} for ftype in set(ftypes)} 275 | # Group each file by its type and base name 276 | for ftype, fbasename, fname in zip(ftypes, fbasenames, infiles): 277 | groups[ftype]["basenames"] += [fbasename] 278 | groups[ftype]["paths"] += [fname] 279 | 280 | # Output bundles 281 | outputs = [] 282 | # Start matching the variant files 283 | for ftype, itype in DATA_INDEXES_FMT.items(): 284 | # Ignore file formats that are not present in the bundle. 285 | if ftype not in groups: 286 | continue 287 | # Make pairs of files. 288 | for fbasename, fpath in zip( 289 | groups[ftype]["basenames"], groups[ftype]["paths"] 290 | ): 291 | # Construct potential index file names based on basename of input files 292 | idx_basenames = set( 293 | [f"{fbasename}.{idx}" for idx in INDEX_PAIRS[ftype]] 294 | ) 295 | # Find which indexes are available 296 | if itype in groups.keys(): 297 | idx_basenames = list( 298 | idx_basenames.intersection(set(groups[itype]["basenames"])) 299 | ) 300 | # Get the first index (if there are more than one, 301 | # it doesn't matter) 302 | bname = idx_basenames[0] 303 | idx_fn = groups[itype]["paths"][ 304 | groups[itype]["basenames"].index(bname) 305 | ] 306 | outputs.append([fpath, idx_fn, ftype]) 307 | # Otherwise, return only the simple file. 
308 | else: 309 | outputs.append([fpath, None, ftype]) 310 | return outputs 311 | 312 | 313 | def main(args): 314 | """Run the entry point.""" 315 | logger = get_named_logger("configIGV") 316 | 317 | # parse the FOFN 318 | igv_builder = TrackBuilder() 319 | 320 | # Add the additional track configurations 321 | igv_builder.add_extra_opts( 322 | extra_alignment_opts=args.extra_alignment_opts, 323 | extra_variant_opts=args.extra_variant_opts, 324 | extra_interval_opts=args.extra_interval_opts 325 | ) 326 | 327 | # Import files 328 | igv_builder.parse_fnames(args.fofn) 329 | 330 | # initialise the IGV options dict with the reference options 331 | igv_builder.build_igv_json() 332 | 333 | # Add locus information 334 | if args.locus is not None: 335 | igv_builder.add_locus(args.locus) 336 | 337 | json.dump(igv_builder.igv_json, sys.stdout, indent=4) 338 | 339 | logger.info("Printed IGV config JSON to STDOUT.") 340 | 341 | 342 | def argparser(): 343 | """Argument parser for entrypoint.""" 344 | parser = wf_parser("configure_igv") 345 | parser.add_argument( 346 | "--fofn", 347 | required=True, 348 | help=( 349 | "File with list of names of reference / XAM / VCF files and indices " 350 | "(one filename per line)" 351 | ), 352 | ) 353 | parser.add_argument( 354 | "--locus", 355 | help="Locus string to set initial genomic coordinates to display in IGV", 356 | ) 357 | parser.add_argument( 358 | "--extra-alignment-opts", 359 | help="JSON file with extra options for alignment tracks", 360 | ) 361 | parser.add_argument( 362 | "--extra-variant-opts", 363 | help="JSON file with extra options for variant tracks", 364 | ) 365 | parser.add_argument( 366 | "--extra_interval_opts", 367 | help="JSON file with extra options for interval tracks", 368 | ) 369 | return parser 370 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for 
entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. '2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/reheader_samstream.py: -------------------------------------------------------------------------------- 1 | """Reheader a SAM in a stream. 2 | 3 | When using the bam2fq -> minimap2 pattern for (re)aligning BAM data, we 4 | lose any existing RG and PG headers. This is particularly egregious when 5 | handling basecalled data as lines related to dorado basecalling settings 6 | as well as dorado RG headers are lost, orphaning RG tags in the reads. 7 | This is problematic for downstream analyses that would like to read the 8 | XAM header to intelligently determine how to handle the reads based on 9 | the basecaller model and basecaller configuration. 10 | 11 | This script handles: 12 | - Inserting RG, PG and CO lines from an existing XAM header into the 13 | header of the SAM emitted from minimap2's alignment stream 14 | - Inserting a PG header to indicate that a call to bam2fq was made 15 | - Updating the first streamed PG.PP parent tag with the last PG.ID 16 | of the existing XAM header to maintain a chain of custody 17 | - Updating any streamed PG.ID (and PG.PP) tags to avoid collisions 18 | with inserted PG.ID 19 | 20 | Handling collisions may seem like overkill but it is anticipated that 21 | this script will be called immediately after minimap2; any previous 22 | attempt to use minimap2 will lead to ambiguity. This would be the 23 | expected case where users have used wf-basecalling or wf-alignment to 24 | align a set of reads, only to realign them to another reference (e.g. 25 | via wf-human-variation). Arguably, we should remove older references to 26 | minimap2 as they will have been invalidated by the call to bam2fq but 27 | removing PG records and sticking the PG chain back together seems more 28 | fraught with annoying future bugs than simply resolving conflicts. 29 | 30 | This script will explode on a stream that contains: 31 | - PG lines in the original header where the last PG in the chain is 32 | ambiguous, or where the parent PP IDs are not injective 33 | - PG lines in the stream that do not appear in the order of their 34 | chain (that is if a PG.PP refers to a PG.ID that has not been 35 | encountered yet) 36 | 37 | SQ lines are retained after an HD line. That is to say, the most recent 38 | set of SQ lines observed after an HD will appear in the final output. 39 | SQ, RG, PG and CO lines are emitted as a group together, with elements 40 | written out in the order observed. 41 | 42 | PG lines are naively appended to the last PG element in the chain. No 43 | attempt is made to keep multiple program chains intact as this can lead 44 | to bloated headers. Broken PG metadata is a known problem (see 45 | samtools/hts-specs#275) but one that is preferable to headers that 46 | become unmanageably large to process: there IS an upper limit to a SAM 47 | header's size after all. 48 | 49 | This script takes advantage of minimap2's SAM output to immediately 50 | reheader the stream before any downstream calls to other programs pollute 51 | the PG header.
This script is a little overkill but attempts to be robust 52 | with handling PG collisions and more obviously encapsulates reheadering 53 | behaviour, and leaves some room to do more clever things as necessary. 54 | """ 55 | from shutil import copyfileobj 56 | import sys 57 | 58 | from ..util import wf_parser # noqa: ABS101 59 | 60 | 61 | class SamHeader: 62 | """An overkill container to manage merging PG lines in SAM headers. 63 | 64 | Collision handling is simple. If a PG.ID is duplicated by the stream 65 | then we add a suffix to its name and keep an eye out for the 66 | corresponding PG.PP later. We assume that headers emitted by the 67 | stream are chronological because this script should not be called as 68 | part of any complicated pipework other than immediately following 69 | minimap2. 70 | """ 71 | 72 | def __init__(self): 73 | """Initialise a collision aware PG container.""" 74 | self.remapped_pgids = {} 75 | self.collision_suffix = 0 76 | 77 | # Default HD, in case the new stream does not provide one 78 | self.hd = "@HD\tVN:1.6\tSO:unknown" 79 | 80 | # We'll merge RG, CO and PG 81 | self.rg_records = [] 82 | self.co_records = [] 83 | self.pg_records = [] 84 | 85 | # We keep the most recently observed block of SQ records by 86 | # resetting SQ on the first SQ seen after non-SQ. We cannot 87 | # rely on HD being emitted (as minimap2 does not do this!) 88 | self.sq_records = [] 89 | self.reset_sq = False 90 | 91 | self.observed_rgids = set() 92 | self.observed_pgids = set() 93 | self.last_pgid = None 94 | 95 | @staticmethod 96 | def str_to_record(line): 97 | """Return an appropriate struct for a given string record.""" 98 | try: 99 | record_type, record_data = line.strip().split('\t', 1) 100 | except ValueError: 101 | raise Exception(f"Record type could not be determined: {line}") 102 | 103 | if len(record_type) > 3: 104 | raise Exception(f"Record type malformed: {record_type}") 105 | 106 | record = {} 107 | if record_type in ["@HD", "@CO", "@SQ"]: 108 | return record_type, record_data 109 | elif record_type in ["@RG", "@PG"]: 110 | for field in record_data.strip().split('\t'): 111 | k, v = field.split(':', 1) 112 | if len(k) == 2 and k[0].isalpha() and k[1].isalnum(): 113 | record[k] = v 114 | else: 115 | raise Exception(f"{record_type} with invalid tag: '{k}'") 116 | if "ID" not in record: 117 | raise Exception(f"{record_type} with no ID: {record_data}") 118 | return record_type, record 119 | else: 120 | raise Exception(f"Unknown record type: {line}") 121 | 122 | @staticmethod 123 | def record_to_str(record_type, record_data): 124 | """Form a string from a header record.""" 125 | if record_type in ["@PG", "@RG"]: 126 | tags = [f"{k}:{v}" for k, v in record_data.items()] 127 | return f"{record_type}\t" + '\t'.join(tags) 128 | elif record_type in ["@SQ", "@CO"]: 129 | return f"{record_type}\t{record_data}" 130 | 131 | @staticmethod 132 | def resolve_pg_chain(pg_dicts): 133 | """Check links between PG.ID and PP.ID, exploding if inconsistent.""" 134 | links = {} 135 | # Document links between all ID and their PP parent 136 | pgids_without_ppid = 0 137 | for pgd in pg_dicts: 138 | pgid = pgd["ID"] 139 | pgpp = pgd.get("PP") 140 | links[pgid] = pgpp 141 | if pgpp is None: 142 | pgids_without_ppid += 1 143 | if len(links) > 0: 144 | # If there are links, exactly one should have a None parent 145 | # to indicate the first PG in the chain. Explode if we see 146 | # no head or multiple heads. 
147 | if pgids_without_ppid == 0: 148 | raise Exception("PG chain does not have a head.") 149 | elif pgids_without_ppid > 1: 150 | raise Exception("PG chain has multiple heads.") 151 | for source in links: 152 | head = source 153 | path = [head] 154 | while True: 155 | head = links[head] 156 | if head is None: 157 | break 158 | if head in path: 159 | path.append(head) 160 | raise Exception(f"PG chain appears to contain cycle: {path}") 161 | path.append(head) 162 | # This function is only really called to catch any explosions 163 | # but we'll return the links here as it is useful for testing 164 | return links 165 | 166 | def _bump_pg_collider(self): 167 | """Alter the collision suffix after determining a collision.""" 168 | self.collision_suffix += 1 169 | 170 | def _uncollide_pgid(self, pgid): 171 | """Return an uncollided string for a given PG ID.""" 172 | new_pgid = f"{pgid}-{self.collision_suffix}" 173 | self.remapped_pgids[pgid] = new_pgid 174 | self._bump_pg_collider() 175 | return new_pgid 176 | 177 | def add_line(self, line): 178 | """Add a header line to the header.""" 179 | record_type, record = self.str_to_record(line) 180 | 181 | if record_type == "@HD": 182 | self.hd = f"@HD\t{record}" 183 | elif record_type == "@CO": 184 | self.co_records.append(record) 185 | elif record_type == "@SQ": 186 | if self.reset_sq: 187 | self.sq_records = [] 188 | self.reset_sq = False 189 | self.sq_records.append(record) 190 | elif record_type == "@RG": 191 | rgid = record["ID"] 192 | if rgid not in self.observed_rgids: 193 | self.observed_rgids.add(rgid) 194 | self.rg_records.append(record) 195 | elif record not in self.rg_records: 196 | # if rgid has been seen before, abort if this record is different 197 | raise Exception( 198 | f"Duplicate RG with ID '{rgid}' conflicts with previously seen RG with same ID." 
# noqa:E501 199 | ) 200 | elif record_type == "@PG": 201 | pgid = record["ID"] 202 | if pgid in self.observed_pgids: 203 | # collision, rewrite the pgid 204 | pgid = self._uncollide_pgid(pgid) 205 | record["ID"] = pgid 206 | else: 207 | self.observed_pgids.add(pgid) 208 | 209 | # maintain chain 210 | ppid = record.get("PP") 211 | if not ppid: 212 | # record has no parent, this is either 213 | # - the first record (last_pgid is None) so is the tail 214 | # - an inserted record that needs its parent to be the current tail 215 | if not self.last_pgid: 216 | self.last_pgid = pgid 217 | else: 218 | record["PP"] = self.last_pgid 219 | self.last_pgid = pgid 220 | else: 221 | if ppid not in self.observed_pgids: 222 | raise Exception( 223 | f"Encountered PG.PP '{ppid}' before observing corresponding PG.ID" # noqa:E501 224 | ) 225 | # remap parent id (if needed) 226 | record["PP"] = self.remapped_pgids.get(ppid, ppid) 227 | # set tail to this record 228 | self.last_pgid = pgid 229 | 230 | self.pg_records.append(record) 231 | 232 | if len(self.sq_records) > 0 and record_type != '@SQ': 233 | self.reset_sq = True 234 | 235 | return record 236 | 237 | def write_header(self, fh): 238 | """Write this header to a file handle.""" 239 | self.resolve_pg_chain(self.pg_records) # check PG header 240 | fh.write(f"{self.hd}\n") 241 | for sq in self.sq_records: 242 | fh.write(self.record_to_str("@SQ", sq) + '\n') 243 | for rg in self.rg_records: 244 | fh.write(self.record_to_str("@RG", rg) + '\n') 245 | for pg in self.pg_records: 246 | fh.write(self.record_to_str("@PG", pg) + '\n') 247 | for co in self.co_records: 248 | fh.write(self.record_to_str("@CO", co) + '\n') 249 | 250 | 251 | def reheader_samstream(header_in, stream_in, stream_out, args): 252 | """Run reheader_samstream.""" 253 | # read original header into container 254 | sh = SamHeader() 255 | for line in header_in: 256 | sh.add_line(line) 257 | 258 | # append user provided lines to container 259 | for line in args.insert: 260 | sh.add_line(line) 261 | 262 | # read the header portion of the minimap2 stream 263 | wrote_header = False 264 | for line in stream_in: 265 | if line[0] != '@': 266 | # write out header on first alignment 267 | sh.write_header(stream_out) 268 | wrote_header = True 269 | # and actually write the first alignment 270 | stream_out.write(line) 271 | break 272 | sh.add_line(line) 273 | 274 | # Pass through the rest of the alignments. 275 | # I toyed with a few ways of doing this: 276 | # - A trivial iter over the input file was slow. presumably as we incurred some 277 | # overhead calling read() and write() and decoding more than other methods. 278 | # - os.read/write avoids dealing with higher level python read/write but requires 279 | # file descriptors which rules out non-file-like objects. this made testing more 280 | # annoying as StringIO does not have a file descriptor. we could have mocked fds 281 | # but i was not happy with the discrepancy between real and test execution. 282 | # - copyfileobj with the stream_in.buffer would also avoid some of the higher 283 | # level text handling but would require all tests to provide inputs that have 284 | # an underlying binary buffer. 
it was also not possible to seek the buffer to 285 | # the position of the text stream as we've used next() to iterate over the 286 | # header lines; fixing this would have required rewriting of the header 287 | # handling or keeping track of the position in the stream ourselves which 288 | # just seemed unnecessary overkill given how we expect this program to be used. 289 | # copyfileobj on the text streams is more efficient than merely iterating the file 290 | # and dumping the lines out and seems to do the job. this keeps the code and tests 291 | # simple with minimal additional cost to performance. i anticipate any overhead of 292 | # this program will be dwarfed by that of minimap2/samtools sort anyway. 293 | # increasing the buffer size gave worse performance in my limited testing so we 294 | # leave it as the default here. 295 | copyfileobj(stream_in, stream_out) 296 | 297 | # If there were no alignments, we won't have hit the != @ case in the first stdin, 298 | # and we won't have written the header out. Write a header if we haven't already. 299 | if not wrote_header: 300 | sh.write_header(stream_out) 301 | 302 | 303 | def argparser(): 304 | """Argument parser for entrypoint.""" 305 | parser = wf_parser("reheader_samstream") 306 | parser.add_argument("header_in") 307 | parser.add_argument("--insert", action="append", default=[]) 308 | return parser 309 | 310 | 311 | def main(args): 312 | """reheader_samstream default entry point.""" 313 | with open(args.header_in) as header_in: 314 | reheader_samstream(header_in, sys.stdin, sys.stdout, args) 315 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # input: 5 | # file some_data 6 | # file extra_data 7 | # script: 8 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 9 | # """ 10 | # command ${some_data} ${extra} 11 | # """ 12 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Transcriptome analysis of cDNA and direct RNA sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | This workflow can be used for the following: 2 | 3 | + Identification of RNA transcripts using either cDNA or direct RNA reads. 4 | + Reference-aided transcriptome assembly. 5 | + Annotation of assembled transcripts. 6 | + Differential gene expression analysis using a precomputed or assembled reference transcriptome. 7 | + Differential transcript usage analysis using a precomputed or assembled reference transcriptome. -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 16 4 | + Memory = 64GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 32GB 10 | 11 | Approximate run time: 15 minutes per sample, with 1 million reads and recommended resources.
12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on the command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://www.docker.com/products/docker-desktop) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository into the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-transcriptomes --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-transcriptomes 38 | ``` 39 | 40 | A demo dataset is provided for testing the workflow. 41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo.tar.gz 44 | tar -xzvf wf-transcriptomes-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-transcriptomes \ 49 | --de_analysis \ 50 | --direct_rna \ 51 | --fastq 'wf-transcriptomes-demo/differential_expression_fastq' \ 52 | --minimap2_index_opts '-k 15' \ 53 | --ref_annotation 'wf-transcriptomes-demo/gencode.v22.annotation.chr20.gtf' \ 54 | --ref_genome 'wf-transcriptomes-demo/hg38_chr20.fa' \ 55 | --sample_sheet 'wf-transcriptomes-demo/sample_sheet.csv' \ 56 | -profile standard 57 | ``` 58 | 59 | For further information about running a workflow on 60 | the command line see https://labs.epi2me.io/wfquickstart/ 61 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 2 | 3 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | This workflow accepts either FASTQ or BAM files as input.
3 | 4 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 5 | 6 | ``` 7 | (i) (ii) (iii) 8 | input_reads.fastq ─── input_directory ─── input_directory 9 | ├── reads0.fastq ├── barcode01 10 | └── reads1.fastq │ ├── reads0.fastq 11 | │ └── reads1.fastq 12 | ├── barcode02 13 | │ ├── reads0.fastq 14 | │ ├── reads1.fastq 15 | │ └── reads2.fastq 16 | └── barcode03 17 | └── reads0.fastq 18 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | transcriptome_source | string | Select how the transcriptome used for analysis should be prepared. | For differential expression analysis, use of an existing transcriptome may be preferred and so 'precomputed' should be selected. In this case the 'ref_transcriptome' parameter should be specified. To create a reference transcriptome using an existing reference genome, select 'reference guided' and specify the 'ref_genome' parameter. | reference-guided | 8 | | ref_genome | string | Path to reference genome sequence [.fa/.fq/.fa.gz/fq.gz]. Required for reference-based workflow. | A reference genome is required for reference-based assembly of a transcriptome. | | 9 | | ref_transcriptome | string | Transcriptome reference file. Required for precomputed transcriptome calculation and for differential expression analysis. | A reference transcriptome related to the sample under study. Must be supplied when the 'Transcriptome source' parameter has been set to 'precomputed' or to perform differential expression. | | 10 | | ref_annotation | string | A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz)). 
Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported. | This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers. Note: in de_analysis mode, transcript strands must be only + or -. | | 11 | | direct_rna | boolean | Set to true for direct RNA sequencing. | Omits the pychopper step. | False | 12 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 13 | 14 | 15 | ### Output Options 16 | 17 | | Nextflow parameter name | Type | Description | Help | Default | 18 | |--------------------------|------|-------------|------|---------| 19 | | out_dir | string | Directory for output of all user-facing files. | | output | 20 | | igv | boolean | Visualize outputs in the EPI2ME IGV visualizer. | Enabling this option will visualize the output alignment files in the EPI2ME Desktop App IGV visualizer. | False | 21 | 22 | 23 | ### Sample Options 24 | 25 | | Nextflow parameter name | Type | Description | Help | Default | 26 | |--------------------------|------|-------------|------|---------| 27 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. If you are running the differential expression workflow, there must be an additional column `condition` with two labels, one of which must be `control` (e.g. `control` and `treated`). Control will indicate which samples will be used as the reference. There should be at least 3 repeats for each condition. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. | | 28 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 29 | 30 | 31 | ### Options for reference-based workflow 32 | 33 | | Nextflow parameter name | Type | Description | Help | Default | 34 | |--------------------------|------|-------------|------|---------| 35 | | plot_gffcmp_stats | boolean | Create a PDF of plots showing gffcompare results. | If set to true, a PDF file containing detailed gffcompare results will be output. | True | 36 | | gffcompare_opts | string | Extra command-line options to give to gffcompare -r | For a list of possible options see [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml). | -R | 37 | | minimap2_index_opts | string | Extra command-line options for minimap2 indexing. | See [minimap2 index options](https://lh3.github.io/minimap2/minimap2.html#4) for more information. These will only be relevant in the reference-based transcriptome assembly. | -k 14 | 38 | | minimap2_opts | string | Additional command-line options for minimap2 alignment. | See [minimap2 options](https://lh3.github.io/minimap2/minimap2.html#5) for further information. These will only be relevant in the reference-based transcriptome assembly. | -uf | 39 | | minimum_mapping_quality | integer | Filter aligned reads by MAPQ quality. | Reads that do not meet this mapping quality after minimap2 alignment will be filtered out.
| 40 | 40 | stringtie_opts | string | Extra command-line options for stringtie transcript assembly. | For additional StringTie options see [here](https://github.com/gpertea/stringtie#stringtie-options). | --conservative | 41 | 42 | 43 | ### Differential Expression Options 44 | 45 | | Nextflow parameter name | Type | Description | Help | Default | 46 | |--------------------------|------|-------------|------|---------| 47 | | de_analysis | boolean | Run DE analysis. | Running this requires you to provide at least two replicates for a control and treated sample as well as a sample sheet param. | False | 48 | | min_gene_expr | integer | The minimum number of total mapped sequence reads required for a gene to be considered in differential transcript usage analysis. | Filtering at the gene level ensures that the observed transcript ratios are calculated with a minimum number of counts per gene. | 10 | 49 | | min_feature_expr | integer | The minimum number of reads assigned to a transcript for it to be considered in differential transcript usage analysis. | Filters out transcripts that do not reach this minimum number of reads, reducing noise. | 3 | 50 | | min_samps_gene_expr | integer | Set the minimum number of samples in which a gene is expressed to be included in the differential transcript usage analysis. | A gene must be expressed in at least this number of samples for the gene to be included in the differential transcript usage analysis. Filtering at the gene level improves the reliability of the observed transcript ratios. | 3 | 51 | | min_samps_feature_expr | integer | Set the minimum number of samples in which a transcript is expressed to be included in the differential transcript usage analysis. | A transcript must be expressed in at least this minimum number of samples to be included in the analysis. Should be equal to the number of replicates per sample you have. | 1 | 52 | 53 | 54 | ### Advanced Options 55 | 56 | | Nextflow parameter name | Type | Description | Help | Default | 57 | |--------------------------|------|-------------|------|---------| 58 | | threads | integer | Number of CPU threads. | Only provided to processes including alignment and assembly that benefit from multiple threads. | 4 | 59 | | cdna_kit | string | If cDNA reads are used, select the kit used. | This will be used by pychopper to preprocess the reads for downstream analysis. | SQK-PCS109 | 60 | | pychopper_backend | string | Pychopper can use one of two available backends for identifying primers in the raw reads. | 'edlib' is set by default due to its high performance. However, it may be less sensitive than 'phmm'. | edlib | 61 | | pychopper_opts | string | Extra pychopper options. | See available options [here](https://github.com/epi2me-labs/pychopper#usage). | | 62 | | bundle_min_reads | integer | Minimum size of BAM bundle for parallel processing. | | 50000 | 63 | | isoform_table_nrows | integer | Maximum rows to display in the isoform report table. | | 5000 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated, including information for all samples, or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}.
2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-transcriptomes-report.html | an HTML report document detailing the primary findings of the workflow | aggregated | 6 | | Per file read stats | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-file-stats.tsv | A TSV with per file read stats, including all samples. | aggregated | 7 | | Read stats | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-read-stats.tsv | A TSV with per read stats, including all samples. | aggregated | 8 | | Run IDs | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/run_ids | List of run IDs present in reads. | aggregated | 9 | | Meta map json | fastq_ingress_results/{{ alias }}//reads/metamap.json | Metadata used in workflow presented in a JSON. | aggregated | 10 | | Concatenated sequence data | fastq_ingress_results/{{ alias }}//reads/{{ alias }}.fastq.gz | Per sample reads concatenated into one FASTQ file. | per-sample | 11 | | Assembled transcriptome | {{ alias }}_transcriptome.fas | Per sample assembled transcriptome. Not output if a reference annotation was supplied. | per-sample | 12 | | Annotated assembled transcriptome | {{ alias }}_merged_transcriptome.fas | Per sample annotated assembled transcriptome. Only output if a reference annotation was supplied. | per-sample | 13 | | Alignment summary statistics | {{ alias }}_read_aln_stats.tsv | Per sample alignment summary statistics. | per-sample | 14 | | GFF compare results | {{ alias }}_gffcompare | All GFF compare output files. | per-sample | 15 | | Differential gene expression results | de_analysis/results_dge.tsv | This is a gene-level result file that describes genes and their probability of showing differential expression between experimental conditions. | aggregated | 16 | | Differential gene expression report | de_analysis/results_dge.pdf | Summary report of differential gene expression analysis as a PDF. | aggregated | 17 | | Differential transcript usage gene TSV | de_analysis/results_dtu_gene.tsv | This is a gene-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression. | aggregated | 18 | | Differential transcript usage report | de_analysis/results_dtu.pdf | Summary report of differential transcript usage results as a PDF. | aggregated | 19 | | Differential transcript usage TSV | de_analysis/results_dtu_transcript.tsv | This is a transcript-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression. | aggregated | 20 | | Differential transcript usage stageR TSV | de_analysis/results_dtu_stageR.tsv | This is the output from stageR and it shows both gene and transcript probabilities of differential expression. | aggregated | 21 | | Differential transcript usage DEXSeq TSV | de_analysis/results_dexseq.tsv | The complete output from the DEXSeq analysis, showing both gene and transcript probabilities of differential expression. | aggregated | 22 | | Gene counts | de_analysis/all_gene_counts.tsv | Raw gene counts created by the Salmon tool, before filtering. | aggregated | 23 | | Gene counts per million | de_analysis/cpm_gene_counts.tsv | This file shows counts per million (CPM) of the raw gene counts to facilitate comparisons across samples. | aggregated | 24 | | Transcript counts | de_analysis/unfiltered_transcript_counts_with_genes.tsv | Raw transcript counts created by the Salmon tool, before filtering.
Includes reference to the associated gene ID. | aggregated | 25 | | Transcript per million counts | de_analysis/unfiltered_tpm_transcript_counts.tsv | This file shows transcripts per million (TPM) of the raw counts to facilitate comparisons across samples. | aggregated | 26 | | Transcript counts filtered | de_analysis/filtered_transcript_counts_with_genes.tsv | Filtered transcript counts, used for differential transcript usage analysis. Includes a reference to the associated gene ID. | aggregated | 27 | | Transcript info table | {{ alias }}_transcripts_table.tsv | This file details each isoform that was reconstructed from the input reads. It contains a subset of columns from the .tmap output from [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml). | per-sample | 28 | | Final non-redundant transcriptome | de_analysis/final_non_redundant_transcriptome.fasta | Transcripts that were used for differential expression analysis, including novel transcripts, with the identifiers used for DE analysis. Only applicable when the ref_transcriptome parameter is not provided. | aggregated | 29 | | Index of reference FASTA file | igv_reference/{{ ref_genome_file }}.fai | Reference genome index of the FASTA file required for IGV config. | aggregated | 30 | | GZI index of the reference FASTA file | igv_reference/{{ ref_genome_file }}.gzi | GZI index of the reference FASTA file. | aggregated | 31 | | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reference. | aggregated | 32 | | BAM file (minimap2) | BAMS/{{ alias }}.reads_aln_sorted.bam | BAM file generated from mapping input reads to the reference. | per-sample | 33 | | BAM index file (minimap2) | BAMS/{{ alias }}.reads_aln_sort.bam.bai | Index file generated from mapping input reads to the reference. | per-sample | 34 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Concatenate input files and generate per read stats. 2 | The [fastcat](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per read stats including average read lengths and qualities. 3 | 4 | ### 2. Preprocess cDNA. 5 | If input sequences are cDNA, [Pychopper](https://github.com/epi2me-labs/pychopper) is used to orient, trim and rescue full-length cDNA reads and generate associated statistics. If the `direct_rna` parameter is selected, this step will be skipped. 6 | 7 | ### 3. Build transcriptome. 8 | If the `transcriptome_source` parameter is "reference-guided", a transcriptome will be built for each sample as outlined below. If the `transcriptome_source` is "precomputed" and the `ref_transcriptome` parameter is provided, the workflow will skip step 3. 9 | 10 | #### 3.1 Align reads to the reference genome. 11 | The reference genome will be indexed and the reads aligned to it using [Minimap2](https://github.com/lh3/minimap2). The output is sorted and converted to a BAM file using [Samtools](https://www.htslib.org/). Alignment stats are created from these using [Seqkit BAM](https://bioinf.shenwei.me/seqkit/usage/#bam). 12 | 13 | Additionally, the workflow will generate an IGV configuration file if `--igv` is selected. This file allows the user to view the aligned BAM in the EPI2ME Desktop Application in the Viewer tab.
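For orientation, the sketch below (illustrative only, not the workflow's own code) shows the general shape of the `igv.json` file used by the IGV panel: a reference block pointing at the genome FASTA and its index, plus one alignment track per sample BAM, mirroring the structure built by the `configure_igv.py` helper shown earlier in this repository. All file names and sample aliases used here are hypothetical placeholders.

```python
# Minimal, illustrative sketch of an IGV.js-style config of the kind the
# workflow writes out. File names and aliases are hypothetical placeholders.
import json

reference = {
    "id": "ref",
    "name": "ref",
    "wholeGenomeView": False,
    "fastaURL": "hg38_chr20.fa",      # reference genome FASTA (placeholder)
    "indexURL": "hg38_chr20.fa.fai",  # matching .fai index (placeholder)
}

tracks = []
for alias in ["sample01", "sample02"]:  # hypothetical sample aliases
    bam = f"BAMS/{alias}.reads_aln_sorted.bam"
    tracks.append({
        "name": f"{alias}: {alias}.reads_aln_sorted.bam",
        "type": "alignment",  # BAM tracks are rendered as alignment tracks
        "format": "bam",
        "url": bam,
        "indexURL": f"{bam}.bai",
    })

# Write the combined config; IGV loads the reference first, then the tracks.
with open("igv.json", "w") as fh:
    json.dump({"reference": reference, "tracks": tracks}, fh, indent=4)
```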
14 | 15 | #### 3.2 Chunk BAM 16 | The aligned BAMs are split into chunks using the bundle_min_reads parameter (default: 50000). 17 | 18 | #### 3.3 Assemble transcripts 19 | [StringTie](https://ccb.jhu.edu/software/stringtie/) is then used to assemble the transcripts using the aligned segments in the chunked BAM files. The assembled transcripts will be output as a [GFF file](https://www.ensembl.org/info/website/upload/gff3.html). If a `ref_annotation` file is provided, this will also be included in the GFF. 20 | 21 | #### 3.4 Merge Chunks 22 | Transcript GFF files from the chunks with the same sample aliases will then be merged. 23 | 24 | #### 3.5 Annotate 25 | [GffCompare](https://ccb.jhu.edu/software/stringtie/gffcompare.html) is then used to compare query and reference annotations, merging records where appropriate and then annotating them. This also creates estimates of accuracy of the GFF files, output in a stats file per sample. 26 | 27 | #### 3.6 Create transcriptomes 28 | [Gffread](https://github.com/gpertea/gffread) is used to create a transcriptome FASTA file from the final GFF as well as a merged transcriptome that includes annotations in the FASTA headers where available. 29 | 30 | ### 4. Differential expression analysis 31 | 32 | Differential gene expression (DGE) and differential transcript usage (DTU) analyses aim to identify genes and transcripts that show statistically altered expression patterns. 33 | 34 | Differential expression analysis requires at least 2 replicates of each sample to compare (but we recommend three). You can see an example sample_sheet.csv below. 35 | 36 | #### Sample sheet condition column 37 | The sample sheet should be a comma-separated values file (.csv) and include at least three columns named `barcode`, `alias` and `condition`. 38 | - Each `barcode` should refer to a directory of the same name in the input FASTQ directory (in the example below `barcode01` to `barcode06` reflect the `test_data` directory). 39 | - The `alias` column allows you to rename each barcode to an alias that will be used in the report and other output files. 40 | - The `condition` column will need to contain one of two keys to indicate the two groups being compared. `control` must be one of the keys, used to indicate which samples will be used as the reference in the differential expression analysis. 41 | 42 | e.g. sample_sheet.csv 43 | ``` 44 | barcode,alias,condition 45 | barcode01,sample01,control 46 | barcode02,sample02,control 47 | barcode03,sample03,control 48 | barcode04,sample04,treated 49 | barcode05,sample05,treated 50 | barcode06,sample06,treated 51 | ``` 52 | 53 | #### 4.1 Merge cross-sample transcriptomes 54 | If a `ref_transcriptome` is not provided, the transcriptomes created by the workflow will be used for DE analysis. To do this, the GFF outputs of GffCompare are merged using StringTie. A final non-redundant FASTA file of the transcripts is then created from the merged GFF file and the reference genome using seqkit. 55 | 56 | #### 4.2 Create a final non-redundant transcriptome 57 | The reads from all the samples will be aligned to the final non-redundant transcriptome using Minimap2 in a splice-aware manner. 58 | 59 | #### 4.3 Count genes and transcripts 60 | [Salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification, giving gene and transcript counts.
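To make "gene and transcript counts" concrete, the hedged sketch below (not the workflow's own merge script) shows how per-sample Salmon `quant.sf` outputs could be combined into a transcript-level count matrix and summed to gene level with a transcript-to-gene mapping. Sample names, file paths and the `t2g.tsv` mapping file are hypothetical; the `quant.sf` columns (`Name`, `Length`, `EffectiveLength`, `TPM`, `NumReads`) are Salmon's standard output.

```python
# Illustrative sketch only: merge hypothetical per-sample Salmon outputs into
# transcript- and gene-level count tables of the kind fed to edgeR/DRIMSeq.
import pandas as pd

samples = {
    "sample01": "salmon/sample01/quant.sf",  # hypothetical paths
    "sample02": "salmon/sample02/quant.sf",
}

# Read the NumReads column from each quant.sf, indexed by transcript name.
counts = pd.DataFrame({
    alias: pd.read_csv(path, sep="\t", index_col="Name")["NumReads"]
    for alias, path in samples.items()
})

# Hypothetical two-column transcript-to-gene table: transcript_id, gene_id.
t2g = pd.read_csv("t2g.tsv", sep="\t", index_col="transcript_id")["gene_id"]

# Round estimated read counts and aggregate transcripts to their genes.
transcript_counts = counts.round().astype(int)
gene_counts = transcript_counts.groupby(t2g).sum()

transcript_counts.to_csv("unfiltered_transcript_counts.tsv", sep="\t")
gene_counts.to_csv("all_gene_counts.tsv", sep="\t")
```

Conceptually, these tables correspond to the gene and unfiltered transcript count outputs listed in the outputs section, which then feed the edgeR and DRIMSeq/DEXSeq steps described below.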
61 | 62 | #### 4.4 edgeR-based differential expression analysis 63 | A statistical analysis is first performed using [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) to identify the subset of differentially expressed genes using the gene counts as input. A normalisation factor is calculated for each sequence library using the default TMM method (see [McCarthy et al. (2012)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3378882/) for further details). The defined experimental design is used to calculate estimates of dispersion for each of the gene features. Statistical tests are calculated using the contrasts defined in the experimental design. The differentially expressed genes are corrected for false discovery rate (FDR) using the method of Benjamini & Hochberg ([Benjamini and Hochberg (1995)](https://www.jstor.org/stable/2346101)). 64 | 65 | #### 4.5 Pre-filtering of quantitative data using DRIMSeq 66 | [DRIMSeq](https://bioconductor.org/packages/release/bioc/html/DRIMSeq.html) is used to filter the transcript count data from the Salmon analysis for differential transcript usage (DTU) analysis. The filter step will be used to select for genes and transcripts that satisfy rules for the number of samples in which a gene or transcript must be observed, and minimum threshold levels for the number of observed reads. The parameters used for filtering are `min_samps_gene_expr`, `min_samps_feature_expr`, `min_gene_expr`, and `min_feature_expr`. By default, any transcripts with zero expression or one transcript in all samples are filtered out at this stage. 67 | 68 | #### 4.6 Differential transcript usage using DEXSeq 69 | Differential transcript usage analysis is performed using the R [DEXSeq](https://bioconductor.org/packages/release/bioc/html/DEXSeq.html) package ([Anders et al. (2012)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3460195/)). Similar to the edgeR package, DEXSeq estimates the variance between the biological replicates and applies generalised linear models for the statistical testing. The key difference is that the DEXSeq method looks for differences at the exon count level. DEXSeq uses the filtered transcript count data prepared earlier in this analysis. 70 | 71 | #### 4.7 stageR stage-wise analysis of DGE and DTU 72 | The final component of this isoform analysis is a stage-wise statistical test using the R software package [stageR](https://bioconductor.org/packages/release/bioc/html/stageR.html) ([Van den Berge and Clement (2018)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1277-0)). stageR uses (1) the raw p-values for DTU from the DEXSeq analysis in the previous section and (2) a false-discovery corrected set of p-values from testing whether individual genes contain at least one exon showing DTU. A hierarchical two-stage statistical test then evaluates the set of genes for DTU. 73 | -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | + If the workflow fails, please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. 2 | + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
+ Renaming, moving or deleting the input BAM, reference genome or the output directory from the location provided at runtime will stop IGV in the EPI2ME Desktop app from loading. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | *Does the workflow support de novo assembly?* - Currently the workflow does not have a *de novo* mode. 2 | 3 | *Why is the IGV panel not showing?* - The workflow expects either an uncompressed or [`bgzip`](https://www.htslib.org/doc/bgzip.html)-compressed reference. If the user provides a reference compressed not with `bgzip`, the workflow will run to completion, but won't be able to generate the necessary indexes to visualize the outputs in IGV. 4 | 5 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-transcriptomes/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [How to align your data](https://labs.epi2me.io/how-to-align/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /evaluation/tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # A few simple tests with different combinations of CLI options 4 | # Run from within an appropriate active conda environment 5 | 6 | if [[ "$#" -lt 1 ]]; then 7 | echo "usage: tests.sh output_dir [nextflow.config]" 8 | exit 1 9 | fi 10 | 11 | if [[ "$#" -eq 1 ]]; then 12 | config='' 13 | fi 14 | 15 | if [[ "$#" -eq 2 ]]; then 16 | config="-c $2"; 17 | fi 18 | 19 | SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 20 | cd $SCRIPT_DIR/../; 21 | 22 | singledir="test_data/fastq" 23 | multisampledir="test_data/demultiplexed_fastq" 24 | 25 | # This is for when using SIRV dataset with non-canonical splice junctions 26 | #"--minimap2_opts '-uf --splice-flank=no'" 27 | results=() 28 | 29 | # Reference based tests 30 | OUTPUT=$1/reference_single_dir; 31 | nextflow run . --fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no' \ 32 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace -resume; 33 | r=$? 34 | results+=("$(basename $OUTPUT): $r") 35 | 36 | OUTPUT=$1/multiple_samples; 37 | nextflow run . --fastq $multisampledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 38 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace \ 39 | --sample_sheet test_data/sample_sheet.csv -resume; 40 | r=$? 41 | results+=("$(basename $OUTPUT): $r") 42 | 43 | OUTPUT=$1/reference_no_ref_annotation; 44 | nextflow run . --fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 45 | -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace -resume; 46 | r=$? 47 | results+=("$(basename $OUTPUT): $r") 48 | 49 | # Force split_bam to make multiple alignment bundles 50 | OUTPUT=$1/reference_force_split_bam; 51 | nextflow run .
--fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 52 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace \ 53 | --bundle_min_reads 5 -resume; 54 | r=$? 55 | results+=("$(basename $OUTPUT): $r") 56 | 57 | echo "Exit status codes for each test" 58 | for value in "${results[@]}"; do 59 | echo "${value}" 60 | done -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters with default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere. 22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion.
Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // hostname 52 | def host = null 53 | try { 54 | host = InetAddress.getLocalHost().getHostName() 55 | } 56 | catch(Exception e) {} 57 | 58 | // OS 59 | // TODO check version on WSL 60 | def opsys = System.properties['os.name'].toLowerCase() 61 | def opver = System.properties['os.version'] 62 | if (opver.toLowerCase().contains("wsl")){ 63 | opsys = "wsl" 64 | } 65 | 66 | // placeholder for any future okta business 67 | // for now we'll use the guest_ sent to wf.epi2me_user 68 | def user = get_meta(params.wf, "epi2me_user") 69 | 70 | // drop cruft to save some precious bytes 71 | // affects the deep copy rather than original params 72 | clean_meta(params_data, [ 73 | "schema_ignore_params", 74 | ]) 75 | def ingress_ids = [] 76 | if (params_data.containsKey("wf")) { 77 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 78 | clean_meta(params_data.wf, [ 79 | "agent", // we send this later 80 | "epi2me_instance", // we send this later 81 | "epi2me_user", // we send this later 82 | "example_cmd", 83 | "ingress.run_ids", // we will send this elsewhere 84 | ]) 85 | } 86 | 87 | // try and get runtime information 88 | def cpus = null 89 | try { 90 | cpus = Runtime.getRuntime().availableProcessors() 91 | } 92 | catch(Exception e) {} 93 | 94 | def workflow_success = null 95 | def workflow_exitcode = null 96 | if (event != "start") { 97 | workflow_success = workflow.success 98 | workflow_exitcode = workflow.exitStatus 99 | } 100 | 101 | /// build message 102 | def body_json = new JsonBuilder() 103 | body_json \ 104 | "tracking_id": [ 105 | "msg_id": UUID.randomUUID().toString(), 106 | "version": "3.0.0" 107 | ], 108 | "source": "workflow", 109 | "event": event, 110 | "params": params_data, 111 | // data will be null on start events, as ingress has not run 112 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 113 | "workflow": [ 114 | "name": workflow.manifest.name, 115 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 116 | "run_name": workflow.runName, // required to disambiguate sessions 117 | "session": workflow.sessionId, 118 | "profile": workflow.profile, 119 | "resume": workflow.resume, 120 | "error": error_message, // null if no error 121 | "success": workflow_success, 122 | "exitcode": workflow_exitcode, 123 | ], 124 | "env": [ 125 | "user": user, // placeholder for any future okta 126 | "hostname": host, 127 | "os": [ 128 | "name": opsys, 129 | "version": opver 130 | ], 131 | "resource": [ 132 | "cpus": cpus, 133 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 134 | ], 135 | "agent": get_meta(params.wf, "agent"), // access via original params 136 | "epi2me": [ 137 | "instance": get_meta(params.wf, "epi2me_instance"), 138 | "user": user, 139 | ], 140 | "nextflow": [ 141 | "version": nextflow.version.toString(), 142 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 143 | ] 144 | ] 145 | return body_json 146 | } 147 | 148 | // Send a JSON payload to a given endpoint 149 | private static String send_ping_post(endpoint, body_json) { 150 | // Attempt to send payload and absorb any possible Exception gracefully 151 | String postResult 152 | boolean raise_exception = false 153 | try { 154 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 155 | requestMethod = 'POST' 156 | doOutput = true 157 | setConnectTimeout(5000) 158 | setReadTimeout(10000) 159 | setRequestProperty('Content-Type', 'application/json') 160 | setRequestProperty('accept', 'application/json') 161 | outputStream.withPrintWriter({printWriter -> 162 | printWriter.write(body_json.toString()) 163 | }) 164 | 165 | // Rethrow exceptions that imply we're not using this endpoint properly 166 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 167 | raise_exception = true 168 | } 169 | // Accessing inputStream.text will raise an Exception for failed requests 170 | postResult = inputStream.text 171 | }) 172 | } 173 | catch(Exception e) { 174 | if(raise_exception) { throw e } 175 | } 176 | return (postResult) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? "--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file. 
11 | 12 | 13 | params { 14 | help = false 15 | fastq = null 16 | bam = null 17 | ref_genome = null 18 | ref_annotation = null 19 | transcriptome_source = "reference-guided" 20 | threads = 4 21 | // Thresholds for viewing isoforms in report table 22 | isoform_table_nrows = 5000 23 | 24 | out_dir = "output" 25 | sample = null 26 | sample_sheet = null 27 | aws_image_prefix = null 28 | aws_queue = null 29 | analyse_unclassified = false 30 | version = false 31 | 32 | monochrome_logs = false 33 | igv = false 34 | validate_params = true 35 | show_hidden_params = false 36 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 37 | 38 | // Process cDNA reads using pychopper, turn off for direct RNA: 39 | direct_rna = false 40 | // Options passed to pychopper: 41 | pychopper_opts = null 42 | pychopper_backend = "edlib" 43 | cdna_kit = "SQK-PCS109" 44 | 45 | // Extra options passed to minimap2 when generating index 46 | minimap2_index_opts = "-k 14" 47 | 48 | // Extra options passed to minimap2 49 | // For SIRV data 50 | //minimap2_opts = "-uf --splice-flank=no" 51 | // For non-SIRV data: 52 | minimap2_opts = "-uf" 53 | 54 | // Minimum mapping quality 55 | minimum_mapping_quality = 40 56 | 57 | // Internal priming filter context size: 58 | poly_context = 24 59 | 60 | // Maximum allowed poly(A) length in the genome near the 3' end of mapping: 61 | max_poly_run = 8 62 | 63 | // Minimum number of reads in BAM bundles: 64 | bundle_min_reads = 50000 65 | 66 | // Options passed to stringtie: 67 | stringtie_opts = "--conservative" 68 | 69 | // Options passed to gffcompare: 70 | gffcompare_opts = "-R" 71 | 72 | // Plot gffcompare results: 73 | plot_gffcmp_stats = true 74 | 75 | disable_ping = false 76 | store_dir = null 77 | 78 | // Differential expression options 79 | de_analysis = false 80 | ref_transcriptome = null 81 | min_samps_gene_expr = 3 82 | min_samps_feature_expr = 1 83 | min_gene_expr = 10 84 | min_feature_expr = 3 85 | 86 | 87 | wf { 88 | example_cmd = [ 89 | "--de_analysis", 90 | "--direct_rna", 91 | "--fastq 'wf-transcriptomes-demo/differential_expression_fastq'", 92 | "--minimap2_index_opts '-k 15'", 93 | "--ref_annotation 'wf-transcriptomes-demo/gencode.v22.annotation.chr20.gtf'", 94 | "--ref_genome 'wf-transcriptomes-demo/hg38_chr20.fa'", 95 | "--sample_sheet 'wf-transcriptomes-demo/sample_sheet.csv'", 96 | ] 97 | agent = null 98 | container_sha = "shac733d952a14257cf3c5c5d5d44c6aed84d5fe5a1" 99 | common_sha = "sha9ef2f4e4585c4ce6a604616e77185077551abf50" 100 | } 101 | } 102 | 103 | manifest { 104 | name = 'epi2me-labs/wf-transcriptomes' 105 | author = 'Oxford Nanopore Technologies' 106 | homePage = 'https://github.com/epi2me-labs/wf-transcriptomes' 107 | description = 'Transcriptome analysis including differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.' 108 | mainScript = 'main.nf' 109 | nextflowVersion = '>=23.04.2' 110 | version = 'v1.7.0' 111 | } 112 | 113 | epi2melabs { 114 | tags = "wf-transcriptomes,isoforms,transcriptomics,denovo,human,mouse,plant" 115 | } 116 | 117 | // used by default for "standard" (docker) and singularity profiles, 118 | // other profiles may override. 
119 | process { 120 | withLabel:isoforms { 121 | container = "ontresearch/wf-transcriptomes:${params.wf.container_sha}" 122 | } 123 | withLabel:wf_common { 124 | container = "ontresearch/wf-common:${params.wf.common_sha}" 125 | } 126 | 127 | shell = ['/bin/bash', '-euo', 'pipefail'] 128 | } 129 | 130 | 131 | profiles { 132 | // the "standard" profile is used implicitely by nextflow 133 | // if no other profile is given on the CLI 134 | standard { 135 | docker { 136 | enabled = true 137 | // this ensures container is run as host user and group, but 138 | // also adds host user to the within-container group 139 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 140 | } 141 | } 142 | 143 | // using singularity instead of docker 144 | singularity { 145 | singularity { 146 | enabled = true 147 | autoMounts = true 148 | } 149 | } 150 | 151 | 152 | conda { 153 | conda.enabled = true 154 | } 155 | 156 | // Using AWS batch. 157 | // May need to set aws.region and aws.batch.cliPath 158 | awsbatch { 159 | process { 160 | executor = 'awsbatch' 161 | queue = "${params.aws_queue}" 162 | memory = '8G' 163 | withLabel:isoforms { 164 | container = "${params.aws_image_prefix}-wf-transcriptomes:${params.wf.container_sha}" 165 | } 166 | withLabel:wf_common { 167 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 168 | } 169 | shell = ['/bin/bash', '-euo', 'pipefail'] 170 | } 171 | } 172 | 173 | // local profile for simplified development testing 174 | local { 175 | process.executor = 'local' 176 | } 177 | } 178 | 179 | 180 | timeline { 181 | enabled = true 182 | overwrite = true 183 | file = "${params.out_dir}/execution/timeline.html" 184 | } 185 | report { 186 | enabled = true 187 | overwrite = true 188 | file = "${params.out_dir}/execution/report.html" 189 | } 190 | trace { 191 | enabled = true 192 | overwrite = true 193 | file = "${params.out_dir}/execution/trace.txt" 194 | } 195 | 196 | env { 197 | PYTHONNOUSERSITE = 1 198 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 199 | } 200 | -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "wf-transcriptomes-report.html", 5 | "title": "workflow report", 6 | "description": "a HTML report document detailing the primary findings of the workflow", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "aggregated" 10 | }, 11 | "read-stats-per-file": { 12 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-file-stats.tsv", 13 | "title": "Per file read stats", 14 | "description": "A TSV with per file read stats, including all samples.", 15 | "mime-type": "text/tab-separated-values", 16 | "optional": false, 17 | "type": "aggregated" 18 | }, 19 | "read-stats-per-read": { 20 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-read-stats.tsv", 21 | "title": "Read stats", 22 | "description": "A TSV with per read stats, including all samples.", 23 | "mime-type": "text/tab-separated-values", 24 | "optional": false, 25 | "type": "aggregated" 26 | }, 27 | "run-ids": { 28 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/run_ids", 29 | "title": "Run ID's", 30 | "description": "List of run IDs present in reads.", 31 | "mime-type": "text/txt", 32 | "optional": false, 33 | "type": "aggregated" 34 | }, 35 | "metamap": { 36 | "filepath": "fastq_ingress_results/{{ 
alias }}//reads/metamap.json", 37 | "title": "Meta map json", 38 | "description": "Metadata used in workflow presented in a JSON.", 39 | "mime-type": "text/json", 40 | "optional": false, 41 | "type": "aggregated" 42 | }, 43 | "sample-data": { 44 | "filepath": "fastq_ingress_results/{{ alias }}//reads/{{ alias }}.fastq.gz", 45 | "title": "Concatenated sequence data", 46 | "description": "Per sample reads concatenated in to one FASTQ file.", 47 | "mime-type": "text/json", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "transcriptome": { 52 | "filepath": "{{ alias }}_transcriptome.fas", 53 | "title": "Assembled transcriptome", 54 | "description": "Per sample assembled transcriptome. Not output if a reference annotation was supplied", 55 | "mime-type": "text/x-fasta", 56 | "optional": true, 57 | "type": "per-sample" 58 | }, 59 | "merged_transcriptome": { 60 | "filepath": "{{ alias }}_merged_transcriptome.fas", 61 | "title": "Annotated assembled transcriptome", 62 | "description": "Per sample annotated assembled transcriptome. Only output if a reference annotation was supplied", 63 | "mime-type": "text/x-fasta", 64 | "optional": true, 65 | "type": "per-sample" 66 | }, 67 | "alignment-stats": { 68 | "filepath": "{{ alias }}_read_aln_stats.tsv", 69 | "title": "Alignment summary statistics", 70 | "description": "Per sample alignment summary statistics.", 71 | "mime-type": "text/tab-separated-valuesa", 72 | "optional": false, 73 | "type": "per-sample" 74 | }, 75 | "gff_compare": { 76 | "filepath": "{{ alias }}_gffcompare", 77 | "title": "GFF compare results.", 78 | "description": "All GFF compare output files.", 79 | "mime-type": "text/directory", 80 | "optional": true, 81 | "type": "per-sample" 82 | }, 83 | "dge-results-tsv": { 84 | "filepath": "de_analysis/results_dge.tsv", 85 | "title": "Differential gene expression results", 86 | "description": "This is a gene-level result file that describes genes and their probability of showing differential expression between experimental conditions.", 87 | "mime-type": "text/tab-separated-values", 88 | "optional": true, 89 | "type": "aggregated" 90 | }, 91 | "dge-report-pdf": { 92 | "filepath": "de_analysis/results_dge.pdf", 93 | "title": "Differential gene expression report", 94 | "description": "Summary report of differential gene expression analysis as a PDF.", 95 | "mime-type": "application/pdf", 96 | "optional": true, 97 | "type": "aggregated" 98 | }, 99 | "dtu-gene-tsv": { 100 | "filepath": "de_analysis/results_dtu_gene.tsv", 101 | "title": "Differential transcript usage gene TSV", 102 | "description": "This is a gene-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression.", 103 | "mime-type": "text/tab-separated-values", 104 | "optional": true, 105 | "type": "aggregated" 106 | }, 107 | "dtu-report-pdf": { 108 | "filepath": "de_analysis/results_dtu.pdf", 109 | "title": "Differential transcript usage report", 110 | "description": "Summary report of differential transcript usage results as a PDF.", 111 | "mime-type": "application/pdf", 112 | "optional": true, 113 | "type": "aggregated" 114 | }, 115 | "dtu-transcript": { 116 | "filepath": "de_analysis/results_dtu_transcript.tsv", 117 | "title": "Differential transcript usage TSV", 118 | "description": "This is a transcript-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression.", 119 | "mime-type": "text/tab-separated-values", 120 | "optional": true, 121 | "type": "aggregated" 122 | 
}, 123 | "dtu-stageR": { 124 | "filepath": "de_analysis/results_dtu_stageR.tsv ", 125 | "title": "Differential transcript usage stageR TSV", 126 | "description": "This is the output from StageR and it shows both gene and transcript probabilities of differential expression", 127 | "mime-type": "text/tab-separated-values", 128 | "optional": true, 129 | "type": "aggregated" 130 | }, 131 | "dexseq": { 132 | "filepath": "de_analysis/results_dexseq.tsv", 133 | "title": "Differential transcript usage DEXSeq TSV", 134 | "description": "The complete output from the DEXSeq-analysis, shows both gene and transcript probabilities of differential expression.", 135 | "mime-type": "text/tab-separated-values", 136 | "optional": true, 137 | "type": "aggregated" 138 | }, 139 | "gene_counts": { 140 | "filepath": "de_analysis/all_gene_counts.tsv", 141 | "title": "Gene counts", 142 | "description": "Raw gene counts created by the Salmon tool, before filtering.", 143 | "mime-type": "text/tab-separated-values", 144 | "optional": true, 145 | "type": "aggregated" 146 | }, 147 | "gene_counts_per_million": { 148 | "filepath": "de_analysis/cpm_gene_counts.tsv", 149 | "title": "Gene counts per million", 150 | "description": "This file shows counts per million (CPM) of the raw gene counts to facilitate comparisons across samples.", 151 | "mime-type": "text/tab-separated-values", 152 | "optional": true, 153 | "type": "aggregated" 154 | }, 155 | "transcript_counts": { 156 | "filepath": "de_analysis/unfiltered_transcript_counts_with_genes.tsv", 157 | "title": "Transcript counts", 158 | "description": "Raw transcript counts created by the Salmon tool, before filtering. Includes reference to the associated gene ID.", 159 | "mime-type": "text/tab-separated-values", 160 | "optional": true, 161 | "type": "aggregated" 162 | }, 163 | "tpm_transcript_counts": { 164 | "filepath": "de_analysis/unfiltered_tpm_transcript_counts.tsv", 165 | "title": "Transcript per million counts", 166 | "description": "This file shows transcripts per million (TPM) of the raw counts to facilitate comparisons across samples.", 167 | "mime-type": "text/tab-separated-values", 168 | "optional": true, 169 | "type": "aggregated" 170 | }, 171 | "transcipt_counts_filtered": { 172 | "filepath": "de_analysis/filtered_transcript_counts_with_genes.tsv", 173 | "title": "Transcript counts filtered", 174 | "description": "Filtered transcript counts, used for differential transcript usage analysis. Includes a reference to the associated gene ID.", 175 | "mime-type": "text/tab-separated-values", 176 | "optional": true, 177 | "type": "aggregated" 178 | }, 179 | "transcripts_table": { 180 | "filepath": "{{ alias }}_transcripts_table.tsv", 181 | "title": "Transcript info table", 182 | "description": "This file details each isoform that was reconstructed from the input reads. It contains a subset of columns from the .tmap output from [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)", 183 | "mime-type": "text/tab-separated-values", 184 | "optional": true, 185 | "type": "per-sample" 186 | }, 187 | "final_non_redundant_transcriptome": { 188 | "filepath": "de_analysis/final_non_redundant_transcriptome.fasta", 189 | "title": "Final non redundant transcriptome", 190 | "description": "Transcripts that were used for differential expression analysis including novel transcripts with the identifiers used for DE analysis. 
Only applicable when the ref_transcriptome parameter is not provided.", 191 | "mime-type": "text/x-fasta", 192 | "optional": true, 193 | "type": "aggregated" 194 | }, 195 | "reference-index": { 196 | "filepath": "igv_reference/{{ ref_genome_file }}.fai", 197 | "title": "Index of reference FASTA file", 198 | "description": "Reference genome index of the FASTA file required for IGV config.", 199 | "mime-type": "text/tab-separated-values", 200 | "optional": true, 201 | "type": "aggregated" 202 | }, 203 | "reference-gzi-index": { 204 | "filepath": "igv_reference/{{ ref_genome_file }}.gzi", 205 | "title": "GZI index of the reference FASTA file", 206 | "description": "GZI Index of the reference FASTA file.", 207 | "mime-type": "application/octet-stream", 208 | "optional": true, 209 | "type": "aggregated" 210 | }, 211 | "igv-config": { 212 | "filepath": "igv.json", 213 | "title": "JSON configuration file for IGV browser", 214 | "description": "JSON configuration file to be loaded in IGV for visualising alignments against the reference.", 215 | "mime-type": "text/json", 216 | "optional": true, 217 | "type": "aggregated" 218 | }, 219 | "minimap2-bam": { 220 | "filepath": "BAMS/{{ alias }}.reads_aln_sorted.bam", 221 | "title": "BAM file (minimap2)", 222 | "description": "BAM file generated from mapping input reads to the reference.", 223 | "mime-type": "application/gzip", 224 | "optional": true, 225 | "type": "per-sample" 226 | }, 227 | "minimap2-index": { 228 | "filepath": "BAMS/{{ alias }}.reads_aln_sort.bam.bai", 229 | "title": "BAM index file (minimap2)", 230 | "description": "Index file generated from mapping input reads to the reference.", 231 | "mime-type": "application/octet-stream", 232 | "optional": true, 233 | "type": "per-sample" 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /subworkflows/differential_expression.nf: -------------------------------------------------------------------------------- 1 | process checkSampleSheetCondition { 2 | label "isoforms" 3 | cpus 1 4 | memory "2 GB" 5 | input: 6 | path "sample_sheet.csv" 7 | """ 8 | workflow-glue check_sample_sheet_condition "sample_sheet.csv" 9 | """ 10 | } 11 | 12 | 13 | 14 | process count_transcripts { 15 | // Count transcripts using Salmon. 16 | // library type is specified as forward stranded (-l SF) as it should have either been through pychopper or come from direct RNA reads. 
17 | label "isoforms" 18 | cpus params.threads 19 | memory "31 GB" 20 | input: 21 | tuple val(meta), path(bam), path(ref_transcriptome) 22 | output: 23 | path "*transcript_counts.tsv", emit: counts 24 | """ 25 | salmon quant --noErrorModel -p "${task.cpus}" -t "${ref_transcriptome}" -l SF -a "${bam}" -o counts 26 | mv counts/quant.sf "${meta.alias}.transcript_counts.tsv" 27 | """ 28 | } 29 | 30 | 31 | process mergeCounts { 32 | label "isoforms" 33 | cpus 1 34 | memory "2 GB" 35 | input: 36 | path counts 37 | output: 38 | path "unfiltered_transcript_counts.tsv" 39 | """ 40 | workflow-glue merge_count_tsvs -z -o unfiltered_transcript_counts.tsv -tsvs ${counts} 41 | """ 42 | } 43 | 44 | process mergeTPM { 45 | label "isoforms" 46 | cpus 1 47 | memory "2 GB" 48 | input: 49 | path counts 50 | output: 51 | path "unfiltered_tpm_transcript_counts.tsv" 52 | // Use tpm parameter with merge_counts_tsvs.py to out transcript per million file 53 | """ 54 | workflow-glue merge_count_tsvs -o unfiltered_tpm_transcript_counts.tsv -z -tpm True -tsvs $counts 55 | """ 56 | } 57 | 58 | 59 | process deAnalysis { 60 | label "isoforms" 61 | cpus 4 62 | memory "16 GB" 63 | input: 64 | path "sample_sheet.csv" 65 | path "all_counts.tsv" 66 | path "annotation.gtf" 67 | output: 68 | path "de_analysis/results_dtu_stageR.tsv", emit: stageR 69 | path "merged/filtered_transcript_counts_with_genes.tsv", emit: flt_counts 70 | path "merged/all_gene_counts.tsv", emit: gene_counts 71 | path "de_analysis/unfiltered_transcript_counts_with_genes.tsv", emit: unflt_counts 72 | path "de_analysis/results_dge.tsv", emit: dge 73 | path "de_analysis/results_dexseq.tsv", emit: dexseq 74 | path "de_analysis/results_dge.pdf", emit: dge_pdf 75 | path "de_analysis/results_dge.tsv", emit: dge_tsv 76 | path "de_analysis/results_dtu_gene.tsv", emit: dtu_gene 77 | path "de_analysis/results_dtu_transcript.tsv", emit: dtu_transcript 78 | path "de_analysis/results_dtu_stageR.tsv", emit: dtu_stageR 79 | path "de_analysis/results_dtu.pdf", emit: dtu_pdf 80 | path "de_analysis/cpm_gene_counts.tsv", emit: cpm 81 | """ 82 | de_analysis.R \ 83 | --annotation annotation.gtf \ 84 | --min_samps_gene_expr $params.min_samps_gene_expr \ 85 | --min_samps_feature_expr $params.min_samps_feature_expr \ 86 | --min_gene_expr $params.min_gene_expr \ 87 | --min_feature_expr $params.min_feature_expr \ 88 | --sample_sheet sample_sheet.csv \ 89 | --all_counts all_counts.tsv \ 90 | --de_out_dir de_analysis \ 91 | --merged_out_dir merged 92 | """ 93 | } 94 | 95 | 96 | process plotResults { 97 | label "isoforms" 98 | cpus 2 99 | memory "2 GB" 100 | input: 101 | path "filtered_transcript_counts_with_genes.tsv" 102 | path "results_dtu_stageR.tsv" 103 | path "sample_sheet.tsv" 104 | output: 105 | path "dtu_plots.pdf", emit: dtu_plots 106 | """ 107 | plot_dtu_results.R \ 108 | --counts filtered_transcript_counts_with_genes.tsv \ 109 | --results_dtu results_dtu_stageR.tsv \ 110 | --sample_sheet sample_sheet.tsv \ 111 | --pdf_out dtu_plots.pdf 112 | """ 113 | } 114 | 115 | process build_minimap_index_transcriptome{ 116 | /* 117 | Build minimap index from reference genome 118 | */ 119 | label "isoforms" 120 | cpus params.threads 121 | memory "31 GB" 122 | input: 123 | path reference 124 | output: 125 | tuple path("genome_index.mmi"), path(reference), emit: index 126 | script: 127 | """ 128 | minimap2 -t "${task.cpus}" ${params.minimap2_index_opts} -I 1000G -d "genome_index.mmi" "${reference}" 129 | 130 | """ 131 | } 132 | 133 | 134 | process map_transcriptome{ 135 | /* 136 | Map 
reads to reference using minimap2. 137 | Filter reads by mapping quality. 138 | Filter internally-primed reads. 139 | */ 140 | label "isoforms" 141 | cpus params.threads 142 | memory "16 GB" 143 | 144 | input: 145 | tuple val(meta), path (fastq_reads), path(index) 146 | output: 147 | tuple val(meta), path("${meta.alias}_reads_aln_sorted.bam"), emit: bam 148 | path("${meta.alias}.flagstat.stats"), emit: align_stats 149 | """ 150 | minimap2 -t ${task.cpus} -ax splice -uf -p 1.0 "${index}" "${fastq_reads}" \ 151 | | samtools view -Sb > "output.bam" 152 | samtools sort -@ ${task.cpus} "output.bam" -o "${meta.alias}_reads_aln_sorted.bam" 153 | samtools flagstat -O json "${meta.alias}_reads_aln_sorted.bam" > "${meta.alias}.flagstat.stats" 154 | """ 155 | } 156 | 157 | 158 | workflow differential_expression { 159 | take: 160 | ref_transcriptome 161 | full_len_reads 162 | sample_sheet 163 | ref_annotation 164 | main: 165 | sample_sheet = Channel.fromPath(sample_sheet) 166 | checkSampleSheetCondition(sample_sheet) 167 | t_index = build_minimap_index_transcriptome(ref_transcriptome) 168 | mapped = map_transcriptome(full_len_reads.combine(t_index) 169 | .map{meta, fastq, reference, transcriptome -> tuple(meta, fastq, reference) }) 170 | count_transcripts(mapped.bam.combine(t_index.map{ mmi, reference -> reference})) 171 | merged = mergeCounts(count_transcripts.out.counts.collect()) 172 | merged_TPM = mergeTPM(count_transcripts.out.counts.collect()) 173 | analysis = deAnalysis(sample_sheet, merged, ref_annotation) 174 | plotResults(analysis.flt_counts, analysis.stageR, sample_sheet) 175 | // Concat files required for making the report 176 | de_report = analysis.flt_counts.concat( 177 | analysis.gene_counts, analysis.dge, analysis.dexseq, 178 | analysis.stageR, sample_sheet, merged, ref_annotation, merged_TPM, analysis.unflt_counts).collect() 179 | // Concat files required to be output to user without any changes 180 | de_outputs_concat = analysis.cpm.concat(plotResults.out.dtu_plots, analysis.dge_pdf, analysis.dge_tsv, 181 | analysis.dtu_gene, analysis.dtu_transcript, analysis.dtu_stageR, analysis.dtu_pdf, merged_TPM).collect() 182 | collected_de_alignment_stats = mapped.align_stats.collect() 183 | emit: 184 | all_de = de_report 185 | de_alignment_stats = collected_de_alignment_stats 186 | de_outputs = de_outputs_concat 187 | } 188 | -------------------------------------------------------------------------------- /subworkflows/reference_assembly.nf: -------------------------------------------------------------------------------- 1 | process map_reads{ 2 | /* 3 | Map reads to reference using minimap2. 4 | Filter reads by mapping quality. 5 | Filter internally-primed reads. 
6 | */ 7 | label "isoforms" 8 | cpus params.threads 9 | memory "31 GB" 10 | publishDir path: "${params.out_dir}/${publish_prefix_bams}", mode: 'copy', pattern: "${sample_id}_reads_aln_sorted.bam*", overwrite: true 11 | input: 12 | tuple val(sample_id), path (fastq_reads), path(index), path(reference) 13 | val publish_prefix_bams 14 | output: 15 | tuple val(sample_id), 16 | path("${sample_id}_reads_aln_sorted.bam"), 17 | path("${sample_id}_reads_aln_sorted.bam.bai"), 18 | emit: bam 19 | tuple val(sample_id), path("${sample_id}_read_aln_stats.tsv"), emit: stats 20 | script: 21 | def ContextFilter = """AlnContext: { Ref: "${reference}", LeftShift: -${params.poly_context}, 22 | RightShift: ${params.poly_context}, RegexEnd: "[Aa]{${params.max_poly_run},}", 23 | Stranded: True,Invert: True, Tsv: "internal_priming_fail.tsv"} """ 24 | 25 | def mm2_threads = Math.max(task.cpus - 3, 1) 26 | """ 27 | minimap2 -t ${mm2_threads} -ax splice ${params.minimap2_opts} ${index} ${fastq_reads}\ 28 | | samtools view -q ${params.minimum_mapping_quality} -F 2304 -Sb -\ 29 | | seqkit bam -j 1 -x -T '${ContextFilter}' -\ 30 | | samtools sort --write-index -@ 1 -o "${sample_id}_reads_aln_sorted.bam##idx##${sample_id}_reads_aln_sorted.bam.bai" - ; 31 | ((cat "${sample_id}_reads_aln_sorted.bam" | seqkit bam -s -j 1 - 2>&1) | tee "${sample_id}_read_aln_stats.tsv" ) || true 32 | 33 | # Add sample id header and column; remove last column (File) 34 | cat "${sample_id}_read_aln_stats.tsv" \ 35 | | sed "s/^/${sample_id} /" \ 36 | | sed "1 s/^${sample_id}/sample_id/" \ 37 | | awk 'NF{NF-=1};1' \ 38 | > tmp 39 | mv tmp "${sample_id}_read_aln_stats.tsv" 40 | 41 | if [[ -s "internal_priming_fail.tsv" ]]; 42 | then 43 | tail -n +2 "internal_priming_fail.tsv" | awk '{print ">" \$1 "\\n" \$4 }' - > "context_internal_priming_fail_start.fasta" 44 | tail -n +2 "internal_priming_fail.tsv" | awk '{print ">" \$1 "\\n" \$6 }' - > "context_internal_priming_fail_end.fasta" 45 | fi 46 | """ 47 | } 48 | 49 | workflow reference_assembly { 50 | take: 51 | index 52 | reference 53 | fastq_reads 54 | publish_prefix_bams 55 | main: 56 | map_reads(fastq_reads.combine(index).combine(reference), publish_prefix_bams) 57 | emit: 58 | bam = map_reads.out.bam 59 | stats = map_reads.out.stats 60 | } 61 | -------------------------------------------------------------------------------- /test_data/SIRV_150601a.fasta.fai: -------------------------------------------------------------------------------- 1 | SIRV1 12643 7 80 81 2 | SIRV2 6911 12816 80 81 3 | SIRV3 10943 19821 80 81 4 | SIRV4 16122 30908 80 81 5 | SIRV5 14606 47239 80 81 6 | SIRV6 12837 62035 80 81 7 | SIRV7 148957 75040 80 81 8 | -------------------------------------------------------------------------------- /test_data/demultiplexed_fastq/barcode01/SIRV_E0_PCS109_51.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/demultiplexed_fastq/barcode01/SIRV_E0_PCS109_51.fq.gz -------------------------------------------------------------------------------- /test_data/demultiplexed_fastq/barcode02/SIRV_E0_PCS109_25.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/demultiplexed_fastq/barcode02/SIRV_E0_PCS109_25.fq.gz -------------------------------------------------------------------------------- 
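The `reference_assembly` subworkflow above takes a prebuilt minimap2 index, the reference FASTA, per-sample reads, and a publish prefix. As a rough illustration of how it could be exercised against the SIRV test data listed here, below is a minimal sketch of a driver script; it is not the workflow's actual `main.nf`, and the `genome_index.mmi` file and the standalone `workflow` block are assumptions made only for the example.

```groovy
// Hypothetical driver fragment, run from the repository root -- not the real main.nf.
// Assumes an index was built beforehand, e.g.:
//   minimap2 -k 14 -d genome_index.mmi test_data/SIRV_150601a.fasta
include { reference_assembly } from './subworkflows/reference_assembly'

workflow {
    index     = Channel.fromPath('genome_index.mmi')
    reference = Channel.fromPath('test_data/SIRV_150601a.fasta')
    // (sample_id, fastq) tuples matching the first two inputs of map_reads
    reads     = Channel.of(['sample01', file('test_data/fastq/SIRV_E0_PCS109_50.fq.gz')])

    // 'BAMS' is the publish prefix used for the sorted BAM outputs
    reference_assembly(index, reference, reads, 'BAMS')
    reference_assembly.out.stats.view()  // per-sample alignment stats TSVs
}
```

The subworkflow simply combines the reads channel with the index and reference before calling `map_reads`, so each sample is mapped with minimap2, filtered by mapping quality, screened for internal priming, and emitted as a sorted, indexed BAM plus an alignment stats TSV.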
/test_data/fastq/SIRV_E0_PCS109_50.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/fastq/SIRV_E0_PCS109_50.fq.gz -------------------------------------------------------------------------------- /test_data/sample_sheet.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode02,sample02,sample02,control 4 | barcode03,sample03,sample03,control 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,treated 8 | -------------------------------------------------------------------------------- /test_data/workflow_glue/MSTRG.11088.gff3: -------------------------------------------------------------------------------- 1 | chr1 HAVANA transcript 11869 14409 . + . ID=ENST00000456328.2;Parent=ENSG00000290825.1;gene_id=ENSG00000290825.1;transcript_id=ENST00000456328.2;gene_type=lncRNA;gene_name=DDX11L2;transcript_type=lncRNA;transcript_name=DDX11L2-202;level=2;transcript_support_level=1;tag=basic,Ensembl_canonical;havana_transcript=OTTHUMT00000362751.1 2 | chr2 HAVANA transcript 113599036 113601261 . - . ID=ENST00000437401.1;Parent=ENSG00000236397.3;gene_id=ENSG00000236397.3;transcript_id=ENST00000437401.1;gene_type=unprocessed_pseudogene;gene_name=DDX11L2;transcript_type=unprocessed_pseudogene;transcript_name=DDX11L2-201;level=2;transcript_support_level=NA;hgnc_id=HGNC:37103;ont=PGO:0000005;tag=basic,Ensembl_canonical;havana_gene=OTTHUMG00000047823.1;havana_transcript=OTTHUMT00000109036.1 3 | -------------------------------------------------------------------------------- /test_data/workflow_glue/MSTRG.11088.gtf: -------------------------------------------------------------------------------- 1 | chr13 StringTie transcript 76990660 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 2 | chr13 StringTie exon 76990660 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "1"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 3 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 4 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "3"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 5 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "4"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 6 | chr13 StringTie transcript 76991729 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; 7 | chr13 StringTie exon 76991729 76991832 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "1"; 8 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "2"; 9 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "3"; 10 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "4"; 11 | chr13 StringTie transcript 76992044 77005117 1000 + . 
gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 12 | chr13 StringTie exon 76992044 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "1"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 13 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 14 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "3"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 15 | chr13 StringTie exon 76998043 76998085 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "4"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 16 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "5"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 17 | chr13 StringTie transcript 76992078 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 18 | chr13 StringTie exon 76992078 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 19 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 20 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "3"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 21 | chr13 StringTie exon 77075518 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "4"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 22 | chr13 StringTie exon 77076816 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "5"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 23 | chr13 StringTie transcript 76995915 77129717 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 24 | chr13 StringTie exon 76995915 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 25 | chr13 StringTie exon 77109648 77110102 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 26 | chr13 StringTie exon 77129147 77129717 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "3"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 27 | chr13 StringTie transcript 77026767 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 28 | chr13 StringTie exon 77026767 77027122 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 29 | chr13 StringTie exon 77075518 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "2"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 30 | chr13 StringTie exon 77076816 77078025 1000 + . 
gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "3"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 31 | chr13 StringTie transcript 77075514 77087778 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 32 | chr13 StringTie exon 77075514 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 33 | chr13 StringTie exon 77076816 77076866 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "2"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 34 | chr13 StringTie exon 77087552 77087778 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "3"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_1.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode02,sample02,sample02,control 4 | barcode03,sample03,sample03,control 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,other -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_2.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias 2 | barcode01,sample01,sample01 3 | barcode02,sample02,sample02 4 | barcode03,sample03,sample03 5 | barcode04,sample04,sample04 6 | barcode05,sample05,sample05 7 | barcode06,sample06,sample06 -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_3.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode04,sample04,sample04,treated 4 | -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_4.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,untreated 3 | barcode02,sample02,sample02,untreated 4 | barcode03,sample03,sample03,untreated 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,treated 8 | --------------------------------------------------------------------------------