├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── de_analysis.R ├── plot_dtu_results.R ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── check_sample_sheet_condition.py │ ├── de_plots.py │ ├── generate_pychopper_stats.py │ ├── merge_count_tsvs.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── parse_gffcompare.py │ ├── report.py │ ├── summarise_gff.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_check_sample_sheet_condition.py │ └── test_de_plots.py │ ├── util.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── evaluation └── tests.sh ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── main.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows ├── differential_expression.nf └── reference_assembly.nf └── test_data ├── SIRV_150601a.fasta ├── SIRV_150601a.fasta.fai ├── SIRV_isoforms.gtf ├── demultiplexed_fastq ├── barcode01 │ └── SIRV_E0_PCS109_51.fq.gz └── barcode02 │ └── SIRV_E0_PCS109_25.fq.gz ├── fastq └── SIRV_E0_PCS109_50.fq.gz ├── sample_sheet.csv └── workflow_glue ├── MSTRG.11088.gff3 ├── MSTRG.11088.gtf └── check_sample_sheet_condition ├── sample_sheet_1.csv ├── sample_sheet_2.csv ├── sample_sheet_3.csv └── sample_sheet_4.csv /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. 
Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? 
For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/.gitmodules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.57 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | args: [ 28 | "bin", 29 | "--import-order-style=google", 30 | "--statistics", 31 | "--max-line-length=88", 32 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 33 | ] 34 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.7.0] 8 | ### Changed 9 | - `split_bam` and `build_minimap_index_transcriptome` process memory allocation increased. 10 | - Updated recommended memory requirement. 11 | - Updated project description. 12 | - A common user issue is providing a ref_annotation and ref_genome parameter that have mismatched reference IDs, which causes the DE_analysis to fail. The workflow will now do an upfront check and give an error message if no overlap is found or a warning if some IDs are present in one file but not in the other. 13 | - Reconciled workflow with wf-template v5.5.0. 14 | - Sort the columns and rows of the gene and transcript count files. 15 | - DE_analysis alignment summary stats table no longer includes MAPQ or quality scores. MAPQ is not relevant for transcript alignment and quality scores are already available in the read summary section of the report. 16 | ### Fixed 17 | - `all_gene_counts.tsv` contained the DE counts results. 18 | - Reduced memory usage of the report workflow process. 19 | - Output BAM alignments in all cases unless the workflow is run with `transcriptome_source` set to `precomputed`. 20 | - Corrected the demo command in the `README.md`. 
21 | - The merged transcriptome generated for differential expression analysis now only contains the exons and not the full genomic sequence. 22 | - Output the gene name annotated differential expression analysis count files only. 23 | - Only use full length reads in the differential expression analysis. 24 | 25 | ## [v1.6.1] 26 | ### Fixed 27 | - `merge_gff_compare` failing with empty GFF files. 28 | 29 | ## [v1.6.0] 30 | ### Fixed 31 | - v1.5.0 bug; access to undefined channel output bug when using precomputed transcriptome. 32 | - Bug where incorrect gene_id assigned in the DE tables. 33 | 34 | ## [v1.5.0] 35 | ### Updated 36 | - Workflow report updated to use `ezcharts`. 37 | ### Fixed 38 | - Exons per isoforms histogram reporting incorrect numbers. 39 | - Output the `results_dexseq.tsv` file when `--de_analysis` enabled. 40 | ### Removed 41 | - per-class gffcompare tracking files as there exists a combine tracking file. 42 | 43 | ## [v1.4.0] 44 | ## Added 45 | - `--igv` parameter (default: false) for outputting IGV config allowing visualisation of read alignments in the EPI2ME App. 46 | - If required for IGV, reference indexes are output in to a `igv_reference` directory 47 | ### Changed 48 | - BAMS are output in to a BAMS directory. 49 | - Reconcile with template 5.2.6. 50 | 51 | ## [v1.3.0] 52 | ### Removed 53 | - Fusion detection subworkflow, as the functionality is not robust enough for general use at this time. 54 | ### Changed 55 | - Updated pychopper to 2.7.10 56 | ## Added 57 | - new `cdna_kit` options: PCS114 and PCB111/114 58 | 59 | ## [v1.2.1] 60 | ### Changed 61 | - Increase some memory and CPU allocations. 62 | 63 | ## [v1.2.0] 64 | ### Added 65 | - Workflow now accepts BAM or FASTQ files as input (using the --bam or --fastq parameters, respectively). 66 | ### Changed 67 | - MA plot in the `results_dge.pdf` has been updated to match the MA plot in the report. 68 | ### Added 69 | - Error message when running in `de_analysis` mode and `ref_annotation` input file contains unstranded annotations. 70 | 71 | ## [v1.1.1] 72 | ### Changed 73 | - Improved handling of different annotation file types (eg. `.gtf/.gff/.gff3`) in `de_analysis` mode. 74 | - Improved handling of annotation files that do not contain version numbers in transcript_id (such as gtf's from Ensembl). 75 | ### Fixed 76 | - Differential expression failing with 10 or more samples. 77 | - Regression causing the DE analysis numeric parameters to not be evaluated correctly. 78 | 79 | ## [v1.1.0] 80 | ### Changed 81 | - Improve documentation around filtering of transcripts done before DTU analysis. 82 | - Renamed files: 83 | - `de_analysis/all_counts_filtered.tsv` to `de_analysis/filtered_transcript_counts_with_genes.tsv` 84 | - `de_analysis/de_tpm_transcript_counts.tsv` to `de_analysis/unfiltered_tpm_transcript_counts.tsv` 85 | - Minimum memory requirements to `32 GB`. 86 | ### Added 87 | - Published isoforms table to output directory. 88 | - Output additional `de_analysis/cpm_gene_counts.tsv` with counts per million gene counts. 89 | - Output additional `de_analysis/unfiltered_transcript_counts_with_genes.tsv` with unfiltered transcript counts with associated gene IDs. 90 | - Add gene name column to the de_analysis counts TSV files. 91 | ### Fixed 92 | - Mapping stage using a single thread only. 93 | ### Changed 94 | - More memory assigned to the fusion detection process. 95 | - When no `--ref_annotation` is provided the workflow will still run but the output transcripts will not be annotated. 
However `--de_analysis` mode still requires a `--ref_annotation`. 96 | 97 | ## [v1.0.0] 98 | ### Added 99 | - Published minimap2 and pychopper results to output directory. 100 | - Two extra pychopper parameters `--cdna_kit` and `--pychopper_backend`. `--pychopper_options` is still available to define any other options. 101 | - Memory requirements for each process. 102 | ### Changed 103 | - Documentation. 104 | ### Fixed 105 | - When Jaffa is run only output one report. 106 | 107 | ## [v0.4.2] 108 | ### Changed 109 | - Sample sheet must include a `control` type to indicate which samples are the reference for the differential expression pipeline. 110 | ### Removed 111 | - Default local executor CPU and RAM limits. 112 | 113 | ## [v0.4.1] 114 | ### Changed 115 | - Updated docker container with Pychopper to support LSK114. 116 | 117 | ## [v0.4.0] 118 | ### Fixed 119 | - Remove dead links from README 120 | ### Removed 121 | - Denovo `--transcriptome_source` option. 122 | 123 | ## [v0.3.1] 124 | ### Added 125 | - Handling for input reference transcriptome headers that contain `|` 126 | 127 | ## [v0.3.0] 128 | ### Changed 129 | - Improve differential expression outputs. 130 | - Include transcript and gene count tables in DE_final folder. 131 | - If differential expression subworkflow is used a non redundant transcriptome will be output which includes novel transcripts. 132 | - Added wording to the report about how to identify novel transcripts in the DE tables. 133 | - Nextflow minimum required version to 23.04.2 134 | - `--minimap_index_opts` parameter has been changed to `minimap2_index_opts` for consistency. 135 | 136 | ### Added 137 | - An additional gene name column to the differential gene expression results. This is especially handy for transcriptomes where the gene ID is not the same as gene name (e.g. Ensembl). 138 | - Wording to the report about how to identify novel transcripts in the DE tables. 139 | 140 | ## [v0.2.1] 141 | ### Changed 142 | - Any sample aliases that contain spaces will be replaced with underscores. 143 | - Updated documentation to explain we only support Ensembl, NCBI and ENCODE annotation file types. 144 | 145 | ### Fixed 146 | - Documentation parameter examples corrected. 147 | - Handling for annotation files that use gene as gene_id attribute. 148 | - Handling for Ensembl annotation files. 149 | 150 | ## [v0.2.0] 151 | ### Changed 152 | - GitHub issue templates 153 | - Condition sheet is no longer required. The sample sheet is now used to indicate condition instead. 154 | - For differential expression, the sample sheet must have a `condition` column to indicate which condition group each sample in the sample sheet belongs to. 155 | - Values for the condition may be any two distinct strings, for example: treated/untreated; sample/control etc. 156 | 157 | ### Fixed 158 | - Remove default of null for `--ref_transcriptome`. 159 | - Read mapping summary table in the report has correct sample_ids. 160 | 161 | ## [v0.1.13] 162 | ### Added 163 | - Handling for GFF3 reference_annotation file type. 164 | - Warning for the `--transcriptome_source` denovo pipeline option. 
165 | 166 | ### Changed 167 | - Enum choices are enumerated in the `--help` output 168 | - Enum choices are enumerated as part of the error message when a user has selected an invalid choice 169 | - Bumped minimum required Nextflow version to 22.10.8 170 | 171 | ### Fixed 172 | - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads` 173 | - Fix for the `--transcriptome_source` denovo pipeline option. 174 | 175 | ## [v0.1.12] 176 | ### Added 177 | - Handling for GFF3 reference_annotation file type. 178 | - Handling gzip input reference and annotation parameters. 179 | - Handling for NCBI gtfs that contain some empty transcript ID fields. 180 | 181 | ## [v0.1.11] 182 | ### Changed 183 | - LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0. 184 | 185 | ### Added 186 | - Configuration for running demo data in AWS 187 | 188 | ## [v0.1.10] 189 | ### Changed 190 | - Condition sheet parameter description fixed to CSV 191 | - Update fastqingress 192 | 193 | ## [v0.1.9] 194 | ### Changed 195 | - Simplify JAFFAL docs 196 | 197 | ## [v0.1.8] 198 | ### Changed 199 | - Description in manifest 200 | 201 | ## [v0.1.7] 202 | ### Changed 203 | - `-profile conda` is no longer supported, users should use `-profile standard` (Docker) or `-profile singularity` instead 204 | - `nextflow run epi2me-labs/wf-transcriptomes --version` will now print the workflow version number and exit 205 | - Use parameter `--transcriptome-source` to define precalculated, reference-based or denovo 206 | 207 | ## [v0.1.6] 208 | ### Changed 209 | - Removed sanitize option 210 | - Reduce size of differential expression data. 211 | 212 | ### Added 213 | - Improved DE explanation in docs 214 | - Option to turn off transcript assembly steps with param transcript_assembly 215 | 216 | ### Fixed 217 | - Fix JAFFAL terminating workflow when no fusions found. 218 | - Error if condition sheet and sample sheet don't match. 219 | - Failed to plot DE graphs when one of data sets is 0 length. 220 | 221 | ## [v0.1.5] 222 | ### Added 223 | - Differential transcript and gene expression subworkflow 224 | 225 | ## [v0.1.4] 226 | ### Added 227 | - JAFFAL fusion detection subworkflow 228 | 229 | ### Changed 230 | - Args parser for fastqingress 231 | - Set out_dir option type to ensure output is written to correct directory on Windows 232 | - Skip unnecessary conversion to fasta from fastq 233 | - Fastqingress metadata map 234 | - Changed workflow name to wf-transcriptomes 235 | 236 | ## [v0.1.3] 237 | ### Changed 238 | - Better help text on cli 239 | - Use EPI2ME Labs-maintained version of pychopper 240 | 241 | ## [v0.1.2] 242 | ### Added 243 | - direct_rna option 244 | - Some extra error handling 245 | - Minor report display improvements 246 | 247 | ## [v0.1.1] 248 | ### Fixed 249 | - Incorrect numbers and of transcripts caused by merging gff files with same gene and transcript ids 250 | - Error handling in de novo pipeline. 
Skip clusters in build_backbones that cause an isONclust2 error 251 | - Several small fixes in report plotting 252 | 253 | ## [v0.1.0] 254 | ### Added 255 | - Added the denovo pipeline 256 | 257 | ### Changed 258 | - Updates to the report plots 259 | 260 | ## [v0.0.1] 261 | ### Added 262 | - First release 263 | - Initial port of Snakemake WF from https://github.com/nanoporetech/pipeline-nanopore-ref-isoforms 264 | 265 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. 
License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. 
Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /bin/de_analysis.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(argparser)) 4 | 5 | parser <- arg_parser("Run differential expression analysis") 6 | parser <- add_argument(parser, "--annotation", help="Reference annotation.") 7 | parser <- add_argument(parser, "--min_samps_gene_expr", help="Minimum number of samples a gene must be expressed in to be included in differential gene expression.", type="numeric") 8 | parser <- add_argument(parser, "--min_samps_feature_expr", help="Minimum number of samples for differential transcript usage.", type="numeric") 9 | parser <- add_argument(parser, "--min_gene_expr", help="Minimum counts per gene required for differential gene expression.", type="numeric") 10 | parser <- add_argument(parser, "--min_feature_expr", help="Minimum counts per transcript required for differential transcript usage.", type="numeric") 11 | parser <- add_argument(parser, "--sample_sheet", help="Sample sheet.") 12 | parser <- add_argument(parser, "--all_counts", help="All transcript counts CSV file.") 13 | parser <- add_argument(parser, "--de_out_dir", help="Directory where differential expression out files will be saved. 
Directory will be created if it does not exist", default="de_analysis") 14 | parser <- add_argument(parser, "--merged_out_dir", help="Directory where merged count files will be saved. Directory will be created if it does not exist", default="merged") 15 | argv <- parse_args(parser) 16 | 17 | suppressMessages(library("DRIMSeq")) 18 | suppressMessages(library("GenomicFeatures")) 19 | suppressMessages(library("edgeR")) 20 | 21 | # Create output directories 22 | if (!dir.exists(argv$de_out_dir)){ 23 | dir.create(argv$de_out_dir, recursive=TRUE) 24 | } 25 | if (!dir.exists(argv$merged_out_dir)){ 26 | dir.create(argv$merged_out_dir, recursive=TRUE) 27 | } 28 | 29 | cat("Loading counts, conditions and parameters.\n") 30 | cts <- as.matrix(read.csv(argv$all_counts, sep="\t", row.names="Reference", stringsAsFactors=FALSE)) 31 | 32 | # Set up sample data frame: 33 | #changed this to sample_id 34 | coldata <- read.csv(argv$sample_sheet, row.names="alias", sep=",", stringsAsFactors=TRUE) 35 | 36 | coldata$sample_id <- rownames(coldata) 37 | # check if control condition exists, sets as reference 38 | if(!"control" %in% coldata$condition) 39 | stop("sample_sheet.csv does not contain 'control' 40 | condition - unable to set reference.") 41 | coldata$condition <- relevel(coldata$condition, ref = "control") 42 | 43 | # a .gff annotation file extension may be gff2(gtf) or gff3 so check in files for use of = in the attribute field 44 | # if '=' present it is gff3 if not it is gtf. 45 | # see https://www.ensembl.org/info/website/upload/gff.html 46 | # and http://gmod.org/wiki/GFF2#Converting_GFF2_to_GFF3 47 | cat("Checking annotation file type.\n") 48 | lines <- readLines(file(argv$annotation), n=10000) 49 | # If transcript_id containing '=' (format eg. transcript_id=xxx) 50 | # annotation type is gff3 51 | check_file_type <- sum(grepl("transcript_id=", lines)) 52 | if (check_file_type != 0){ 53 | cat("Annotation file type is gff3.\n") 54 | annotation_type <- "gff3" 55 | } else { 56 | # otherwise gtf 57 | cat("Annotation file type is gtf.\n") 58 | annotation_type <- "gtf" 59 | } 60 | 61 | # Transcript_id versions (eg. ENTXXX.1, eg. ENTXXX.2) represent how many times that transcript reference has been changed 62 | # during its time in the database. 63 | # Not all annotation files include it as part of the transcript_id - notably Ensembl 64 | # The following handles this. 65 | cat("Checking annotation file for presence of transcript_id versions.\n") 66 | # Get the first transcript_id from the annotation file by parsing 67 | lines <- readLines(file(argv$annotation), n=100000) 68 | # Find transcript_ids in first 1000 lines and check if they contain dot (format eg. 
ENTXXX.1) 69 | check_version <- sum(grepl("transcript_id[^;]+\\.", lines)) 70 | if (check_version != 0){ 71 | # we do not need to strip the count file rows if ref_annotation includes versions 72 | cat("Annotation file transcript_ids include versions.\n") 73 | } else { 74 | # otherwise remove the versions 75 | rownames(cts) <- lapply(rownames(cts), sub, pattern = "\\.\\d+$", replacement = "") 76 | cat("Annotation file transcript_ids do not include versions so also strip versions from the counts df.\n") 77 | } 78 | 79 | cat("Loading annotation database.\n") 80 | txdb <- makeTxDbFromGFF(argv$annotation, format = annotation_type) 81 | txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID") 82 | tab <- table(txdf$GENEID) 83 | txdf$ntx<- tab[match(txdf$GENEID, names(tab))] 84 | 85 | 86 | cts <- cts[rownames(cts) %in% txdf$TXNAME, ] # FIXME: filter for transcripts which are in the annotation. Why they are not all there? 87 | 88 | # Reorder transcript/gene database to match input counts: 89 | txdf <- txdf[match(rownames(cts), txdf$TXNAME), ] 90 | rownames(txdf) <- NULL 91 | 92 | # Create counts data frame: 93 | counts<-data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts) 94 | 95 | # output unfiltered version of the counts table now we have paired transcripts with gene ids 96 | write.table(counts, file=file.path(argv$de_out_dir, "unfiltered_transcript_counts_with_genes.tsv"), sep="\t", row.names = FALSE, quote=FALSE) 97 | 98 | cat("Filtering counts using DRIMSeq.\n") 99 | 100 | d <- dmDSdata(counts=counts, samples=coldata) 101 | trs_cts_unfiltered <- counts(d) 102 | 103 | d <- dmFilter(d, min_samps_gene_expr=argv$min_samps_gene_expr, min_samps_feature_expr=argv$min_samps_feature_expr, 104 | min_gene_expr=argv$min_gene_expr, min_feature_expr=argv$min_feature_expr) 105 | 106 | cat("Building model matrix.\n") 107 | design <- model.matrix(~condition, data=DRIMSeq::samples(d)) 108 | 109 | 110 | 111 | suppressMessages(library("dplyr")) 112 | 113 | # Sum transcript counts into gene counts: 114 | cat("Sum transcript counts into gene counts.\n") 115 | trs_cts <- counts(d) 116 | write.table(trs_cts, file=file.path(argv$merged_out_dir, "filtered_transcript_counts_with_genes.tsv"), sep="\t", row.names = FALSE, quote=FALSE) 117 | 118 | gene_cts <- trs_cts_unfiltered %>% dplyr::select(c(1, 3:ncol(trs_cts))) %>% group_by(gene_id) %>% summarise_all(tibble::lst(sum)) %>% data.frame() 119 | rownames(gene_cts) <- gene_cts$gene_id 120 | gene_cts$gene_id <- NULL 121 | write.table(gene_cts, file=file.path(argv$merged_out_dir, "all_gene_counts.tsv"), sep="\t", quote=FALSE) 122 | 123 | # Output count per million of the gene counts using edgeR CPM 124 | cpm_gene_counts <- cpm(gene_cts) 125 | # Add gene_id as index column header 126 | cpm_gene_counts <- cbind(var_name = rownames(cpm_gene_counts), cpm_gene_counts) 127 | rownames(cpm_gene_counts) <- NULL 128 | colnames(cpm_gene_counts)[1] <- "gene_id" 129 | write.table(cpm_gene_counts, file=file.path(argv$de_out_dir, "cpm_gene_counts.tsv"), sep="\t", quote=FALSE, row.names = FALSE) 130 | 131 | # Differential gene expression using edgeR: 132 | cat("Running differential gene expression analysis using edgeR.\n") 133 | 134 | y <- DGEList(gene_cts) 135 | y <- calcNormFactors(y) 136 | y <- estimateDisp(y,design) 137 | fit <- glmQLFit(y,design) 138 | qlf <- glmQLFTest(fit) 139 | edger_res <- topTags(qlf, n=nrow(y), sort.by="PValue")[[1]] 140 | 141 | pdf("de_analysis/results_dge.pdf") 142 | 143 | # create status vector 144 | status <- ifelse( 145 | qlf$PValue<0.01 & 
qlf$logFC>0, 146 | 'up', 147 | ifelse( 148 | qlf$PValue<0.01 & qlf$logFC<=0, 149 | 'down', 150 | 'notsig' 151 | ) 152 | ) 153 | plotMD(qlf, status=status, values=c("up","down","notsig"), hl.col=c("red","blue","black")) 154 | abline(h=c(-1,1), col="blue") 155 | plotQLDisp(fit) 156 | 157 | write.table(as.data.frame(edger_res), file=file.path(argv$de_out_dir, "results_dge.tsv"), sep="\t") 158 | 159 | # Differential transcript usage using DEXSeq: 160 | suppressMessages(library("DEXSeq")) 161 | cat("Running differential transcript usage analysis using DEXSeq.\n") 162 | 163 | sample.data<-DRIMSeq::samples(d) 164 | count.data <- round(as.matrix(counts(d)[,-c(1:2)])) 165 | dxd <- DEXSeqDataSet(countData=count.data, sampleData=sample.data, design=~sample + exon + condition:exon, featureID=trs_cts$feature_id, groupID=trs_cts$gene_id) 166 | dxd <- estimateSizeFactors(dxd) 167 | dxd <- estimateDispersions(dxd) 168 | dxd <- testForDEU(dxd, reducedModel=~sample + exon) 169 | dxd <- estimateExonFoldChanges( dxd, fitExpToVar="condition") 170 | dxr <- DEXSeqResults(dxd, independentFiltering=FALSE) 171 | 172 | dev.off() 173 | pdf("de_analysis/results_dtu.pdf") 174 | plotMA(dxr, cex=0.8, alpha=0.05) 175 | plotDispEsts(dxd) 176 | 177 | qval <- perGeneQValue(dxr) 178 | dxr.g<-data.frame(gene=names(qval), qval) 179 | dxr.g <- dxr.g[order(dxr.g$qval),] 180 | 181 | dxr_out <- as.data.frame(dxr[,c("featureID", "groupID", "pvalue")]) 182 | dxr_out <- dxr_out[order(dxr$pvalue),] 183 | 184 | write.table(dxr.g, file=file.path(argv$de_out_dir, "results_dtu_gene.tsv"), sep="\t") 185 | write.table(dxr_out, file=file.path(argv$de_out_dir, "results_dtu_transcript.tsv"), sep="\t") 186 | 187 | # and writing out some of the DEXSeq metrics to accompany EPI2ME Labs tutorial 188 | colnames(dxr)[grep("log2fold", colnames(dxr))] <- "log2fold" 189 | MADTUdata <- data.frame(dxr)[order(dxr$padj),c("exonBaseMean", "log2fold", "pvalue", "padj")] 190 | MADTUdata$exonBaseMean <- log2(MADTUdata$exonBaseMean) 191 | colnames(MADTUdata)[which(colnames(MADTUdata)=="exonBaseMean")] <- "Log2MeanExon" 192 | colnames(MADTUdata)[which(colnames(MADTUdata)=="log2fold")] <- "Log2FC" 193 | write.table(MADTUdata, file=file.path(argv$de_out_dir, "results_dexseq.tsv"), sep="\t") 194 | 195 | # stageR analysis of DEXSeq results: 196 | cat("stageR analysis\n") 197 | library(stageR) 198 | 199 | cat("Running stageR analysis on the differential transcript usage results.\n") 200 | pConfirmation <- matrix(dxr$pvalue, ncol=1) 201 | 202 | dimnames(pConfirmation) <- list(dxr$featureID, "transcript") 203 | pScreen <- qval 204 | tx2gene <- as.data.frame(dxr[,c("featureID", "groupID")]) 205 | 206 | stageRObj <- stageRTx(pScreen=pScreen, pConfirmation=pConfirmation, pScreenAdjusted=TRUE, tx2gene=tx2gene) 207 | # note: the choice of 0.05 here means you can *only* threshold at 5% OFDR later 208 | stageRObj <- stageWiseAdjustment(stageRObj, method="dtu", alpha=0.10) 209 | suppressWarnings({dex.padj <- getAdjustedPValues(stageRObj, order=FALSE, onlySignificantGenes=FALSE)}) 210 | 211 | # dex.padj <- dex.padj[,-1] 212 | write.table(dex.padj, file=file.path(argv$de_out_dir, "results_dtu_stageR.tsv"), sep="\t") 213 | -------------------------------------------------------------------------------- /bin/plot_dtu_results.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(argparser)) 4 | 5 | parser <- arg_parser("Plot results") 6 | parser <- add_argument(parser, "--counts", help="Filtered 
transcript counts with genes.") 7 | parser <- add_argument(parser, "--results_dtu", help="stageR results.") 8 | parser <- add_argument(parser, "--sample_sheet", help="Sample sheet.") 9 | parser <- add_argument(parser, "--pdf_out", help="PDF file name.") 10 | argv <- parse_args(parser) 11 | 12 | suppressMessages(library(dplyr)) 13 | suppressMessages(library(ggplot2)) 14 | suppressMessages(library(tidyr)) 15 | 16 | # Set up sample data frame: 17 | coldata <- read.csv(argv$sample_sheet, row.names="alias", sep=",") 18 | coldata$condition <- factor(coldata$condition, levels=rev(levels(coldata$condition))) 19 | coldata$type <-NULL 20 | coldata$patient <-NULL 21 | 22 | # Read stageR results: 23 | stageR <- read.csv(argv$results_dtu, sep="\t") 24 | names(stageR) <- c("gene_id", "transcript_id", "p_gene", "p_transcript"); 25 | 26 | # Read filtered counts: 27 | counts <- read.csv(argv$counts, sep="\t"); 28 | names(counts)[2]<-"transcript_id" 29 | 30 | # Join counts and stageR results: 31 | df <- counts %>% left_join(stageR, by = c("gene_id", "transcript_id")) 32 | df <- df[order(df$p_gene),] 33 | 34 | scols <- setdiff(names(df),c("gene_id", "transcript_id", "p_gene", "p_transcript")) 35 | 36 | # Normalise counts: 37 | for(sc in scols){ 38 | df[sc] <- df[sc] / sum(df[sc]) 39 | } 40 | 41 | # Melt data frame: 42 | tdf <- df %>% gather(key='sample', value='norm_count',-gene_id, -transcript_id, -p_gene, -p_transcript) 43 | 44 | # Add sample group column: 45 | sampleToGroup<-function(x){ 46 | return(coldata[x,]$condition) 47 | } 48 | 49 | tdf$group <- sampleToGroup(tdf$sample) 50 | 51 | # Filter for significant genes: 52 | sig_level <- 0.05 53 | genes <- as.character(tdf[which(tdf$p_gene < sig_level),]$gene_id) 54 | genes <- unique(genes) 55 | 56 | pdf(argv$pdf_out) 57 | 58 | for(gene in genes){ 59 | gdf<-tdf[which(tdf$gene_id==gene),] 60 | p_gene <- unique(gdf$p_gene) 61 | dtu_plot <- ggplot(gdf, aes(x=transcript_id, y=norm_count)) + geom_bar(stat="identity", aes(fill=sample), position="dodge") 62 | dtu_plot <- dtu_plot + facet_wrap(~ group) + coord_flip() 63 | dtu_plot <- dtu_plot + ggtitle(paste(gene," : p_value=",p_gene,sep="")) 64 | print(dtu_plot) 65 | } 66 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Entry point for sc_tools (single_cell_tools).""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == '__main__': 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components 
= dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | except ModuleNotFoundError as e: 44 | # if imports cannot be satisifed, refuse to add the component 45 | # rather than exploding 46 | logger.warn(f"Could not load {name} due to missing module {e.name}") 47 | continue 48 | 49 | # if theres a main() and and argparser() thats good enough for us. 50 | try: 51 | req = "main", "argparser" 52 | if all(callable(getattr(mod, x)) for x in req): 53 | components[name] = mod 54 | except Exception: 55 | pass 56 | return components 57 | 58 | 59 | def cli(): 60 | """Run workflow entry points.""" 61 | logger = get_main_logger(_package_name) 62 | logger.info("Bootstrapping CLI.") 63 | parser = argparse.ArgumentParser( 64 | 'wf-glue', 65 | parents=[_log_level()], 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | 68 | parser.add_argument( 69 | '-v', '--version', action='version', 70 | version='%(prog)s {}'.format(__version__)) 71 | 72 | subparsers = parser.add_subparsers( 73 | title='subcommands', description='valid commands', 74 | help='additional help', dest='command') 75 | subparsers.required = True 76 | 77 | # importing everything can take time, try to shortcut 78 | if len(sys.argv) > 1: 79 | components = get_components(allowed_components=[sys.argv[1]]) 80 | if not sys.argv[1] in components: 81 | logger.warn("Importing all modules, this may take some time.") 82 | components = get_components() 83 | else: 84 | components = get_components() 85 | 86 | # add all module parsers to main CLI 87 | for name, module in components.items(): 88 | p = subparsers.add_parser( 89 | name.split(".")[-1], parents=[module.argparser()]) 90 | p.set_defaults(func=module.main) 91 | 92 | args = parser.parse_args() 93 | 94 | logger.info("Starting entrypoint.") 95 | args.func(args) 96 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_sample_sheet_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Check if a sample sheet is valid.""" 3 | from collections import Counter 4 | import csv 5 | import sys 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def main(args): 11 | """Run the entry point.""" 12 | logger = get_named_logger("checkSheetCondition") 13 | with open(args.sample_sheet, "r") as f: 14 | csv_reader = csv.DictReader(f) 15 | conditions_count = Counter() 16 | for row in csv_reader: 17 | if "condition" in row: 18 | conditions_count[row['condition']] += 1 19 | else: 20 | sys.exit( 21 | "Sample sheet has no condition column " 22 | "which is required for the " 23 | "differential expression subworkflow.") 24 | if len(conditions_count.keys()) != 2: 25 | sys.exit( 26 | "There must be only two unique conditions " 27 | "in the condition column of the sample sheet.") 28 | if "control" not in conditions_count: 29 | sys.exit( 30 | "One of the condition types must be control, " 31 | "to indicate which samples to use as the reference.") 32 | if any(v < 2 for v in conditions_count.values()): 33 | sys.exit( 34 | "There must be at least 2 repeats for each " 35 
| "condition indicated in the sample sheet.") 36 | logger.info(f"Checked sample sheet for condition column {args.sample_sheet}.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_sample_sheet_condition") 42 | parser.add_argument("sample_sheet", help="Sample sheet to check") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/de_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create de report section.""" 3 | import json 4 | import os 5 | 6 | from dominate.tags import h5, p 7 | from dominate.util import raw 8 | from ezcharts import scatterplot 9 | from ezcharts.components.ezchart import EZChart 10 | from ezcharts.layout.snippets import DataTable 11 | from natsort import natsorted 12 | import numpy as np 13 | import pandas as pd 14 | 15 | 16 | def flagstats_df(flagstats_reports): 17 | """Flag stats alignment dataframe.""" 18 | flagstats_dic = {} 19 | for flagstat in flagstats_reports.iterdir(): 20 | with open(flagstat, "r") as f: 21 | data = json.load(f) 22 | data = data["QC-passed reads"] 23 | flagstats = [ 24 | 'mapped', 'primary mapped', 'secondary', 'supplementary'] 25 | per_sample_flagstats = {key: data.get(key) for key in flagstats} 26 | sample = os.path.basename(flagstat).split(".")[0] 27 | flagstats_dic[sample] = per_sample_flagstats 28 | alignment_summary_df = pd.DataFrame(flagstats_dic) 29 | alignment_summary_df = alignment_summary_df[ 30 | natsorted(alignment_summary_df.columns)] 31 | alignment_summary_df.index = [ 32 | "Total Read Mappings", 33 | "Primary", "Secondary", 34 | "Supplementary"] 35 | alignment_summary_df.index.name = "Statistic" 36 | return alignment_summary_df 37 | 38 | 39 | def dexseq_section(dexseq_file, tr_id_to_gene_name, tr_id_to_gene_id, pval_thresh): 40 | """Add gene isoforms table and plot.""" 41 | h5("Differential Isoform usage") 42 | p("""Table showing gene isoforms, ranked by adjusted 43 | p-value, from the DEXSeq analysis. Information shown includes the log2 fold 44 | change between experimental conditions, the log-scaled transcript 45 | abundance and the false discovery corrected p-value (FDR - Benjamini-Hochberg) . 46 | This table has not been filtered 47 | for genes that satisfy statistical or magnitudinal thresholds""") 48 | 49 | dexseq_results = pd.read_csv(dexseq_file, sep='\t') 50 | dexseq_results.index.name = "gene_id:transcript_id" 51 | 52 | # Replace any occurrences of stringtie-generated MSTRG gene ids with 53 | # reference gene_ids. 54 | dexseq_results.index = dexseq_results.index.map( 55 | lambda ge_tr: str( # lookup gene_id from transcript_id [1] 56 | f"{tr_id_to_gene_id.get(ge_tr.split(':')[1])}: {str(ge_tr.split(':')[1])}") 57 | ) 58 | 59 | # Add gene name column. 60 | dexseq_results.insert(0, "gene_name", dexseq_results.index.map( 61 | lambda x: tr_id_to_gene_name.get(x.split(':')[1]))) 62 | 63 | DataTable.from_pandas( 64 | dexseq_results.sort_values(by='pvalue', ascending=True), use_index=True) 65 | 66 | p( 67 | """The figure below presents the MA plot from the DEXSeq analysis. 68 | M is the log2 ratio of isoform transcript abundance between conditions. 69 | A is the log2 transformed mean abundance value. 
70 | Transcripts that satisfy the logFC and FDR-corrected 71 | (False discovery rate - Benjamini-Hochberg) p-value 72 | thresholds defined are shaded as 'Up-' or 'Down-' regulated.""") 73 | 74 | dexseq_results['direction'] = 'not_sig' 75 | 76 | dexseq_results.loc[ 77 | (dexseq_results["Log2FC"] > 0) & (dexseq_results['pvalue'] < pval_thresh), 78 | 'direction'] = 'up' 79 | 80 | dexseq_results.loc[ 81 | (dexseq_results["Log2FC"] <= 0) & (dexseq_results['pvalue'] < pval_thresh), 82 | 'direction'] = 'down' 83 | 84 | plot = scatterplot( 85 | data=dexseq_results, x='Log2MeanExon', y='Log2FC', hue='direction', 86 | palette=['#E32636', '#7E8896', '#0A22DE'], 87 | hue_order=['up', 'down', 'not_sig'], marker='circle') 88 | plot._fig.xaxis.axis_label = "A (log2 transformed mean exon read counts)" 89 | plot._fig.yaxis.axis_label = "M (log2 transformed differential abundance)" 90 | plot.legend = dict(orient='horizontal', top=30) 91 | plot._fig.title = "Average copy per million (CPM) vs Log-fold change (LFC)" 92 | EZChart(plot) 93 | 94 | 95 | def dtu_section(dtu_file, txid_to_gene_name): 96 | """Plot dtu section.""" 97 | dtu_results = pd.read_csv(dtu_file, sep='\t') 98 | dtu_results["gene_name"] = dtu_results["txID"].apply( 99 | lambda x: txid_to_gene_name.get(x)) 100 | 101 | dtu_pvals = dtu_results.sort_values(by='gene', ascending=True) 102 | raw("""Table showing gene and transcript identifiers 103 | and their FDR-corrected (False discovery rate - Benjamini-Hochberg) probabilities 104 | for the genes and their isoforms that have been 105 | identified as showing DTU using the R packages DEXSeq and StageR. 106 | This list has been shortened requiring that both gene and transcript 107 | must satisfy the p-value 108 | threshold""") 109 | DataTable.from_pandas(dtu_results.loc[dtu_pvals.index], use_index=False) 110 | 111 | raw("""View dtu_plots.pdf file to see plots of differential isoform usage""") 112 | 113 | 114 | def dge_section(df, pval_thresh): 115 | """Create DGE table and MA plot.""" 116 | h5("Differential gene expression") 117 | df[['logFC', 'logCPM', 'F']] = df[ 118 | ['logFC', 'logCPM', 'F']].round(2) 119 | 120 | p("""Table showing the genes from the edgeR analysis. 121 | Information shown includes the log2 fold change between 122 | experimental conditions, the log-scaled counts per million measure of abundance 123 | and the FDR-corrected p-value (False discovery rate - Benjamini-Hochberg). 124 | This table has not been 125 | filtered for genes that satisfy statistical or magnitudinal thresholds""") 126 | 127 | df = df.sort_values('FDR', ascending=True) 128 | df.index.name = 'gene_id' 129 | DataTable.from_pandas(df, use_index=True) 130 | 131 | h5("Results of the edgeR Analysis.") 132 | 133 | p("""This plot visualises differences in measurements between the 134 | two experimental conditions. M is the log2 ratio of gene expression 135 | calculated between the conditions. 136 | A is a log2 transformed mean expression value. 137 | The figure below presents the MA figure from this edgeR analysis. 138 | Genes that satisfy the logFC and FDR-corrected 139 | (False discovery rate - Benjamini-Hochberg) p-value thresholds 140 | defined are shaded as 'Up-' or 'Down-' regulated. 
141 | """) 142 | df['sig'] = None 143 | df.loc[(df["logFC"] > 0) & (df['PValue'] < pval_thresh), 'sig'] = 'up' 144 | df.loc[(df["logFC"] <= 0) & (df['PValue'] < pval_thresh), 'sig'] = 'down' 145 | df.loc[(df["PValue"] >= pval_thresh), 'sig'] = 'not_sig' 146 | 147 | plot = scatterplot( 148 | data=df, x='logCPM', y='logFC', hue='sig', 149 | palette=['#E32636', '#7E8896', '#0A22DE'], 150 | hue_order=['up', 'not_sig', 'down'], marker='circle') 151 | plot._fig.x_range.start = 10 152 | plot._fig.xaxis.axis_label = "Average log CPM" 153 | plot._fig.yaxis.axis_label = "Log-fold change" 154 | plot.legend = dict(orient='horizontal', top=30) 155 | # Should opacity of the symbols be lowered? 156 | plot._fig.title = "Average copy per million (CPM) vs Log-fold change (LFC)" 157 | EZChart(plot) 158 | 159 | 160 | def salmon_table(salmon_counts): 161 | """Create salmon counts summary table.""" 162 | salmon_counts = pd.read_csv(salmon_counts, sep='\t') 163 | salmon_counts.set_index("Reference", drop=True, append=False, inplace=True) 164 | salmon_size_top = salmon_counts.sum(axis=1).sort_values(ascending=False) 165 | salmon_counts = salmon_counts.applymap(np.int64) 166 | h5("Transcripts Per Million") 167 | p("""Table showing the annotated Transcripts Per Million 168 | identified by Minimap2 mapping and Salmon transcript 169 | detection. Displaying the top 100 transcripts with the highest 170 | number of mapped reads""") 171 | 172 | salmon_counts = salmon_counts[sorted(salmon_counts.columns)] 173 | DataTable.from_pandas( 174 | salmon_counts.loc[salmon_size_top.index].head(n=100), use_index=True) 175 | 176 | 177 | def get_translations(gtf): 178 | """Create gene_and transcript id mappings. 179 | 180 | Annotation can be stringtie-generated (GTF) or from the input 181 | reference annotation (GTF or GFF3) and the various attributes can differ 182 | """ 183 | with open(gtf) as fh: 184 | txid_to_gene_name = {} 185 | gid_to_gene_name = {} 186 | tx_id_to_gene_id = {} 187 | 188 | def get_feature(row, feature): 189 | return row.split(feature)[1].split( 190 | ";")[0].replace('=', '').replace("\"", "").strip() 191 | 192 | for gff_entry in fh: 193 | # Process transcripts features only 194 | if gff_entry.startswith("#") or gff_entry.split('\t')[2] != 'transcript': 195 | continue 196 | # Different gtf/gff formats contain different attributes 197 | # and different formating (eg. gene_name="xyz" or gene_name "xyz") 198 | gene_name = gene_id = transcript_id = 'unknown' 199 | 200 | if 'ref_gene_id' in gff_entry: 201 | # Favour ref_gene_id over gene_id. 
The latter can be multi-locus merged 202 | # genes from stringtie 203 | gene_id = get_feature(gff_entry, 'ref_gene_id') 204 | elif 'gene_id' in gff_entry: 205 | gene_id = get_feature(gff_entry, 'gene_id') 206 | else: 207 | gene_id = get_feature(gff_entry, 'gene') 208 | 209 | if 'transcript_id' in gff_entry: 210 | transcript_id = get_feature(gff_entry, 'transcript_id') 211 | 212 | if 'gene_name' in gff_entry: 213 | gene_name = get_feature(gff_entry, 'gene_name') 214 | else: 215 | # Fallback to gene_id if gene_name is not present 216 | gene_name = gene_id 217 | 218 | txid_to_gene_name[transcript_id] = gene_name 219 | tx_id_to_gene_id[transcript_id] = gene_id 220 | gid_to_gene_name[gene_id] = gene_name 221 | return txid_to_gene_name, tx_id_to_gene_id, gid_to_gene_name 222 | 223 | 224 | def de_section( 225 | annotation, dge, dexseq, dtu, 226 | tpm, report, filtered, unfiltered, 227 | gene_counts, flagstats_dir, pval_threshold=0.01): 228 | """Differential expression sections.""" 229 | with (report.add_section("Differential expression", "DE")): 230 | 231 | p("""This section shows differential gene expression 232 | and differential isoform usage. Salmon was used to 233 | assign reads to individual annotated isoforms defined by 234 | the GTF-format annotation. 235 | These counts were used to perform a statistical analysis to identify 236 | the genes and isoforms that show differences in abundance between 237 | the experimental conditions. 238 | Any novel genes or transcripts that do not have relevant gene or transcript IDs 239 | are prefixed with MSTRG for use in differential expression analysis. 240 | Find the full sequences of any transcripts in the 241 | final_non_redundant_transcriptome.fasta file. 242 | """) 243 | alignment_summary_df = flagstats_df(flagstats_dir) 244 | h5("Alignment summary stats") 245 | DataTable.from_pandas(alignment_summary_df, use_index=True) 246 | salmon_table(tpm) 247 | 248 | # Get translations for adding gene names to tables 249 | ( 250 | txid_to_gene_name, txid_to_gene_id, gid_to_gene_name 251 | ) = get_translations(annotation) 252 | 253 | # Add gene names columns to counts files and write out 254 | # for publishing to user dir. 
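        # The blocks below emit, in order: results_dge.tsv (per-gene edgeR results),
        # all_gene_counts.tsv, filtered/unfiltered_transcript_counts_with_genes.tsv
        # and unfiltered_tpm_transcript_counts.tsv, each with a gene_name column
        # added via the translation dicts built above.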
255 | df_dge = pd.read_csv(dge, sep='\t') 256 | df_dge.insert(0, 'gene_name', df_dge.index.map( 257 | lambda x: gid_to_gene_name.get(x))) 258 | df_dge.to_csv('results_dge.tsv', index=True, index_label="gene_id", sep="\t") 259 | 260 | df_gene_counts = pd.read_csv(gene_counts, sep='\t') 261 | df_gene_counts.insert( 262 | 0, 'gene_name', df_gene_counts.index.map( 263 | lambda x: gid_to_gene_name.get(x))) 264 | df_gene_counts.to_csv( 265 | 'all_gene_counts.tsv', index=True, index_label="gene_id", sep="\t") 266 | 267 | df_filtered = pd.read_csv(filtered, sep='\t') 268 | df_filtered.insert(1, "gene_name", df_filtered.gene_id.map( 269 | lambda x: gid_to_gene_name.get(x))) 270 | df_filtered.to_csv( 271 | 'filtered_transcript_counts_with_genes.tsv', index=False, sep='\t') 272 | 273 | df_unfiltered = pd.read_csv(unfiltered, sep='\t') 274 | df_unfiltered.insert(1, "gene_name", df_unfiltered.gene_id.map( 275 | lambda x: gid_to_gene_name.get(x))) 276 | df_unfiltered.to_csv( 277 | 'unfiltered_transcript_counts_with_genes.tsv', index=False, sep='\t') 278 | 279 | df_tpm = pd.read_csv(tpm, sep='\t') 280 | df_tpm.insert(1, "gene_name", df_tpm.Reference.map( 281 | lambda x: txid_to_gene_name.get(x))) 282 | df_tpm.to_csv("unfiltered_tpm_transcript_counts.tsv", index=False, sep='\t') 283 | 284 | # Add tables to report 285 | dge_section(df_dge, pval_threshold) 286 | dexseq_section(dexseq, txid_to_gene_name, txid_to_gene_id, pval_threshold) 287 | dtu_section(dtu, txid_to_gene_name) 288 | -------------------------------------------------------------------------------- /bin/workflow_glue/generate_pychopper_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Generate CSV of pychopper stats.""" 3 | 4 | # -*- coding: utf-8 -*- 5 | 6 | import os 7 | 8 | import pandas as pd 9 | 10 | from .util import wf_parser # noqa: ABS101 11 | 12 | 13 | def argparser(): 14 | """Argument parser for entrypoint.""" 15 | parser = wf_parser("generate_pychopper_stats") 16 | parser.add_argument("--data", required=True, help="") 17 | parser.add_argument("--output_dir", required=True, help="") 18 | 19 | return parser 20 | 21 | 22 | def generate_pychopper_stats(tsv, output): 23 | """Make CSV of pychopper stats.""" 24 | classified_path = os.path.join(output, "pychopper_stats.csv") 25 | df = pd.read_csv(tsv, sep="\t", index_col="Name") 26 | classified = df.loc[df["Category"] == "Classification"]\ 27 | .copy().reset_index().rename(columns={'Name': 'Classification'}) 28 | classified["Percentage"] = \ 29 | 100 * classified["Value"] / classified["Value"].sum() 30 | tuning = df.loc[df["Category"] == "AutotuneSample"]\ 31 | .copy().reset_index().rename(columns={'Name': 'Filter'}) 32 | tuning.to_csv(classified_path) 33 | 34 | 35 | def main(args): 36 | """Run entry point.""" 37 | assert os.path.isfile(args.data) 38 | assert os.path.isdir(args.output_dir) 39 | generate_pychopper_stats(tsv=args.data, output=args.output_dir) 40 | -------------------------------------------------------------------------------- /bin/workflow_glue/merge_count_tsvs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Merge salmon output count files.""" 3 | 4 | from functools import reduce 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from .util import wf_parser # noqa: ABS101 10 | 11 | 12 | def argparser(): 13 | """Argument parser for entrypoint.""" 14 | parser = wf_parser("merge_count_tsvs") 15 | parser.add_argument( 16 | '-j', 
metavar='join', help="Join type (outer).", default="outer") 17 | parser.add_argument( 18 | '-f', metavar='field', 19 | help="Join on this field (Reference).", default="Reference") 20 | parser.add_argument( 21 | '-o', metavar='out_tsv', 22 | help="Output tsv (merge_tsvs.tsv).", default="merge_tsvs.tsv") 23 | parser.add_argument( 24 | '-z', action="store_true", 25 | help="Fill NA values with zero.", default=False) 26 | parser.add_argument( 27 | '-tpm', type=bool, default=False, 28 | help="TPM instead of counts") 29 | parser.add_argument( 30 | '-tsvs', metavar='input_tsvs', nargs='*', 31 | help="Input tab separated files.") 32 | 33 | return parser 34 | 35 | 36 | def main(args): 37 | """Run entry point.""" 38 | dfs = {x: pd.read_csv(x, sep="\t") for x in args.tsvs} 39 | 40 | ndfs = [] 41 | for x, df in dfs.items(): 42 | # Transform counts to integers: 43 | if args.tpm: 44 | df = df.rename(columns={'TPM': 'Count', 'Name': 'Reference'}) 45 | else: 46 | df = df.rename(columns={'NumReads': 'Count', 'Name': 'Reference'}) 47 | df.Count = np.array(df.Count, dtype=int) 48 | # Take only non-zero counts: 49 | df = df[df.Count > 0] 50 | df = df[["Reference", "Count"]] 51 | df = df.sort_values(by=["Count"], ascending=False) 52 | name = x.split('.')[0] 53 | df = df.rename(columns={'Count': name}) 54 | ndfs.append(df) 55 | dfs = ndfs 56 | 57 | df_merged = reduce(lambda left, right: pd.merge( 58 | left, right, on=args.f, how=args.j), dfs) 59 | if args.z: 60 | df_merged = df_merged.fillna(0) 61 | df_merged = df_merged.sort_index(axis=1) 62 | df_merged = df_merged.sort_index(axis=0) 63 | 64 | df_merged.to_csv(args.o, sep="\t", index=False) 65 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of scripts for results models.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/common.py: -------------------------------------------------------------------------------- 1 | """Common model classes used across all workflows.""" 2 | from dataclasses import asdict, dataclass, field 3 | from decimal import Decimal 4 | from enum import Enum 5 | import json 6 | from typing import Any, Dict, List 7 | 8 | 9 | class SampleType(str, Enum): 10 | """The type of the sample.""" 11 | 12 | no_template_control = "no_template_control" 13 | positive_control = "positive_control" 14 | negative_control = "negative_control" 15 | test_sample = "test_sample" 16 | 17 | def friendly_name(self): 18 | """Convert sample type to string.""" 19 | return self.name.replace("_", " ").capitalize() 20 | 21 | 22 | @dataclass 23 | class SampleIdentifier: 24 | """Additional identifiers for a sample.""" 25 | 26 | name: str = field( 27 | metadata={ 28 | "title": "Identifier name", 29 | "Description": "The name of the sample identifier"}) 30 | value: str = field( 31 | metadata={ 32 | "title": "Identifier value", 33 | "Description": "The value of the sample identifier"}) 34 | 35 | 36 | @dataclass 37 | class CheckResult: 38 | """ 39 | A result of some check the workflow has performed. 40 | 41 | This can be at sample or workflow level. 
42 | """ 43 | 44 | check_category: str = field( 45 | metadata={ 46 | "title": "Check category", 47 | "description": "The category of the check"}) 48 | check_name: str = field( 49 | metadata={ 50 | "title": "Check name", 51 | "description": "The name of the check"}) 52 | check_pass: bool = field( 53 | metadata={ 54 | "title": "Check pass", 55 | "description": "If true the check has passed"}) 56 | check_threshold: str | None = field( 57 | default=None, metadata={ 58 | "title": "Check threshold", 59 | "description": "The threshold for the check, useful for reporting later"}) 60 | 61 | categories = {} 62 | 63 | def friendly_check_category(self): 64 | """Convert category to string.""" 65 | if self.check_category not in self.categories: 66 | raise ValueError(f"{self.check_category} has no friendly name") 67 | return self.categories[self.check_category] 68 | 69 | def friendly_check_name(self): 70 | """Convert check name to string.""" 71 | return self.check_name.replace("_", " ").capitalize() 72 | 73 | 74 | @dataclass 75 | class ResultsContents: 76 | """Placeholder class for results contents.""" 77 | 78 | pass 79 | 80 | 81 | @dataclass 82 | class Sample: 83 | """A sample sheet entry and its corresponding checks and related results.""" 84 | 85 | alias: str = field( 86 | metadata={ 87 | "title": "Sample alias", 88 | "description": "The alias for the sample given by the user"}) 89 | barcode: str = field( 90 | metadata={ 91 | "title": "Sample barcode", 92 | "description": "The physical barcode assigned to the sample"}) 93 | sample_type: SampleType = field( 94 | metadata={ 95 | "title": "Sample type", 96 | "description": "The type of the sample"}) 97 | sample_pass: bool = field( 98 | metadata={ 99 | "title": "Sample pass", 100 | "description": "If true the sample has passed workflow checks"}) 101 | additional_identifiers: List[SampleIdentifier] = field( 102 | default_factory=list, metadata={ 103 | "title": "Additional sample identifiers", 104 | "description": "Addition identifiers for the sample"}) 105 | sample_checks: list[CheckResult] = field( 106 | default_factory=list, metadata={ 107 | "title": "Sample checks", 108 | "description": "An array of checks performed on the sample"}) 109 | results: ResultsContents | None = field( 110 | default=None, metadata={ 111 | "title": "Sample results", 112 | "description": "Further specific workflow results for this sample"}) 113 | config: Dict[str, Any] | None = field( 114 | default=None, metadata={ 115 | "title": "Sample configuration", 116 | "description": """Sample specific config parameters 117 | used for running analysis"""}) 118 | 119 | def __post_init__(self): 120 | """Determine overall status for a sample given the individual check results.""" 121 | self.sample_pass = all( 122 | check.check_pass for check in self.sample_checks) 123 | 124 | def get_sample_identifier(self, sample_identifier): 125 | """Get a sample identifier given the identifier name.""" 126 | for indentifier in self.additional_identifiers: 127 | if indentifier.name == sample_identifier: 128 | return indentifier.value 129 | raise KeyError("Sample identifier not found") 130 | 131 | def set_sample_identifier(self, name, value): 132 | """Set a sample identifier.""" 133 | sample_identifier = SampleIdentifier( 134 | name=name, 135 | value=value) 136 | self.additional_identifiers.append(sample_identifier) 137 | return self.additional_identifiers 138 | 139 | def to_json(self, filename): 140 | """Save class as JSON.""" 141 | with open(filename, 'w') as f: 142 | json.dump(asdict(self), f, 
default=str, indent=2, cls=DecimalEncoder) 143 | 144 | 145 | @dataclass 146 | class RunStats: 147 | """Basic run statistics for the entire run.""" 148 | 149 | total_reads: int | None = field( 150 | default=None, metadata={ 151 | "title": "Total reads", 152 | "description": "Total number of reads on run"}) 153 | total_ambiguous_reads: int | None = field( 154 | default=None, metadata={ 155 | "title": "Total ambiguous reads", 156 | "description": "Number of reads of unknown provenance"}) 157 | total_unaligned_reads: int | None = field( 158 | default=None, metadata={ 159 | "title": "Total unaligned reads", 160 | "description": "Number of unaligned reads"}) 161 | 162 | 163 | @dataclass 164 | class WorkflowResult(): 165 | """ 166 | Definition for results that will be returned by this workflow. 167 | 168 | This structure will be passed through by Gizmo speaking clients 169 | as WorkflowInstance.results. 170 | """ 171 | 172 | samples: list[Sample] = field( 173 | metadata={ 174 | "title": "Samples", 175 | "description": "Samples in this workflow instance"}) 176 | workflow_pass: bool | None = field( 177 | default=None, metadata={ 178 | "title": "Workflow pass", 179 | "description": "True if this workflow instance passes all checks"}) 180 | workflow_checks: list[CheckResult] = field( 181 | default_factory=list, metadata={ 182 | "title": "Workflow checks", 183 | "description": "An array of checks performed on the workflow instance"}) 184 | run_stats: RunStats | None = field( 185 | default=None, metadata={ 186 | "title": "Samples", 187 | "description": "Basic run statistics"}) 188 | client_fields: dict[str, Any] | None = field( 189 | default_factory=dict, metadata={ 190 | "title": "Client fields", 191 | "description": "Arbitrary key-value pairs provided by the client"}) 192 | 193 | def to_json(self, filename): 194 | """Save class as JSON.""" 195 | with open(filename, 'w') as f: 196 | json.dump(asdict(self), f, default=str, indent=2, cls=DecimalEncoder) 197 | 198 | 199 | class DecimalEncoder(json.JSONEncoder): 200 | """This should probably be moved.""" 201 | 202 | def default(self, obj): 203 | """Override the default method to handle Decimal objects.""" 204 | if isinstance(obj, Decimal): 205 | return float(obj) 206 | return super().default(obj) 207 | -------------------------------------------------------------------------------- /bin/workflow_glue/parse_gffcompare.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Make report tables and data for plotting.""" 3 | import os 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def argparser(): 12 | """Argument parser for entrypoint.""" 13 | parser = wf_parser("Parse gffcompare") 14 | parser.add_argument( 15 | '--sample_id', help="Sample ID", required=True) 16 | parser.add_argument( 17 | '--gffcompare_dir', 18 | help="The gffcompare output directory", 19 | required=False, 20 | type=Path) 21 | parser.add_argument( 22 | '--isoform_table_out', 23 | help="Output path for per-isoform table", 24 | type=Path) 25 | parser.add_argument( 26 | '--tracking', 27 | help="gffcompare tracking file", 28 | type=Path) 29 | parser.add_argument( 30 | "--annotation", 31 | required=False, 32 | default=None, help="Reference annotation GFF file") 33 | 34 | return parser 35 | 36 | 37 | def _parse_stat_line(sl): 38 | """Parse a stats line.""" 39 | res = {} 40 | tmp = sl.split(':')[1].split('|') 41 | 
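    # A stats line looks roughly like "Base level:   89.5   |   95.2   |",
    # so after splitting on ':' and then '|' the first field is sensitivity
    # and the second is precision.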
res['sensitivity'] = float(tmp[0].strip()) 42 | res['precision'] = float(tmp[1].strip()) 43 | return res 44 | 45 | 46 | def _parse_matching_line(line): 47 | """Parse a matching line.""" 48 | tmp = line.split(':')[1].strip() 49 | return int(tmp) 50 | 51 | 52 | def _parse_mn_line(line): 53 | """Parse a miss or novel line.""" 54 | res = {} 55 | tmp = line.split(':')[1].strip() 56 | tmp = tmp.split('/') 57 | res['value'] = int(tmp[0]) 58 | tmp = tmp[1].split('(') 59 | res['value_total'] = int(tmp[0].strip()) 60 | res['percent'] = float(tmp[1].split('%)')[0]) 61 | return res 62 | 63 | 64 | def _parse_total_line(line): 65 | """Parse a total line.""" 66 | res = {} 67 | tmp = line.split(':')[1].strip() 68 | tmp = tmp.split('in') 69 | res['transcripts'] = int(tmp[0].strip()) 70 | tmp = tmp[1].split('loci') 71 | res['loci'] = int(tmp[0].strip()) 72 | tmp = int(tmp[1].split('(')[1].split(' ')[0]) 73 | res['me_transcripts'] = tmp 74 | return res 75 | 76 | 77 | def parse_gffcmp_stats(gffcompare_stats, sample_id, outpath): 78 | """Parse a gffcompare stats file. 79 | 80 | Gffcompare stats file 81 | :param gffcompare_stats: Path to the gffcompare stats file. 82 | :returns: Return as tuple of dataframes containing: 83 | perfromance statistics, match statistics, miss statistics, 84 | novel statistics, total statistics. 85 | :rtype: tuple 86 | """ 87 | performance = [] 88 | missed = [] 89 | novel = [] 90 | total = [] 91 | 92 | with open(gffcompare_stats, 'r') as fh: 93 | for line in fh: 94 | line = line.strip() 95 | if len(line) == 0: 96 | continue 97 | 98 | # Parse totals: 99 | if line.startswith('# Query mRNAs'): 100 | r = _parse_total_line(line) 101 | total.append([r['loci'], 'loci', 'query']) 102 | total.append([r['transcripts'], 'transcripts', 'query']) 103 | total.append([r['me_transcripts'], 'multexonic', 'query']) 104 | if line.startswith('# Reference mRNAs '): 105 | r = _parse_total_line(line) 106 | total.append([r['loci'], 'loci', 'reference']) 107 | total.append([r['transcripts'], 'transcripts', 'reference']) 108 | total.append([r['me_transcripts'], 'multexonic', 'reference']) 109 | 110 | # Parse basic statistics: 111 | if line.startswith('Base level'): 112 | st = _parse_stat_line(line) 113 | performance.append((st['sensitivity'], 'Sensitivity', 'Base')) 114 | performance.append((st['precision'], 'Precision', 'Base')) 115 | if line.startswith('Exon level'): 116 | st = _parse_stat_line(line) 117 | performance.append((st['sensitivity'], 'Sensitivity', 'Exon')) 118 | performance.append((st['precision'], 'Precision', 'Exon')) 119 | if line.startswith('Intron level'): 120 | st = _parse_stat_line(line) 121 | performance.append((st['sensitivity'], 'Sensitivity', 'Intron')) 122 | performance.append((st['precision'], 'Precision', 'Intron')) 123 | if line.startswith('Intron chain level'): 124 | st = _parse_stat_line(line) 125 | performance.append((st['sensitivity'], 'Sensitivity', 'Intron_chain')) 126 | performance.append((st['precision'], 'Precision', 'Intron_chain')) 127 | if line.startswith('Transcript level'): 128 | st = _parse_stat_line(line) 129 | performance.append((st['sensitivity'], 'Sensitivity', 'Transcript')) 130 | performance.append((st['precision'], 'Precision', 'Transcript')) 131 | if line.startswith('Locus level'): 132 | st = _parse_stat_line(line) 133 | performance.append((st['sensitivity'], 'Sensitivity', 'Locus')) 134 | performance.append((st['precision'], 'Precision', 'Locus')) 135 | 136 | # Parse missing statistics: 137 | if line.startswith('Missed exons'): 138 | r = 
_parse_mn_line(line) 139 | missed.append((r['value'], 'Missed', 'Exons')) 140 | missed.append((r['value_total'], 'total', 'Exons')) 141 | missed.append((r['percent'], 'Percent', 'Exons')) 142 | if line.startswith('Missed introns'): 143 | r = _parse_mn_line(line) 144 | missed.append((r['value'], 'Missed', 'Introns')) 145 | missed.append((r['value_total'], 'total', 'Introns')) 146 | missed.append((r['percent'], 'Percent', 'Introns')) 147 | if line.startswith('Missed loci'): 148 | r = _parse_mn_line(line) 149 | missed.append((r['value'], 'Missed', 'Loci')) 150 | missed.append((r['value_total'], 'total', 'Loci')) 151 | missed.append((r['percent'], 'Percent', 'Loci')) 152 | 153 | # Parse novel statistics: 154 | if line.startswith('Novel exons'): 155 | r = _parse_mn_line(line) 156 | novel.append((r['value'], 'Novel', 'Exons')) 157 | novel.append((r['value_total'], 'Total', 'Exons')) 158 | novel.append((r['percent'], 'Percent_novel', 'Exons')) 159 | if line.startswith('Novel introns'): 160 | r = _parse_mn_line(line) 161 | novel.append((r['value'], 'Novel', 'Introns')) 162 | novel.append((r['value_total'], 'Total', 'Introns')) 163 | novel.append((r['percent'], 'Percent_novel', 'Introns')) 164 | if line.startswith('Novel loci'): 165 | r = _parse_mn_line(line) 166 | novel.append((r['value'], 'Novel', 'Loci')) 167 | novel.append((r['value_total'], 'Total', 'Loci')) 168 | novel.append((r['percent'], 'Percent_novel', 'Loci')) 169 | 170 | def write_records(records, fn): 171 | pd.DataFrame.from_records(records, columns=['counts', 'type', 'source']) \ 172 | .to_csv(outpath / fn, sep='\t') 173 | 174 | write_records(total, 'Totals.tsv') 175 | write_records(missed, 'Missed.tsv') 176 | write_records(performance, 'Performance.tsv') 177 | write_records(novel, 'Novel.tsv') 178 | 179 | 180 | def tracking_summary(tracking_file, output_dir, annotations=None): 181 | """Write per transcript class gffcompare tracking files.""" 182 | tracking_headings = [ 183 | "query_transfrag_id", "query_locus_id", "ref_gene_id", 184 | "class", "details"] 185 | nice_names = { 186 | '=': 'complete', 'c': 'contained', 'k': 'containment', 187 | 'm': 'retained', 'n': 'retained (partial)', 'j': 'multi', 188 | 'e': 'single', 'o': 'overlap', 's': 'opposite', 189 | 'x': 'exonic', 'i': 'intron', 'y': 'contains', 'p': 'runon', 190 | 'r': 'repeat', 'u': 'unknown'} 191 | 192 | if os.path.exists(annotations): 193 | tracking = pd.read_csv( 194 | tracking_file, sep="\t", names=tracking_headings[1:], 195 | index_col=0) 196 | 197 | df = ( 198 | pd.DataFrame(tracking['class'].value_counts()) 199 | .reset_index() 200 | .rename(columns={'index': 'class', 'class': 'Count'}) 201 | ) 202 | 203 | df['Percent'] = round(df['Count'] * 100 / df['Count'].sum(), 2) 204 | df['description'] = [nice_names[x] for x in df['class']] 205 | 206 | df = df.sort_values('Count', ascending=True) 207 | df.to_csv(output_dir / 'tracking_summary.tsv', sep='\t') 208 | 209 | else: 210 | logger = get_named_logger('trackingSum') 211 | logger.info("Skipping classification summary as no annotation provided.") 212 | 213 | 214 | def make_isoform_table(gffcompare_dir, sample_id, outpath): 215 | """Make an isoform summary table.""" 216 | try: 217 | tmap_file = next(gffcompare_dir.glob('*.tmap')) 218 | except StopIteration: 219 | raise ValueError("Cannot find .tmap file in {}".format(gffcompare_dir)) 220 | dtypes = { 221 | 'ref_gene_id': str, 222 | 'ref_id': str, 223 | 'class_code': str, 224 | 'qry_id': str, 225 | 'num_exons': np.uint16, 226 | 'cov': np.uint32, 227 | 'len': np.uint32 228 | 
} 229 | df = pd.read_csv( 230 | tmap_file, sep='\t+', 231 | index_col=None, 232 | usecols=list(dtypes.keys()), 233 | dtype=dtypes) 234 | 235 | if df.empty: # No transcripts. Write a header only result file 236 | df = pd.DataFrame( 237 | columns=list(dtypes.keys()) + ['sample_id', 'parent gene iso num']) 238 | df.to_csv(f'{sample_id}_transcripts_table.tsv', sep='\t', index=False) 239 | else: 240 | df = df.assign(sample_id=sample_id) 241 | 242 | # Make a column of number of isoforms in parent gene 243 | gb = df.groupby(['ref_gene_id']).count() 244 | gb.rename(columns={'ref_id': 'num_isoforms'}, inplace=True) 245 | 246 | df['parent gene iso num'] = df.apply( 247 | lambda x: gb.loc[(x.ref_gene_id), 'num_isoforms'], axis=1) 248 | 249 | # Unclassified transcripts should not be lumped together 250 | df.loc[df.class_code == 'u', 'parent gene iso num'] = None 251 | 252 | df.to_csv(outpath, sep='\t', index=False) 253 | 254 | 255 | def main(args): 256 | """Entry point.""" 257 | if args.gffcompare_dir: # TODO: should this every be optional? 258 | stats = args.gffcompare_dir / 'str_merged.stats' 259 | parse_gffcmp_stats(stats, args.sample_id, args.gffcompare_dir) 260 | make_isoform_table(args.gffcompare_dir, args.sample_id, args.isoform_table_out) 261 | tracking_summary( 262 | args.tracking, args.gffcompare_dir, args.annotation) 263 | -------------------------------------------------------------------------------- /bin/workflow_glue/summarise_gff.py: -------------------------------------------------------------------------------- 1 | """Get summary statistics from GFF file.""" 2 | from collections import Counter 3 | from pathlib import Path 4 | import pickle 5 | 6 | import gffutils 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Argument parser for entrypoint.""" 12 | parser = wf_parser("summ_gff") 13 | parser.add_argument( 14 | "gff", 15 | help="Report output file", 16 | type=Path) 17 | parser.add_argument( 18 | "sample_id", 19 | help="Output TSV file path") 20 | parser.add_argument( 21 | "out", 22 | default="gff_summary.tsv", 23 | help="Output TSV file path", 24 | type=Path) 25 | 26 | return parser 27 | 28 | 29 | def main(args): 30 | """Entry point.""" 31 | db = gffutils.create_db( 32 | str(args.gff), dbfn=':memory:', force=True, keep_order=True, 33 | merge_strategy='merge', sort_attribute_values=True 34 | ) 35 | 36 | num_transcripts = db.count_features_of_type('transcript') 37 | num_genes = db.count_features_of_type('gene') 38 | 39 | transcript_lens = [] 40 | exons_per_transcript = Counter() 41 | isoforms_per_gene = Counter() 42 | 43 | for gene in db.features_of_type('gene'): 44 | 45 | n_isos = len(list(db.children(gene, featuretype='transcript'))) 46 | isoforms_per_gene[n_isos] += 1 47 | 48 | for transcript in db.children( 49 | gene, featuretype='transcript', order_by='start'): 50 | tr_len = 0 51 | exons = list(db.children(transcript, featuretype='exon')) 52 | if len(exons) == 0: 53 | continue 54 | exons_per_transcript[len(exons)] += 1 55 | for ex in exons: 56 | tr_len += abs(ex.end - ex.start) 57 | 58 | transcript_lens.append(tr_len) 59 | 60 | results = { 61 | 'sample_id': args.sample_id, 62 | 'summaries': { 63 | 'Total genes': [num_genes], 64 | 'Total transcripts': [num_transcripts], 65 | 'Max trans. len': max(transcript_lens), 66 | 'Min trans. 
len': min(transcript_lens) 67 | }, 68 | 'transcript_lengths': transcript_lens, 69 | 'exons_per_transcript': exons_per_transcript, 70 | 'isoforms_per_gene': isoforms_per_gene 71 | } 72 | 73 | with open(args.out, 'wb') as fh: 74 | pickle.dump(results, fh) 75 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Pytests argument definitions.""" 3 | 4 | 5 | def pytest_addoption(parser): 6 | """Define command line arguments for pytest.""" 7 | parser.addoption( 8 | "--test_data", 9 | action="store", 10 | default="/host/test_data" 11 | ) 12 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_check_sample_sheet_condition.py: -------------------------------------------------------------------------------- 1 | """Test check_sample_sheet.py.""" 2 | import os 3 | 4 | import pytest 5 | from workflow_glue import check_sample_sheet_condition 6 | 7 | 8 | # define a list of error messages 9 | ERROR_MESSAGES = [ 10 | ("sample_sheet_1.csv", "There must be only two unique conditions in the condition column of the sample sheet."), # noqa: E501 11 | ("sample_sheet_2.csv", "Sample sheet has no condition column which is required for the differential expression subworkflow."), # noqa: E501 12 | ("sample_sheet_3.csv", "There must be at least 2 repeats for each condition indicated in the sample sheet."), # noqa: E501 13 | ("sample_sheet_4.csv", "One of the condition types must be control, to indicate which samples to use as the reference."), # noqa: E501 14 | ] 15 | 16 | 17 | @pytest.fixture 18 | def test_data(request): 19 | """Define data location fixture.""" 20 | return os.path.join( 21 | request.config.getoption("--test_data"), 22 | "workflow_glue", 23 | "check_sample_sheet_condition") 24 | 25 | 26 | @pytest.mark.parametrize("sample_sheet_name,error_msg", ERROR_MESSAGES) 27 | def test_check_sample_sheet( 28 | test_data, sample_sheet_name, error_msg): 29 | """Test the sample sheets.""" 30 | expected_error_message = error_msg 31 | sample_sheet_path = f"{test_data}/{sample_sheet_name}" 32 | args = check_sample_sheet_condition.argparser().parse_args( 33 | [sample_sheet_path] 34 | ) 35 | try: 36 | check_sample_sheet_condition.main(args) 37 | except SystemExit as e: 38 | assert str(e) == expected_error_message 39 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_de_plots.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | from pathlib import Path 3 | 4 | import pytest 5 | from workflow_glue.de_plots import get_translations 6 | 7 | 8 | @pytest.fixture 9 | def test_data(request): 10 | """Define data location fixture.""" 11 | return Path(request.config.getoption("--test_data")) / "workflow_glue" 12 | 13 | 14 | @pytest.mark.parametrize( 15 | 'annotation_file,expected', 16 | [ 17 | [ 18 | 'MSTRG.11088.gtf', 19 | dict(gid_to_gene_name={ 20 | 'ENSG00000236051.7': 'MYCBP2-AS1', 21 | 'ENSG00000283208.2': 'ENSG00000283208', 22 | 'ENSG00000102805.16': 'CLN5', 23 | 'MSTRG.11088': 'MSTRG.11088' 24 | }, 25 | txid_to_gene_name={ 
26 | 'ENST00000636183.2': 'CLN5', 27 | 'ENST00000636780.2': 'CLN5', 28 | 'ENST00000638147.2': 'ENSG00000283208', 29 | 'ENST00000637192.1': 'ENSG00000283208', 30 | 'ENST00000636737.1': 'MYCBP2-AS1', 31 | 'ENST00000450627.6': 'MYCBP2-AS1', 32 | 'MSTRG.11088.2': 'MSTRG.11088' 33 | }, 34 | txid_to_gene_id={ 35 | 'ENST00000636183.2': 'ENSG00000102805.16', 36 | 'MSTRG.11088.2': 'MSTRG.11088', 37 | 'ENST00000636780.2': 'ENSG00000102805.16', 38 | 'ENST00000638147.2': 'ENSG00000283208.2', 39 | 'ENST00000637192.1': 'ENSG00000283208.2', 40 | 'ENST00000636737.1': 'ENSG00000236051.7', 41 | 'ENST00000450627.6': 'ENSG00000236051.7' 42 | }) 43 | 44 | ], 45 | # Small test to check that GFF3 works 46 | [ 47 | 'MSTRG.11088.gff3', 48 | dict(gid_to_gene_name={ 49 | "ENSG00000290825.1": "DDX11L2", 50 | "ENSG00000236397.3": "DDX11L2" 51 | }, 52 | txid_to_gene_name={ 53 | "ENST00000456328.2": "DDX11L2", 54 | "ENST00000437401.1": "DDX11L2" 55 | }, 56 | txid_to_gene_id={ 57 | 'ENST00000437401.1': 'ENSG00000236397.3', 58 | 'ENST00000456328.2': 'ENSG00000290825.1' 59 | }) 60 | ] 61 | ] 62 | ) 63 | def test_get_translations(test_data, annotation_file, expected): 64 | """Test that correct feature identifiers are extracted from the annotation. 65 | 66 | `stringtie --merge` can sometimes generate gene models that may span multiple 67 | reference genes. Possibly related issue: 68 | https://github.com/gpertea/stringtie/issues/217 69 | This can lead to the original genes and transcripts being assigned to that 70 | incorrectly-merged gene model. The test data contains such a gene model generated 71 | from `stringtie --merge` but actually consists of multiple different genes. 72 | 73 | 74 | """ 75 | input_gtf = test_data / annotation_file 76 | txid_to_gene_name, txid_to_gene_id, gid_to_gene_name = get_translations(input_gtf) 77 | 78 | assert expected['gid_to_gene_name'] == gid_to_gene_name 79 | assert expected['txid_to_gene_name'] == txid_to_gene_name 80 | assert expected['txid_to_gene_id'] == txid_to_gene_id 81 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 
26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 
43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
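    # variables; the script prints e.g. "HAS_VALID_INDEX=0" or "HAS_VALID_INDEX=1"
    # for the caller to parse.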
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | from ..util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | # Common variables 11 | REF_EXTENSIONS = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 12 | DATA_TYPES_LISTS = { 13 | "bam": ["bam"], 14 | "bam_idx": ["bam.bai"], 15 | "cram": ["cram"], 16 | "cram_idx": ["cram.crai"], 17 | "vcf": ["vcf", "vcf.gz"], 18 | "vcf_idx": ["vcf.gz.tbi", "vcf.gz.csi"], 19 | "bcf": ["bcf"], 20 | "bcf_idx": ["bcf.csi"], 21 | "gtf": ["gtf", "gtf.gz"], 22 | "gtf_idx": ["gtf.gz.tbi"], 23 | "gff": ["gff", "gff.gz", "gff3", "gff3.gz"], 24 | "gff_idx": ["gff.gz.tbi", "gff3.gz.tbi"], 25 | "bed": ["bed", "bed.gz"], 26 | "bed_idx": ["bed.gz.tbi"], 27 | "bedmethyl": ["bedmethyl", "bedmethyl.gz"], 28 | "bedmethyl_idx": ["bedmethyl.gz.tbi"], 29 | "ref": REF_EXTENSIONS, 30 | } 31 | DATA_TYPES = { 32 | ext: ftype for ftype, extlist in DATA_TYPES_LISTS.items() for ext in extlist 33 | } 34 | 35 | # Data by idx 36 | DATA_INDEXES_FMT = { 37 | fmt: f"{fmt}_idx" for fmt, dtype in DATA_TYPES.items() if "_idx" not in dtype 38 | } 39 | 40 | # Assign each format to its index 41 | INDEX_PAIRS = { 42 | "bam": ("bai",), 43 | "cram": ("crai",), 44 | "vcf": ("tbi", "csi"), 45 | "bcf": ("csi",), 46 | "bed": ("tbi",), 47 | "bedmethyl": ("tbi",), 48 | "gff": ("tbi",), 49 | "gtf": ("tbi",), 50 | } 51 | 52 | 53 | class TrackBuilder: 54 | """Class that builds an IGV track.""" 55 | 56 | def __init__(self): 57 | """Initialize properties for interval track.""" 58 | # Reference properties 59 | self.ref = None 60 | self.fai = None 61 | self.gzi = None 62 | # Samples info 63 | self.samples = {} 64 | # Track properties 65 | self.igv_json = {"reference": {}, "tracks": []} 66 | self.track_type = { 67 | "bam": "alignment", 68 | "cram": "alignment", 69 | "bcf": "variant", 70 | "vcf": "variant", 71 | "bedmethyl": "annotation", 72 | "bed": "annotation", 73 | "gtf": "annotation", 74 | "gff": "annotation", 75 | } 76 | # Here we save aliases of file formats that IGV.js 77 | # wants and that do not match the input file extension. 78 | self.igv_fmt_alias = {"gff": "gff3"} 79 | # lookup of extra options for each data type 80 | self.extra_opts_lookups = { 81 | "bam": {}, 82 | "cram": {}, 83 | "bcf": {}, 84 | "vcf": {}, 85 | "bed": {}, 86 | "bedmethyl": {}, 87 | "gtf": {}, 88 | "gff": {}, 89 | } 90 | 91 | def add_ref(self, ref=None): 92 | """Add reference file, unless already defined.""" 93 | if self.ref: 94 | raise Exception( 95 | f"Reference genome has already been set to {self.ref}.\n" 96 | "Only one reference FASTA file is expected." 
97 | ) 98 | else: 99 | self.ref = ref 100 | 101 | def add_ref_index(self, ref_index=None): 102 | """Add reference index if valid.""" 103 | basename = Path(self.ref).name 104 | idx_basename = Path(ref_index).name 105 | if idx_basename == f"{basename}.fai": 106 | self.fai = ref_index 107 | if idx_basename == f"{basename}.gzi" and basename.endswith(".gz"): 108 | self.gzi = ref_index 109 | 110 | def parse_fnames(self, fofn): 111 | """Parse list with filenames and return them grouped. 112 | 113 | :param fofn: File with list of file names (one per line) 114 | """ 115 | tmp_samples = {} 116 | with open(fofn, "r") as f: 117 | for line in f: 118 | # If the line contains the sample name, prepare the data structure 119 | if "," in line: 120 | sample, fname = line.strip().split(",") 121 | if sample not in tmp_samples: 122 | tmp_samples[sample] = SampleBundle(sample=sample) 123 | tmp_samples[sample].append(fname) 124 | else: 125 | # Otherwise, assign everything to NO_SAMPLE 126 | # Files will still be displayed, but in no specific order. 127 | fname = line.strip() 128 | if any(fname.endswith(ext) for ext in REF_EXTENSIONS): 129 | self.add_ref(ref=fname) 130 | elif fname.endswith(".fai") or fname.endswith(".gzi"): 131 | self.add_ref_index(ref_index=fname) 132 | else: 133 | if "NO_SAMPLE" not in tmp_samples.keys(): 134 | tmp_samples["NO_SAMPLE"] = SampleBundle(sample="NO_SAMPLE") 135 | tmp_samples["NO_SAMPLE"].append(fname) 136 | # Re-order samples in dict and add them to the list, leaving 137 | # NO_SAMPLE as last 138 | sorted_samples = ( 139 | sorted([sample for sample in tmp_samples.keys() if sample != 'NO_SAMPLE']) 140 | ) 141 | if 'NO_SAMPLE' in tmp_samples.keys(): 142 | sorted_samples += ['NO_SAMPLE'] 143 | for sample in sorted_samples: 144 | self.samples[sample] = tmp_samples[sample] 145 | 146 | def build_igv_json(self): 147 | """Ensure there is a reference genome.""" 148 | if not self.ref: 149 | raise ValueError( 150 | "No reference file (i.e. file ending in one of " 151 | f"{REF_EXTENSIONS} was found)." 152 | ) 153 | # Evaluate that a bgzipped reference has the appropriate index. 154 | if self.ref.endswith(".gz") and not self.gzi: 155 | raise ValueError(f"GZI reference index for {self.ref} not found.") 156 | 157 | # Create the base track if there is a reference genome. 158 | self.igv_json["reference"] = { 159 | "id": "ref", 160 | "name": "ref", 161 | "wholeGenomeView": False, 162 | "fastaURL": self.ref, 163 | } 164 | if self.fai: 165 | self.igv_json["reference"]["indexURL"] = self.fai 166 | if self.gzi: 167 | self.igv_json["reference"]["compressedIndexURL"] = self.gzi 168 | 169 | # Add samples data now 170 | for sample, bundle in self.samples.items(): 171 | bundle.process_data() 172 | # Add the bundled data to the tracks 173 | for fname, index, file_fmt in bundle.data_bundles: 174 | self.add_track( 175 | fname, 176 | file_fmt, 177 | sample_name=sample if sample != "NO_SAMPLE" else None, 178 | index=index, 179 | extra_opts=self.extra_opts_lookups[file_fmt], 180 | ) 181 | 182 | def add_track(self, infile, file_fmt, sample_name=None, index=None, extra_opts={}): 183 | """Add a track to an IGV json. 184 | 185 | This function takes an input file, an optional index file, its 186 | file format and additional extra options for the track. 
187 | 188 | :param infile: input file to create a track for 189 | :param file_fmt: input file track type 190 | :param sample_name: Name of the sample to display in the track name 191 | :param index: index for the input file 192 | :param extra_opts: dict of extra options for the track 193 | :return: dict with track options 194 | """ 195 | # Define track name depending on whether the sample ID is provided 196 | track_name = Path(infile).name 197 | if sample_name: 198 | track_name = f"{sample_name}: {Path(infile).name}" 199 | track_dict = { 200 | "name": track_name, 201 | "type": self.track_type[file_fmt], 202 | "format": self.igv_fmt_alias.get(file_fmt, file_fmt), 203 | "url": infile, 204 | } 205 | # add the index, if present 206 | if index: 207 | track_dict["indexURL"] = index 208 | track_dict.update(extra_opts) 209 | self.igv_json["tracks"] += [track_dict] 210 | 211 | def add_locus(self, locus): 212 | """Add target locus to the json.""" 213 | self.igv_json["locus"] = locus 214 | 215 | def add_extra_opts( 216 | self, 217 | extra_alignment_opts=None, 218 | extra_variant_opts=None, 219 | extra_interval_opts=None, 220 | ): 221 | """Import extra options from json files.""" 222 | if extra_alignment_opts is not None: 223 | with open(extra_alignment_opts, "r") as f: 224 | extra_alignment_opts_json = json.load(f) 225 | for ftype in ["bam", "cram"]: 226 | self.extra_opts_lookups[ftype] = extra_alignment_opts_json 227 | if extra_variant_opts is not None: 228 | with open(extra_variant_opts, "r") as f: 229 | extra_variant_opts_json = json.load(f) 230 | for ftype in ["vcf", "bcf"]: 231 | self.extra_opts_lookups[ftype] = extra_variant_opts_json 232 | if extra_interval_opts is not None: 233 | with open(extra_interval_opts, "r") as f: 234 | extra_interval_opts_json = json.load(f) 235 | for ftype in ["bed", "bedmethyl", "gff", "gtf"]: 236 | self.extra_opts_lookups[ftype] = extra_interval_opts_json 237 | 238 | 239 | class SampleBundle: 240 | """Sample data class. 241 | 242 | This class stores the data for multiple tracks for a 243 | single sample, then is used to generate a collection of 244 | IGV.js tracks. 
245 | """ 246 | 247 | def __init__(self, sample): 248 | """Initialize properties for a sample.""" 249 | self.sample = sample 250 | self.infiles = [] 251 | self.data_bundles = [] 252 | 253 | def append(self, fname): 254 | """Add a new raw file to the bundle.""" 255 | self.infiles.append(fname) 256 | 257 | def process_data(self): 258 | """Process input files.""" 259 | fbasenames = [Path(fname).name for fname in self.infiles] 260 | ftypes = [self.classify_files(bname) for bname in fbasenames] 261 | self.data_bundles = self.pair_file_with_index(self.infiles, fbasenames, ftypes) 262 | 263 | @staticmethod 264 | def classify_files(fname): 265 | """Classify inputs.""" 266 | for extension, ftype in DATA_TYPES.items(): 267 | if fname.endswith(f".{extension}"): 268 | return ftype 269 | 270 | @staticmethod 271 | def pair_file_with_index(infiles, fbasenames, ftypes): 272 | """Clump files with their indexes.""" 273 | # Collect data by group type 274 | groups = {ftype: {"basenames": [], "paths": []} for ftype in set(ftypes)} 275 | # Group each file by its type and base name 276 | for ftype, fbasename, fname in zip(ftypes, fbasenames, infiles): 277 | groups[ftype]["basenames"] += [fbasename] 278 | groups[ftype]["paths"] += [fname] 279 | 280 | # Output bundles 281 | outputs = [] 282 | # Start matching the variant files 283 | for ftype, itype in DATA_INDEXES_FMT.items(): 284 | # Ignore file formats that are not present in the bundle. 285 | if ftype not in groups: 286 | continue 287 | # Make pairs of files. 288 | for fbasename, fpath in zip( 289 | groups[ftype]["basenames"], groups[ftype]["paths"] 290 | ): 291 | # Construct potential index file names based on basename of input files 292 | idx_basenames = set( 293 | [f"{fbasename}.{idx}" for idx in INDEX_PAIRS[ftype]] 294 | ) 295 | # Find which indexes are available 296 | if itype in groups.keys(): 297 | idx_basenames = list( 298 | idx_basenames.intersection(set(groups[itype]["basenames"])) 299 | ) 300 | # Get the first index (if there are more than one, 301 | # it doesn't matter) 302 | bname = idx_basenames[0] 303 | idx_fn = groups[itype]["paths"][ 304 | groups[itype]["basenames"].index(bname) 305 | ] 306 | outputs.append([fpath, idx_fn, ftype]) 307 | # Otherwise, return only the simple file. 
308 | else: 309 | outputs.append([fpath, None, ftype]) 310 | return outputs 311 | 312 | 313 | def main(args): 314 | """Run the entry point.""" 315 | logger = get_named_logger("configIGV") 316 | 317 | # parse the FOFN 318 | igv_builder = TrackBuilder() 319 | 320 | # Add the additional track configurations 321 | igv_builder.add_extra_opts( 322 | extra_alignment_opts=args.extra_alignment_opts, 323 | extra_variant_opts=args.extra_variant_opts, 324 | extra_interval_opts=args.extra_interval_opts 325 | ) 326 | 327 | # Import files 328 | igv_builder.parse_fnames(args.fofn) 329 | 330 | # initialise the IGV options dict with the reference options 331 | igv_builder.build_igv_json() 332 | 333 | # Add locus information 334 | if args.locus is not None: 335 | igv_builder.add_locus(args.locus) 336 | 337 | json.dump(igv_builder.igv_json, sys.stdout, indent=4) 338 | 339 | logger.info("Printed IGV config JSON to STDOUT.") 340 | 341 | 342 | def argparser(): 343 | """Argument parser for entrypoint.""" 344 | parser = wf_parser("configure_igv") 345 | parser.add_argument( 346 | "--fofn", 347 | required=True, 348 | help=( 349 | "File with list of names of reference / XAM / VCF files and indices " 350 | "(one filename per line)" 351 | ), 352 | ) 353 | parser.add_argument( 354 | "--locus", 355 | help="Locus string to set initial genomic coordinates to display in IGV", 356 | ) 357 | parser.add_argument( 358 | "--extra-alignment-opts", 359 | help="JSON file with extra options for alignment tracks", 360 | ) 361 | parser.add_argument( 362 | "--extra-variant-opts", 363 | help="JSON file with extra options for variant tracks", 364 | ) 365 | parser.add_argument( 366 | "--extra_interval_opts", 367 | help="JSON file with extra options for interval tracks", 368 | ) 369 | return parser 370 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for 
entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. '2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/reheader_samstream.py: -------------------------------------------------------------------------------- 1 | """Reheader a SAM in a stream. 2 | 3 | When using the bam2fq -> minimap2 pattern for (re)aligning BAM data, we 4 | lose any existing RG and PG headers. This is particularly egregious when 5 | handling basecalled data as lines related to dorado basecalling settings 6 | as well as dorado RG headers are lost, orphaning RG tags in the reads. 7 | This is problematic for downstream analyses that would like to read the 8 | XAM header to intelligently determine how to handle the reads based on 9 | the basecaller model and basecaller configuration. 10 | 11 | This script handles: 12 | - Inserting RG, PG and CO lines from an existing XAM header into the 13 | header of the SAM emitted from minimap2's alignment stream 14 | - Inserting a PG header to indicate that a call to bam2fq was made 15 | - Updating the first streamed PG.PP parent tag with the last PG.ID 16 | of the existing XAM header to maintain a chain of custody 17 | - Updating any streamed PG.ID (and PG.PP) tags to avoid collisions 18 | with inserted PG.ID 19 | 20 | Handling collisions may seem like overkill but it is anticipated that 21 | this script will be called immediately after minimap2; any previous 22 | attempt to use minimap2 will lead to ambiguity. This would be the 23 | expected case where users have used wf-basecalling or wf-alignment to 24 | align a set of reads, only to realign them to another reference (e.g. 25 | via wf-human-variation). Arguably, we should remove older references to 26 | minimap2 as they will have been invalidated by the call to bam2fq but 27 | removing PG records and sticking the PG chain back together seems more 28 | fraught with annoying future bugs than simply resolving conflicts. 29 | 30 | This script will explode on a stream that contains: 31 | - PG lines in the original header where the last PG in the chain is 32 | ambiguous, or where the parent PP IDs are not injective 33 | - PG lines in the stream that do not appear in the order of their 34 | chain (that is if a PG.PP refers to a PG.ID that has not been 35 | encountered yet) 36 | 37 | SQ lines are retained after an HD line. That is to say, the most recent 38 | set of SQ lines observed after an HD will appear in the final output. 39 | SQ, RG, PG and CO lines are emitted as a group together, with elements 40 | written out in the order observed. 41 | 42 | PG lines are naively appended to the last PG element in the chain. No 43 | attempt is made to keep multiple program chains intact as this can lead 44 | to bloated headers. Broken PG metadata is a known problem (see 45 | samtools/hts-specs#275) but one that is preferable to headers that 46 | become unmanageably large to process: there IS an upper limit to a SAM 47 | header's size after all. 48 | 49 | This script takes advantage of minimap2's SAM output to immediately 50 | reheader the stream before any downstream calls to other programs pollute 51 | the PG header.
This script is a little overkill but attempts to be robust 52 | with handling PG collisions and more obviously encapsulates reheadering 53 | behaviour, and leaves some room to do more clever things as necessary. 54 | """ 55 | from shutil import copyfileobj 56 | import sys 57 | 58 | from ..util import wf_parser # noqa: ABS101 59 | 60 | 61 | class SamHeader: 62 | """An overkill container to manage merging PG lines in SAM headers. 63 | 64 | Collision handling is simple. If a PG.ID is duplicated by the stream 65 | then we add a suffix to its name and keep an eye out for the 66 | corresponding PG.PP later. We assume that headers emitted by the 67 | stream are chronological because this script should not be called as 68 | part of any complicated pipework other than immediately following 69 | minimap2. 70 | """ 71 | 72 | def __init__(self): 73 | """Initialise a collision aware PG container.""" 74 | self.remapped_pgids = {} 75 | self.collision_suffix = 0 76 | 77 | # Default HD, in case the new stream does not provide one 78 | self.hd = "@HD\tVN:1.6\tSO:unknown" 79 | 80 | # We'll merge RG, CO and PG 81 | self.rg_records = [] 82 | self.co_records = [] 83 | self.pg_records = [] 84 | 85 | # We keep the most recently observed block of SQ records by 86 | # resetting SQ on the first SQ seen after non-SQ. We cannot 87 | # rely on HD being emitted (as minimap2 does not do this!) 88 | self.sq_records = [] 89 | self.reset_sq = False 90 | 91 | self.observed_rgids = set() 92 | self.observed_pgids = set() 93 | self.last_pgid = None 94 | 95 | @staticmethod 96 | def str_to_record(line): 97 | """Return an appropriate struct for a given string record.""" 98 | try: 99 | record_type, record_data = line.strip().split('\t', 1) 100 | except ValueError: 101 | raise Exception(f"Record type could not be determined: {line}") 102 | 103 | if len(record_type) > 3: 104 | raise Exception(f"Record type malformed: {record_type}") 105 | 106 | record = {} 107 | if record_type in ["@HD", "@CO", "@SQ"]: 108 | return record_type, record_data 109 | elif record_type in ["@RG", "@PG"]: 110 | for field in record_data.strip().split('\t'): 111 | k, v = field.split(':', 1) 112 | if len(k) == 2 and k[0].isalpha() and k[1].isalnum(): 113 | record[k] = v 114 | else: 115 | raise Exception(f"{record_type} with invalid tag: '{k}'") 116 | if "ID" not in record: 117 | raise Exception(f"{record_type} with no ID: {record_data}") 118 | return record_type, record 119 | else: 120 | raise Exception(f"Unknown record type: {line}") 121 | 122 | @staticmethod 123 | def record_to_str(record_type, record_data): 124 | """Form a string from a header record.""" 125 | if record_type in ["@PG", "@RG"]: 126 | tags = [f"{k}:{v}" for k, v in record_data.items()] 127 | return f"{record_type}\t" + '\t'.join(tags) 128 | elif record_type in ["@SQ", "@CO"]: 129 | return f"{record_type}\t{record_data}" 130 | 131 | @staticmethod 132 | def resolve_pg_chain(pg_dicts): 133 | """Check links between PG.ID and PP.ID, exploding if inconsistent.""" 134 | links = {} 135 | # Document links between all ID and their PP parent 136 | pgids_without_ppid = 0 137 | for pgd in pg_dicts: 138 | pgid = pgd["ID"] 139 | pgpp = pgd.get("PP") 140 | links[pgid] = pgpp 141 | if pgpp is None: 142 | pgids_without_ppid += 1 143 | if len(links) > 0: 144 | # If there are links, exactly one should have a None parent 145 | # to indicate the first PG in the chain. Explode if we see 146 | # no head or multiple heads. 
147 | if pgids_without_ppid == 0: 148 | raise Exception("PG chain does not have a head.") 149 | elif pgids_without_ppid > 1: 150 | raise Exception("PG chain has multiple heads.") 151 | for source in links: 152 | head = source 153 | path = [head] 154 | while True: 155 | head = links[head] 156 | if head is None: 157 | break 158 | if head in path: 159 | path.append(head) 160 | raise Exception(f"PG chain appears to contain cycle: {path}") 161 | path.append(head) 162 | # This function is only really called to catch any explosions 163 | # but we'll return the links here as it is useful for testing 164 | return links 165 | 166 | def _bump_pg_collider(self): 167 | """Alter the collision suffix after determining a collision.""" 168 | self.collision_suffix += 1 169 | 170 | def _uncollide_pgid(self, pgid): 171 | """Return an uncollided string for a given PG ID.""" 172 | new_pgid = f"{pgid}-{self.collision_suffix}" 173 | self.remapped_pgids[pgid] = new_pgid 174 | self._bump_pg_collider() 175 | return new_pgid 176 | 177 | def add_line(self, line): 178 | """Add a header line to the header.""" 179 | record_type, record = self.str_to_record(line) 180 | 181 | if record_type == "@HD": 182 | self.hd = f"@HD\t{record}" 183 | elif record_type == "@CO": 184 | self.co_records.append(record) 185 | elif record_type == "@SQ": 186 | if self.reset_sq: 187 | self.sq_records = [] 188 | self.reset_sq = False 189 | self.sq_records.append(record) 190 | elif record_type == "@RG": 191 | rgid = record["ID"] 192 | if rgid not in self.observed_rgids: 193 | self.observed_rgids.add(rgid) 194 | self.rg_records.append(record) 195 | elif record not in self.rg_records: 196 | # if rgid has been seen before, abort if this record is different 197 | raise Exception( 198 | f"Duplicate RG with ID '{rgid}' conflicts with previously seen RG with same ID." 
# noqa:E501 199 | ) 200 | elif record_type == "@PG": 201 | pgid = record["ID"] 202 | if pgid in self.observed_pgids: 203 | # collision, rewrite the pgid 204 | pgid = self._uncollide_pgid(pgid) 205 | record["ID"] = pgid 206 | else: 207 | self.observed_pgids.add(pgid) 208 | 209 | # maintain chain 210 | ppid = record.get("PP") 211 | if not ppid: 212 | # record has no parent, this is either 213 | # - the first record (last_pgid is None) so is the tail 214 | # - an inserted record that needs its parent to be the current tail 215 | if not self.last_pgid: 216 | self.last_pgid = pgid 217 | else: 218 | record["PP"] = self.last_pgid 219 | self.last_pgid = pgid 220 | else: 221 | if ppid not in self.observed_pgids: 222 | raise Exception( 223 | f"Encountered PG.PP '{ppid}' before observing corresponding PG.ID" # noqa:E501 224 | ) 225 | # remap parent id (if needed) 226 | record["PP"] = self.remapped_pgids.get(ppid, ppid) 227 | # set tail to this record 228 | self.last_pgid = pgid 229 | 230 | self.pg_records.append(record) 231 | 232 | if len(self.sq_records) > 0 and record_type != '@SQ': 233 | self.reset_sq = True 234 | 235 | return record 236 | 237 | def write_header(self, fh): 238 | """Write this header to a file handle.""" 239 | self.resolve_pg_chain(self.pg_records) # check PG header 240 | fh.write(f"{self.hd}\n") 241 | for sq in self.sq_records: 242 | fh.write(self.record_to_str("@SQ", sq) + '\n') 243 | for rg in self.rg_records: 244 | fh.write(self.record_to_str("@RG", rg) + '\n') 245 | for pg in self.pg_records: 246 | fh.write(self.record_to_str("@PG", pg) + '\n') 247 | for co in self.co_records: 248 | fh.write(self.record_to_str("@CO", co) + '\n') 249 | 250 | 251 | def reheader_samstream(header_in, stream_in, stream_out, args): 252 | """Run reheader_samstream.""" 253 | # read original header into container 254 | sh = SamHeader() 255 | for line in header_in: 256 | sh.add_line(line) 257 | 258 | # append user provided lines to container 259 | for line in args.insert: 260 | sh.add_line(line) 261 | 262 | # read the header portion of the minimap2 stream 263 | wrote_header = False 264 | for line in stream_in: 265 | if line[0] != '@': 266 | # write out header on first alignment 267 | sh.write_header(stream_out) 268 | wrote_header = True 269 | # and actually write the first alignment 270 | stream_out.write(line) 271 | break 272 | sh.add_line(line) 273 | 274 | # Pass through the rest of the alignments. 275 | # I toyed with a few ways of doing this: 276 | # - A trivial iter over the input file was slow. presumably as we incurred some 277 | # overhead calling read() and write() and decoding more than other methods. 278 | # - os.read/write avoids dealing with higher level python read/write but requires 279 | # file descriptors which rules out non-file-like objects. this made testing more 280 | # annoying as StringIO does not have a file descriptor. we could have mocked fds 281 | # but i was not happy with the discrepancy between real and test execution. 282 | # - copyfileobj with the stream_in.buffer would also avoid some of the higher 283 | # level text handling but would require all tests to provide inputs that have 284 | # an underlying binary buffer. 
it was also not possible to seek the buffer to 285 | # the position of the text stream as we've used next() to iterate over the 286 | # header lines; fixing this would have required rewriting of the header 287 | # handling or keeping track of the position in the stream ourselves which 288 | # just seemed unnecessary overkill given how we expect this program to be used. 289 | # copyfileobj on the text streams is more efficient than merely iterating the file 290 | # and dumping the lines out and seems to do the job. this keeps the code and tests 291 | # simple with minimal additional cost to performance. i anticipate any overhead of 292 | # this program will be dwarfed by that of minimap2/samtools sort anyway. 293 | # increasing the buffer size gave worse performance in my limited testing so we 294 | # leave it as the default here. 295 | copyfileobj(stream_in, stream_out) 296 | 297 | # If there were no alignments, we won't have hit the != @ case in the first stdin, 298 | # and we won't have written the header out. Write a header if we haven't already. 299 | if not wrote_header: 300 | sh.write_header(stream_out) 301 | 302 | 303 | def argparser(): 304 | """Argument parser for entrypoint.""" 305 | parser = wf_parser("reheader_samstream") 306 | parser.add_argument("header_in") 307 | parser.add_argument("--insert", action="append", default=[]) 308 | return parser 309 | 310 | 311 | def main(args): 312 | """reheader_samstream default entry point.""" 313 | with open(args.header_in) as header_in: 314 | reheader_samstream(header_in, sys.stdin, sys.stdout, args) 315 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # input: 5 | # file some_data 6 | # file extra_data 7 | # script: 8 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 9 | # """ 10 | # command ${some_data} ${extra} 11 | # """ 12 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Transcriptome analysis of cDNA and direct RNA sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | This workflow can be used for the following: 2 | 3 | + Identification of RNA transcripts using either cDNA or direct RNA reads. 4 | + Reference-aided transcriptome assembly. 5 | + Annotation of assembled transcripts. 6 | + Differential gene expression analysis using a precomputed or assembled reference transcriptome. 7 | + Differential transcript usage analysis using a precomputed or assembled reference transcriptome. -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 16 4 | + Memory = 64GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 32GB 10 | 11 | Approximate run time: 15 minutes per sample, with 1 million reads and recommended resources.
12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on the command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://www.docker.com/products/docker-desktop) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository into the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-transcriptomes --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-transcriptomes 38 | ``` 39 | 40 | A demo dataset is provided for testing the workflow. 41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo.tar.gz 44 | tar -xzvf wf-transcriptomes-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-transcriptomes \ 49 | --de_analysis \ 50 | --direct_rna \ 51 | --fastq 'wf-transcriptomes-demo/differential_expression_fastq' \ 52 | --minimap2_index_opts '-k 15' \ 53 | --ref_annotation 'wf-transcriptomes-demo/gencode.v22.annotation.chr20.gtf' \ 54 | --ref_genome 'wf-transcriptomes-demo/hg38_chr20.fa' \ 55 | --sample_sheet 'wf-transcriptomes-demo/sample_sheet.csv' \ 56 | -profile standard 57 | ``` 58 | 59 | For further information about running a workflow on 60 | the command line see https://labs.epi2me.io/wfquickstart/ 61 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 2 | 3 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | This workflow accepts either FASTQ or BAM files as input.
3 | 4 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 5 | 6 | ``` 7 | (i) (ii) (iii) 8 | input_reads.fastq ─── input_directory ─── input_directory 9 | ├── reads0.fastq ├── barcode01 10 | └── reads1.fastq │ ├── reads0.fastq 11 | │ └── reads1.fastq 12 | ├── barcode02 13 | │ ├── reads0.fastq 14 | │ ├── reads1.fastq 15 | │ └── reads2.fastq 16 | └── barcode03 17 | └── reads0.fastq 18 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | transcriptome_source | string | Select how the transcriptome used for analysis should be prepared. | For differential expression analysis, use of an existing transcriptome may be preferred and so 'precomputed' should be selected. In this case the 'ref_transcriptome' parameter should be specified. To create a reference transcriptome using an existing reference genome, select 'reference guided' and specify the 'ref_genome' parameter. | reference-guided | 8 | | ref_genome | string | Path to reference genome sequence [.fa/.fq/.fa.gz/fq.gz]. Required for reference-based workflow. | A reference genome is required for reference-based assembly of a transcriptome. | | 9 | | ref_transcriptome | string | Transcriptome reference file. Required for precomputed transcriptome calculation and for differential expression analysis. | A reference transcriptome related to the sample under study. Must be supplied when the 'Transcriptome source' parameter has been set to 'precomputed' or to perform differential expression. | | 10 | | ref_annotation | string | A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz)). 
Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported. | This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers. Note: in de_analysis mode, transcript strands must be only + or -. | | 11 | | direct_rna | boolean | Set to true for direct RNA sequencing. | Omits the pychopper step. | False | 12 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 13 | 14 | 15 | ### Output Options 16 | 17 | | Nextflow parameter name | Type | Description | Help | Default | 18 | |--------------------------|------|-------------|------|---------| 19 | | out_dir | string | Directory for output of all user-facing files. | | output | 20 | | igv | boolean | Visualize outputs in the EPI2ME IGV visualizer. | Enabling this option will visualize the output alignment files in the EPI2ME Desktop App IGV visualizer. | False | 21 | 22 | 23 | ### Sample Options 24 | 25 | | Nextflow parameter name | Type | Description | Help | Default | 26 | |--------------------------|------|-------------|------|---------| 27 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. If you are running the differential expression workflow, there must be an additional column `condition` with two labels, one of which must be `control` (e.g. `control` and `treated`). Control will indicate which samples will be used as the reference. There should be at least 3 repeats for each condition. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. | | 28 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 29 | 30 | 31 | ### Options for reference-based workflow 32 | 33 | | Nextflow parameter name | Type | Description | Help | Default | 34 | |--------------------------|------|-------------|------|---------| 35 | | plot_gffcmp_stats | boolean | Create a PDF of plots showing gffcompare results. | If set to true, a PDF file containing detailed gffcompare results will be output. | True | 36 | | gffcompare_opts | string | Extra command-line options to give to gffcompare -r | For a list of possible options see [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml). | -R | 37 | | minimap2_index_opts | string | Extra command-line options for minimap2 indexing. | See [minimap2 index options](https://lh3.github.io/minimap2/minimap2.html#4) for more information. These will only be relevant in the reference-based transcriptome assembly. | -k 14 | 38 | | minimap2_opts | string | Additional command-line options for minimap2 alignment. | See [minimap2 options](https://lh3.github.io/minimap2/minimap2.html#5) for further information. These will only be relevant in the reference-based transcriptome assembly. | -uf | 39 | | minimum_mapping_quality | integer | Filter aligned reads by MAPQ quality. | Reads that do not meet this mapping quality after minimap2 alignment will be filtered out.
| 40 | 40 | stringtie_opts | string | Extra command-line options for stringtie transcript assembly. | For additional StringTie options see [here](https://github.com/gpertea/stringtie#stringtie-options). | --conservative | 41 | 42 | 43 | ### Differential Expression Options 44 | 45 | | Nextflow parameter name | Type | Description | Help | Default | 46 | |--------------------------|------|-------------|------|---------| 47 | | de_analysis | boolean | Run DE analysis. | Running this requires you to provide at least two replicates for a control and treated sample as well as a sample sheet param. | False | 48 | | min_gene_expr | integer | The minimum number of total mapped sequence reads required for a gene to be considered in differential transcript usage analysis. | Filtering at the gene level ensures that the observed transcript ratios are calculated with a minimum number of counts per gene. | 10 | 49 | | min_feature_expr | integer | The minimum number of reads assigned to a transcript for it to be considered in differential transcript usage analysis. | Filters out transcripts that do not reach this minimum number of reads, reducing noise. | 3 | 50 | | min_samps_gene_expr | integer | Set the minimum number of samples in which a gene is expressed to be included in the differential transcript usage analysis. | A gene must be expressed in at least this number of samples for the gene to be included in the differential transcript usage analysis. Filtering at the gene level improves the reliability of the observed transcript ratios. | 3 | 51 | | min_samps_feature_expr | integer | Set the minimum number of samples in which a transcript is expressed to be included in the differential transcript usage analysis. | A transcript must be expressed in at least this minimum number of samples to be included in the analysis. Should be equal to the number of replicates per sample you have. | 1 | 52 | 53 | 54 | ### Advanced Options 55 | 56 | | Nextflow parameter name | Type | Description | Help | Default | 57 | |--------------------------|------|-------------|------|---------| 58 | | threads | integer | Number of CPU threads. | Only provided to processes including alignment and assembly that benefit from multiple threads. | 4 | 59 | | cdna_kit | string | If cDNA reads are used, select the kit used. | This will be used by pychopper to preprocess the reads for downstream analysis. | SQK-PCS109 | 60 | | pychopper_backend | string | Pychopper can use one of two available backends for identifying primers in the raw reads. | 'edlib' is set by default due to its high performance. However, it may be less sensitive than 'phmm'. | edlib | 61 | | pychopper_opts | string | Extra pychopper options. | See available options [here](https://github.com/epi2me-labs/pychopper#usage). | | 62 | | bundle_min_reads | integer | Minimum size of BAM bundle for parallel processing. | | 50000 | 63 | | isoform_table_nrows | integer | Maximum rows to display in the isoform report table. | | 5000 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated, including information for all samples, or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}.
2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-transcriptomes-report.html | an HTML report document detailing the primary findings of the workflow | aggregated | 6 | | Per file read stats | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-file-stats.tsv | A TSV with per file read stats, including all samples. | aggregated | 7 | | Read stats | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-read-stats.tsv | A TSV with per read stats, including all samples. | aggregated | 8 | | Run IDs | fastq_ingress_results/{{ alias }}//reads/fastcat_stats/run_ids | List of run IDs present in reads. | aggregated | 9 | | Meta map json | fastq_ingress_results/{{ alias }}//reads/metamap.json | Metadata used in workflow presented in a JSON. | aggregated | 10 | | Concatenated sequence data | fastq_ingress_results/{{ alias }}//reads/{{ alias }}.fastq.gz | Per sample reads concatenated into one FASTQ file. | per-sample | 11 | | Assembled transcriptome | {{ alias }}_transcriptome.fas | Per sample assembled transcriptome. Not output if a reference annotation was supplied. | per-sample | 12 | | Annotated assembled transcriptome | {{ alias }}_merged_transcriptome.fas | Per sample annotated assembled transcriptome. Only output if a reference annotation was supplied. | per-sample | 13 | | Alignment summary statistics | {{ alias }}_read_aln_stats.tsv | Per sample alignment summary statistics. | per-sample | 14 | | GFF compare results | {{ alias }}_gffcompare | All GFF compare output files. | per-sample | 15 | | Differential gene expression results | de_analysis/results_dge.tsv | This is a gene-level result file that describes genes and their probability of showing differential expression between experimental conditions. | aggregated | 16 | | Differential gene expression report | de_analysis/results_dge.pdf | Summary report of differential gene expression analysis as a PDF. | aggregated | 17 | | Differential transcript usage gene TSV | de_analysis/results_dtu_gene.tsv | This is a gene-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression. | aggregated | 18 | | Differential transcript usage report | de_analysis/results_dtu.pdf | Summary report of differential transcript usage results as a PDF. | aggregated | 19 | | Differential transcript usage TSV | de_analysis/results_dtu_transcript.tsv | This is a transcript-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression. | aggregated | 20 | | Differential transcript usage stageR TSV | de_analysis/results_dtu_stageR.tsv | This is the output from stageR and it shows both gene and transcript probabilities of differential expression. | aggregated | 21 | | Differential transcript usage DEXSeq TSV | de_analysis/results_dexseq.tsv | The complete output from the DEXSeq analysis, showing both gene and transcript probabilities of differential expression. | aggregated | 22 | | Gene counts | de_analysis/all_gene_counts.tsv | Raw gene counts created by the Salmon tool, before filtering. | aggregated | 23 | | Gene counts per million | de_analysis/cpm_gene_counts.tsv | This file shows counts per million (CPM) of the raw gene counts to facilitate comparisons across samples. | aggregated | 24 | | Transcript counts | de_analysis/unfiltered_transcript_counts_with_genes.tsv | Raw transcript counts created by the Salmon tool, before filtering.
Includes reference to the associated gene ID. | aggregated | 25 | | Transcript per million counts | de_analysis/unfiltered_tpm_transcript_counts.tsv | This file shows transcripts per million (TPM) of the raw counts to facilitate comparisons across samples. | aggregated | 26 | | Transcript counts filtered | de_analysis/filtered_transcript_counts_with_genes.tsv | Filtered transcript counts, used for differential transcript usage analysis. Includes a reference to the associated gene ID. | aggregated | 27 | | Transcript info table | {{ alias }}_transcripts_table.tsv | This file details each isoform that was reconstructed from the input reads. It contains a subset of columns from the .tmap output from [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml). | per-sample | 28 | | Final non-redundant transcriptome | de_analysis/final_non_redundant_transcriptome.fasta | Transcripts that were used for differential expression analysis, including novel transcripts, with the identifiers used for DE analysis. Only applicable when the ref_transcriptome parameter is not provided. | aggregated | 29 | | Index of reference FASTA file | igv_reference/{{ ref_genome_file }}.fai | Reference genome index of the FASTA file required for IGV config. | aggregated | 30 | | GZI index of the reference FASTA file | igv_reference/{{ ref_genome_file }}.gzi | GZI index of the reference FASTA file. | aggregated | 31 | | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reference. | aggregated | 32 | | BAM file (minimap2) | BAMS/{{ alias }}.reads_aln_sorted.bam | BAM file generated from mapping input reads to the reference. | per-sample | 33 | | BAM index file (minimap2) | BAMS/{{ alias }}.reads_aln_sort.bam.bai | Index file generated from mapping input reads to the reference. | per-sample | 34 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Concatenate input files and generate per read stats. 2 | The [fastcat](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per read stats including average read lengths and qualities. 3 | 4 | ### 2. Preprocess cDNA. 5 | If input sequences are cDNA, [Pychopper](https://github.com/epi2me-labs/pychopper) is used to orient, trim and rescue full-length cDNA reads and generate associated statistics. If the `direct_rna` parameter is selected, this step will be skipped. 6 | 7 | ### 3. Build transcriptome. 8 | If the `transcriptome_source` parameter is "reference-guided", a transcriptome will be built for each sample as outlined below. If the `transcriptome_source` is "precomputed" and the `ref_transcriptome` parameter is provided, the workflow will skip step 3. 9 | 10 | #### 3.1 Align reads to the reference genome. 11 | The reference genome will be indexed and the reads aligned to it using [Minimap2](https://github.com/lh3/minimap2). The output is sorted and converted to a BAM file using [Samtools](https://www.htslib.org/). Alignment stats are created from these using [Seqkit BAM](https://bioinf.shenwei.me/seqkit/usage/#bam). 12 | 13 | Additionally, the workflow will generate an IGV configuration file if `--igv` is selected. This file allows the user to view the aligned BAM in the EPI2ME Desktop Application in the Viewer tab.
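For orientation, the sketch below (illustrative only, not the workflow's own code) shows the general shape of the `igv.json` file used by the IGV panel: a reference block pointing at the genome FASTA and its index, plus one alignment track per sample BAM, mirroring the structure built by the `configure_igv.py` helper shown earlier in this repository. All file names and sample aliases used here are hypothetical placeholders.

```python
# Minimal, illustrative sketch of an IGV.js-style config of the kind the
# workflow writes out. File names and aliases are hypothetical placeholders.
import json

reference = {
    "id": "ref",
    "name": "ref",
    "wholeGenomeView": False,
    "fastaURL": "hg38_chr20.fa",      # reference genome FASTA (placeholder)
    "indexURL": "hg38_chr20.fa.fai",  # matching .fai index (placeholder)
}

tracks = []
for alias in ["sample01", "sample02"]:  # hypothetical sample aliases
    bam = f"BAMS/{alias}.reads_aln_sorted.bam"
    tracks.append({
        "name": f"{alias}: {alias}.reads_aln_sorted.bam",
        "type": "alignment",  # BAM tracks are rendered as alignment tracks
        "format": "bam",
        "url": bam,
        "indexURL": f"{bam}.bai",
    })

# Write the combined config; IGV loads the reference first, then the tracks.
with open("igv.json", "w") as fh:
    json.dump({"reference": reference, "tracks": tracks}, fh, indent=4)
```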
14 | 15 | #### 3.2 Chunk BAM 16 | The aligned BAMs are split into chunks using the bundle_min_reads parameter (default: 50000). 17 | 18 | #### 3.3 Assemble transcripts 19 | [StringTie](https://ccb.jhu.edu/software/stringtie/) is then used to assemble the transcripts using the aligned segments in the chunked BAM files. The assembled transcripts will be output as a [GFF file](https://www.ensembl.org/info/website/upload/gff3.html). If a `ref_annotation` file is provided, this will also be included in the GFF. 20 | 21 | #### 3.4 Merge Chunks 22 | Transcript GFF files from the chunks with the same sample aliases will then be merged. 23 | 24 | #### 3.5 Annotate 25 | [GffCompare](https://ccb.jhu.edu/software/stringtie/gffcompare.html) is then used to compare query and reference annotations, merging records where appropriate and then annotating them. This also creates estimates of accuracy of the GFF files, output in a stats file per sample. 26 | 27 | #### 3.6 Create transcriptomes 28 | [Gffread](https://github.com/gpertea/gffread) is used to create a transcriptome FASTA file from the final GFF as well as a merged transcriptome that includes annotations in the FASTA headers where available. 29 | 30 | ### 4. Differential expression analysis 31 | 32 | Differential gene expression (DGE) and differential transcript usage (DTU) analyses aim to identify genes and transcripts that show statistically altered expression patterns. 33 | 34 | Differential expression analysis requires at least 2 replicates of each sample to compare (but we recommend three). You can see an example sample_sheet.csv below. 35 | 36 | #### Sample sheet condition column 37 | The sample sheet should be a comma-separated values file (.csv) and include at least three columns named `barcode`, `alias` and `condition`. 38 | - Each `barcode` should refer to a directory of the same name in the input FASTQ directory (in the example below `barcode01` to `barcode06` reflect the `test_data` directory). 39 | - The `alias` column allows you to rename each barcode to an alias that will be used in the report and other output files. 40 | - The `condition` column will need to contain one of two keys to indicate the two groups being compared. `control` must be one of the keys, used to indicate which samples will be used as the reference in the differential expression analysis. 41 | 42 | e.g. sample_sheet.csv 43 | ``` 44 | barcode,alias,condition 45 | barcode01,sample01,control 46 | barcode02,sample02,control 47 | barcode03,sample03,control 48 | barcode04,sample04,treated 49 | barcode05,sample05,treated 50 | barcode06,sample06,treated 51 | ``` 52 | 53 | #### 4.1 Merge cross-sample transcriptomes 54 | If a `ref_transcriptome` is not provided, the transcriptomes created by the workflow will be used for DE analysis. To do this, the GFF outputs of GffCompare are merged using StringTie. A final non-redundant FASTA file of the transcripts is then created from the merged GFF file and the reference genome using seqkit. 55 | 56 | #### 4.2 Create a final non-redundant transcriptome 57 | The reads from all the samples will be aligned to the final non-redundant transcriptome using Minimap2 in a splice-aware manner. 58 | 59 | #### 4.3 Count genes and transcripts 60 | [Salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification, giving gene and transcript counts.
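To make "gene and transcript counts" concrete, the hedged sketch below (not the workflow's own merge script) shows how per-sample Salmon `quant.sf` outputs could be combined into a transcript-level count matrix and summed to gene level with a transcript-to-gene mapping. Sample names, file paths and the `t2g.tsv` mapping file are hypothetical; the `quant.sf` columns (`Name`, `Length`, `EffectiveLength`, `TPM`, `NumReads`) are Salmon's standard output.

```python
# Illustrative sketch only: merge hypothetical per-sample Salmon outputs into
# transcript- and gene-level count tables of the kind fed to edgeR/DRIMSeq.
import pandas as pd

samples = {
    "sample01": "salmon/sample01/quant.sf",  # hypothetical paths
    "sample02": "salmon/sample02/quant.sf",
}

# Read the NumReads column from each quant.sf, indexed by transcript name.
counts = pd.DataFrame({
    alias: pd.read_csv(path, sep="\t", index_col="Name")["NumReads"]
    for alias, path in samples.items()
})

# Hypothetical two-column transcript-to-gene table: transcript_id, gene_id.
t2g = pd.read_csv("t2g.tsv", sep="\t", index_col="transcript_id")["gene_id"]

# Round estimated read counts and aggregate transcripts to their genes.
transcript_counts = counts.round().astype(int)
gene_counts = transcript_counts.groupby(t2g).sum()

transcript_counts.to_csv("unfiltered_transcript_counts.tsv", sep="\t")
gene_counts.to_csv("all_gene_counts.tsv", sep="\t")
```

Conceptually, these tables correspond to the gene and unfiltered transcript count outputs listed in the outputs section, which then feed the edgeR and DRIMSeq/DEXSeq steps described below.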
61 | 62 | #### 4.4 edgeR-based differential expression analysis 63 | A statistical analysis is first performed using [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) to identify the subset of differentially expressed genes using the gene counts as input. A normalisation factor is calculated for each sequence library using the default TMM method (see [McCarthy et al. (2012)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3378882/) for further details). The defined experimental design is used to calculate estimates of dispersion for each of the gene features. Statistical tests are calculated using the contrasts defined in the experimental design. The differentially expressed genes are corrected for false discovery rate (FDR) using the method of Benjamini & Hochberg ([Benjamini and Hochberg (1995)](https://www.jstor.org/stable/2346101)). 64 | 65 | #### 4.5 Pre-filtering of quantitative data using DRIMSeq 66 | [DRIMSeq](https://bioconductor.org/packages/release/bioc/html/DRIMSeq.html) is used to filter the transcript count data from the Salmon analysis for differential transcript usage (DTU) analysis. The filter step will be used to select for genes and transcripts that satisfy rules for the number of samples in which a gene or transcript must be observed, and minimum threshold levels for the number of observed reads. The parameters used for filtering are `min_samps_gene_expr`, `min_samps_feature_expr`, `min_gene_expr`, and `min_feature_expr`. By default, any transcripts with zero expression or one transcript in all samples are filtered out at this stage. 67 | 68 | #### 4.6 Differential transcript usage using DEXSeq 69 | Differential transcript usage analysis is performed using the R [DEXSeq](https://bioconductor.org/packages/release/bioc/html/DEXSeq.html) package ([Anders et al. (2012)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3460195/)). Similar to the edgeR package, DEXSeq estimates the variance between the biological replicates and applies generalised linear models for the statistical testing. The key difference is that the DEXSeq method looks for differences at the exon count level. DEXSeq uses the filtered transcript count data prepared earlier in this analysis. 70 | 71 | #### 4.7 stageR stage-wise analysis of DGE and DTU 72 | The final component of this isoform analysis is a stage-wise statistical test using the R software package [stageR](https://bioconductor.org/packages/release/bioc/html/stageR.html) ([Van den Berge and Clement (2018)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1277-0)). stageR uses (1) the raw p-values for DTU from the DEXSeq analysis in the previous section and (2) a false-discovery corrected set of p-values from testing whether individual genes contain at least one exon showing DTU. A hierarchical two-stage statistical test then evaluates the set of genes for DTU. 73 | -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | + If the workflow fails, please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. 2 | + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
+ Renaming, moving or deleting the input BAM, reference genome or the output directory from the location provided at runtime will stop IGV in the EPI2ME Desktop app from loading. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | *Does the workflow support de novo assembly?* - Currently the workflow does not have a *de novo* mode. 2 | 3 | *Why is the IGV panel not showing?* - The workflow expects either an uncompressed or [`bgzip`](https://www.htslib.org/doc/bgzip.html)-compressed reference. If the user provides a reference compressed not with `bgzip`, the workflow will run to completion, but won't be able to generate the necessary indexes to visualize the outputs in IGV. 4 | 5 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-transcriptomes/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [How to align your data](https://labs.epi2me.io/how-to-align/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /evaluation/tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # A few simple tests with different combinations of CLI options 4 | # Run from within an appropriate active conda environment 5 | 6 | if [[ "$#" -lt 1 ]]; then 7 | echo "usage: tests.sh output_dir [nextflow.config]" 8 | exit 1 9 | fi 10 | 11 | if [[ "$#" -eq 1 ]]; then 12 | config='' 13 | fi 14 | 15 | if [[ "$#" -eq 2 ]]; then 16 | config="-c $2"; 17 | fi 18 | 19 | SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 20 | cd $SCRIPT_DIR/../; 21 | 22 | singledir="test_data/fastq" 23 | multisampledir="test_data/demultiplexed_fastq" 24 | 25 | # This is for when using SIRV dataset with non-canonical splice junctions 26 | #"--minimap2_opts '-uf --splice-flank=no'" 27 | results=() 28 | 29 | # Reference based tests 30 | OUTPUT=$1/reference_single_dir; 31 | nextflow run . --fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no' \ 32 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace -resume; 33 | r=$? 34 | results+=("$(basename $OUTPUT): $r") 35 | 36 | OUTPUT=$1/multiple_samples; 37 | nextflow run . --fastq $multisampledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 38 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace \ 39 | --sample_sheet test_data/sample_sheet.csv -resume; 40 | r=$? 41 | results+=("$(basename $OUTPUT): $r") 42 | 43 | OUTPUT=$1/reference_no_ref_annotation; 44 | nextflow run . --fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 45 | -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace -resume; 46 | r=$? 47 | results+=("$(basename $OUTPUT): $r") 48 | 49 | # Force split_bam to make multiple alignment bundles 50 | OUTPUT=$1/reference_force_split_bam; 51 | nextflow run .
--fastq $singledir $config --ref_genome test_data/SIRV_150601a.fasta --minimap2_opts '-uf --splice-flank=no'\ 52 | --ref_annotation test_data/SIRV_isoforms.gtf -profile local --out_dir ${OUTPUT} -w ${OUTPUT}/workspace \ 53 | --bundle_min_reads 5 -resume; 54 | r=$? 55 | results+=("$(basename $OUTPUT): $r") 56 | 57 | echo "Exit status codes for each test" 58 | for value in "${results[@]}"; do 59 | echo "${value}" 60 | done -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters with default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere. 22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion.
Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // hostname 52 | def host = null 53 | try { 54 | host = InetAddress.getLocalHost().getHostName() 55 | } 56 | catch(Exception e) {} 57 | 58 | // OS 59 | // TODO check version on WSL 60 | def opsys = System.properties['os.name'].toLowerCase() 61 | def opver = System.properties['os.version'] 62 | if (opver.toLowerCase().contains("wsl")){ 63 | opsys = "wsl" 64 | } 65 | 66 | // placeholder for any future okta business 67 | // for now we'll use the guest_ sent to wf.epi2me_user 68 | def user = get_meta(params.wf, "epi2me_user") 69 | 70 | // drop cruft to save some precious bytes 71 | // affects the deep copy rather than original params 72 | clean_meta(params_data, [ 73 | "schema_ignore_params", 74 | ]) 75 | def ingress_ids = [] 76 | if (params_data.containsKey("wf")) { 77 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 78 | clean_meta(params_data.wf, [ 79 | "agent", // we send this later 80 | "epi2me_instance", // we send this later 81 | "epi2me_user", // we send this later 82 | "example_cmd", 83 | "ingress.run_ids", // we will send this elsewhere 84 | ]) 85 | } 86 | 87 | // try and get runtime information 88 | def cpus = null 89 | try { 90 | cpus = Runtime.getRuntime().availableProcessors() 91 | } 92 | catch(Exception e) {} 93 | 94 | def workflow_success = null 95 | def workflow_exitcode = null 96 | if (event != "start") { 97 | workflow_success = workflow.success 98 | workflow_exitcode = workflow.exitStatus 99 | } 100 | 101 | /// build message 102 | def body_json = new JsonBuilder() 103 | body_json \ 104 | "tracking_id": [ 105 | "msg_id": UUID.randomUUID().toString(), 106 | "version": "3.0.0" 107 | ], 108 | "source": "workflow", 109 | "event": event, 110 | "params": params_data, 111 | // data will be null on start events, as ingress has not run 112 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 113 | "workflow": [ 114 | "name": workflow.manifest.name, 115 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 116 | "run_name": workflow.runName, // required to disambiguate sessions 117 | "session": workflow.sessionId, 118 | "profile": workflow.profile, 119 | "resume": workflow.resume, 120 | "error": error_message, // null if no error 121 | "success": workflow_success, 122 | "exitcode": workflow_exitcode, 123 | ], 124 | "env": [ 125 | "user": user, // placeholder for any future okta 126 | "hostname": host, 127 | "os": [ 128 | "name": opsys, 129 | "version": opver 130 | ], 131 | "resource": [ 132 | "cpus": cpus, 133 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 134 | ], 135 | "agent": get_meta(params.wf, "agent"), // access via original params 136 | "epi2me": [ 137 | "instance": get_meta(params.wf, "epi2me_instance"), 138 | "user": user, 139 | ], 140 | "nextflow": [ 141 | "version": nextflow.version.toString(), 142 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 143 | ] 144 | ] 145 | return body_json 146 | } 147 | 148 | // Send a JSON payload to a given endpoint 149 | private static String send_ping_post(endpoint, body_json) { 150 | // Attempt to send payload and absorb any possible Exception gracefully 151 | String postResult 152 | boolean raise_exception = false 153 | try { 154 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 155 | requestMethod = 'POST' 156 | doOutput = true 157 | setConnectTimeout(5000) 158 | setReadTimeout(10000) 159 | setRequestProperty('Content-Type', 'application/json') 160 | setRequestProperty('accept', 'application/json') 161 | outputStream.withPrintWriter({printWriter -> 162 | printWriter.write(body_json.toString()) 163 | }) 164 | 165 | // Rethrow exceptions that imply we're not using this endpoint properly 166 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 167 | raise_exception = true 168 | } 169 | // Accessing inputStream.text will raise an Exception for failed requests 170 | postResult = inputStream.text 171 | }) 172 | } 173 | catch(Exception e) { 174 | if(raise_exception) { throw e } 175 | } 176 | return (postResult) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? "--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file. 
11 | 12 | 13 | params { 14 | help = false 15 | fastq = null 16 | bam = null 17 | ref_genome = null 18 | ref_annotation = null 19 | transcriptome_source = "reference-guided" 20 | threads = 4 21 | // Thresholds for viewing isoforms in report table 22 | isoform_table_nrows = 5000 23 | 24 | out_dir = "output" 25 | sample = null 26 | sample_sheet = null 27 | aws_image_prefix = null 28 | aws_queue = null 29 | analyse_unclassified = false 30 | version = false 31 | 32 | monochrome_logs = false 33 | igv = false 34 | validate_params = true 35 | show_hidden_params = false 36 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 37 | 38 | // Process cDNA reads using pychopper, turn off for direct RNA: 39 | direct_rna = false 40 | // Options passed to pychopper: 41 | pychopper_opts = null 42 | pychopper_backend = "edlib" 43 | cdna_kit = "SQK-PCS109" 44 | 45 | // Extra options passed to minimap2 when generating index 46 | minimap2_index_opts = "-k 14" 47 | 48 | // Extra options passed to minimap2 49 | // For SIRV data 50 | //minimap2_opts = "-uf --splice-flank=no" 51 | // For non-SIRV data: 52 | minimap2_opts = "-uf" 53 | 54 | // Minimum mapping quality 55 | minimum_mapping_quality = 40 56 | 57 | // Internal priming filter context size: 58 | poly_context = 24 59 | 60 | // Maximum allowed poly(A) length in the genome near the 3' end of mapping: 61 | max_poly_run = 8 62 | 63 | // Minimum number of reads in BAM bundles: 64 | bundle_min_reads = 50000 65 | 66 | // Options passed to stringtie: 67 | stringtie_opts = "--conservative" 68 | 69 | // Options passed to gffcompare: 70 | gffcompare_opts = "-R" 71 | 72 | // Plot gffcompare results: 73 | plot_gffcmp_stats = true 74 | 75 | disable_ping = false 76 | store_dir = null 77 | 78 | // Differential expression options 79 | de_analysis = false 80 | ref_transcriptome = null 81 | min_samps_gene_expr = 3 82 | min_samps_feature_expr = 1 83 | min_gene_expr = 10 84 | min_feature_expr = 3 85 | 86 | 87 | wf { 88 | example_cmd = [ 89 | "--de_analysis", 90 | "--direct_rna", 91 | "--fastq 'wf-transcriptomes-demo/differential_expression_fastq'", 92 | "--minimap2_index_opts '-k 15'", 93 | "--ref_annotation 'wf-transcriptomes-demo/gencode.v22.annotation.chr20.gtf'", 94 | "--ref_genome 'wf-transcriptomes-demo/hg38_chr20.fa'", 95 | "--sample_sheet 'wf-transcriptomes-demo/sample_sheet.csv'", 96 | ] 97 | agent = null 98 | container_sha = "shac733d952a14257cf3c5c5d5d44c6aed84d5fe5a1" 99 | common_sha = "sha9ef2f4e4585c4ce6a604616e77185077551abf50" 100 | } 101 | } 102 | 103 | manifest { 104 | name = 'epi2me-labs/wf-transcriptomes' 105 | author = 'Oxford Nanopore Technologies' 106 | homePage = 'https://github.com/epi2me-labs/wf-transcriptomes' 107 | description = 'Transcriptome analysis including differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.' 108 | mainScript = 'main.nf' 109 | nextflowVersion = '>=23.04.2' 110 | version = 'v1.7.0' 111 | } 112 | 113 | epi2melabs { 114 | tags = "wf-transcriptomes,isoforms,transcriptomics,denovo,human,mouse,plant" 115 | } 116 | 117 | // used by default for "standard" (docker) and singularity profiles, 118 | // other profiles may override. 
119 | process { 120 | withLabel:isoforms { 121 | container = "ontresearch/wf-transcriptomes:${params.wf.container_sha}" 122 | } 123 | withLabel:wf_common { 124 | container = "ontresearch/wf-common:${params.wf.common_sha}" 125 | } 126 | 127 | shell = ['/bin/bash', '-euo', 'pipefail'] 128 | } 129 | 130 | 131 | profiles { 132 | // the "standard" profile is used implicitely by nextflow 133 | // if no other profile is given on the CLI 134 | standard { 135 | docker { 136 | enabled = true 137 | // this ensures container is run as host user and group, but 138 | // also adds host user to the within-container group 139 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 140 | } 141 | } 142 | 143 | // using singularity instead of docker 144 | singularity { 145 | singularity { 146 | enabled = true 147 | autoMounts = true 148 | } 149 | } 150 | 151 | 152 | conda { 153 | conda.enabled = true 154 | } 155 | 156 | // Using AWS batch. 157 | // May need to set aws.region and aws.batch.cliPath 158 | awsbatch { 159 | process { 160 | executor = 'awsbatch' 161 | queue = "${params.aws_queue}" 162 | memory = '8G' 163 | withLabel:isoforms { 164 | container = "${params.aws_image_prefix}-wf-transcriptomes:${params.wf.container_sha}" 165 | } 166 | withLabel:wf_common { 167 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 168 | } 169 | shell = ['/bin/bash', '-euo', 'pipefail'] 170 | } 171 | } 172 | 173 | // local profile for simplified development testing 174 | local { 175 | process.executor = 'local' 176 | } 177 | } 178 | 179 | 180 | timeline { 181 | enabled = true 182 | overwrite = true 183 | file = "${params.out_dir}/execution/timeline.html" 184 | } 185 | report { 186 | enabled = true 187 | overwrite = true 188 | file = "${params.out_dir}/execution/report.html" 189 | } 190 | trace { 191 | enabled = true 192 | overwrite = true 193 | file = "${params.out_dir}/execution/trace.txt" 194 | } 195 | 196 | env { 197 | PYTHONNOUSERSITE = 1 198 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 199 | } 200 | -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "wf-transcriptomes-report.html", 5 | "title": "workflow report", 6 | "description": "a HTML report document detailing the primary findings of the workflow", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "aggregated" 10 | }, 11 | "read-stats-per-file": { 12 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-file-stats.tsv", 13 | "title": "Per file read stats", 14 | "description": "A TSV with per file read stats, including all samples.", 15 | "mime-type": "text/tab-separated-values", 16 | "optional": false, 17 | "type": "aggregated" 18 | }, 19 | "read-stats-per-read": { 20 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/per-read-stats.tsv", 21 | "title": "Read stats", 22 | "description": "A TSV with per read stats, including all samples.", 23 | "mime-type": "text/tab-separated-values", 24 | "optional": false, 25 | "type": "aggregated" 26 | }, 27 | "run-ids": { 28 | "filepath": "fastq_ingress_results/{{ alias }}//reads/fastcat_stats/run_ids", 29 | "title": "Run ID's", 30 | "description": "List of run IDs present in reads.", 31 | "mime-type": "text/txt", 32 | "optional": false, 33 | "type": "aggregated" 34 | }, 35 | "metamap": { 36 | "filepath": "fastq_ingress_results/{{ 
alias }}//reads/metamap.json", 37 | "title": "Meta map json", 38 | "description": "Metadata used in workflow presented in a JSON.", 39 | "mime-type": "text/json", 40 | "optional": false, 41 | "type": "aggregated" 42 | }, 43 | "sample-data": { 44 | "filepath": "fastq_ingress_results/{{ alias }}//reads/{{ alias }}.fastq.gz", 45 | "title": "Concatenated sequence data", 46 | "description": "Per sample reads concatenated in to one FASTQ file.", 47 | "mime-type": "text/json", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "transcriptome": { 52 | "filepath": "{{ alias }}_transcriptome.fas", 53 | "title": "Assembled transcriptome", 54 | "description": "Per sample assembled transcriptome. Not output if a reference annotation was supplied", 55 | "mime-type": "text/x-fasta", 56 | "optional": true, 57 | "type": "per-sample" 58 | }, 59 | "merged_transcriptome": { 60 | "filepath": "{{ alias }}_merged_transcriptome.fas", 61 | "title": "Annotated assembled transcriptome", 62 | "description": "Per sample annotated assembled transcriptome. Only output if a reference annotation was supplied", 63 | "mime-type": "text/x-fasta", 64 | "optional": true, 65 | "type": "per-sample" 66 | }, 67 | "alignment-stats": { 68 | "filepath": "{{ alias }}_read_aln_stats.tsv", 69 | "title": "Alignment summary statistics", 70 | "description": "Per sample alignment summary statistics.", 71 | "mime-type": "text/tab-separated-valuesa", 72 | "optional": false, 73 | "type": "per-sample" 74 | }, 75 | "gff_compare": { 76 | "filepath": "{{ alias }}_gffcompare", 77 | "title": "GFF compare results.", 78 | "description": "All GFF compare output files.", 79 | "mime-type": "text/directory", 80 | "optional": true, 81 | "type": "per-sample" 82 | }, 83 | "dge-results-tsv": { 84 | "filepath": "de_analysis/results_dge.tsv", 85 | "title": "Differential gene expression results", 86 | "description": "This is a gene-level result file that describes genes and their probability of showing differential expression between experimental conditions.", 87 | "mime-type": "text/tab-separated-values", 88 | "optional": true, 89 | "type": "aggregated" 90 | }, 91 | "dge-report-pdf": { 92 | "filepath": "de_analysis/results_dge.pdf", 93 | "title": "Differential gene expression report", 94 | "description": "Summary report of differential gene expression analysis as a PDF.", 95 | "mime-type": "application/pdf", 96 | "optional": true, 97 | "type": "aggregated" 98 | }, 99 | "dtu-gene-tsv": { 100 | "filepath": "de_analysis/results_dtu_gene.tsv", 101 | "title": "Differential transcript usage gene TSV", 102 | "description": "This is a gene-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression.", 103 | "mime-type": "text/tab-separated-values", 104 | "optional": true, 105 | "type": "aggregated" 106 | }, 107 | "dtu-report-pdf": { 108 | "filepath": "de_analysis/results_dtu.pdf", 109 | "title": "Differential transcript usage report", 110 | "description": "Summary report of differential transcript usage results as a PDF.", 111 | "mime-type": "application/pdf", 112 | "optional": true, 113 | "type": "aggregated" 114 | }, 115 | "dtu-transcript": { 116 | "filepath": "de_analysis/results_dtu_transcript.tsv", 117 | "title": "Differential transcript usage TSV", 118 | "description": "This is a transcript-level result file from DEXSeq that lists annotated genes and their probabilities of differential expression.", 119 | "mime-type": "text/tab-separated-values", 120 | "optional": true, 121 | "type": "aggregated" 122 | 
}, 123 | "dtu-stageR": { 124 | "filepath": "de_analysis/results_dtu_stageR.tsv ", 125 | "title": "Differential transcript usage stageR TSV", 126 | "description": "This is the output from StageR and it shows both gene and transcript probabilities of differential expression", 127 | "mime-type": "text/tab-separated-values", 128 | "optional": true, 129 | "type": "aggregated" 130 | }, 131 | "dexseq": { 132 | "filepath": "de_analysis/results_dexseq.tsv", 133 | "title": "Differential transcript usage DEXSeq TSV", 134 | "description": "The complete output from the DEXSeq-analysis, shows both gene and transcript probabilities of differential expression.", 135 | "mime-type": "text/tab-separated-values", 136 | "optional": true, 137 | "type": "aggregated" 138 | }, 139 | "gene_counts": { 140 | "filepath": "de_analysis/all_gene_counts.tsv", 141 | "title": "Gene counts", 142 | "description": "Raw gene counts created by the Salmon tool, before filtering.", 143 | "mime-type": "text/tab-separated-values", 144 | "optional": true, 145 | "type": "aggregated" 146 | }, 147 | "gene_counts_per_million": { 148 | "filepath": "de_analysis/cpm_gene_counts.tsv", 149 | "title": "Gene counts per million", 150 | "description": "This file shows counts per million (CPM) of the raw gene counts to facilitate comparisons across samples.", 151 | "mime-type": "text/tab-separated-values", 152 | "optional": true, 153 | "type": "aggregated" 154 | }, 155 | "transcript_counts": { 156 | "filepath": "de_analysis/unfiltered_transcript_counts_with_genes.tsv", 157 | "title": "Transcript counts", 158 | "description": "Raw transcript counts created by the Salmon tool, before filtering. Includes reference to the associated gene ID.", 159 | "mime-type": "text/tab-separated-values", 160 | "optional": true, 161 | "type": "aggregated" 162 | }, 163 | "tpm_transcript_counts": { 164 | "filepath": "de_analysis/unfiltered_tpm_transcript_counts.tsv", 165 | "title": "Transcript per million counts", 166 | "description": "This file shows transcripts per million (TPM) of the raw counts to facilitate comparisons across samples.", 167 | "mime-type": "text/tab-separated-values", 168 | "optional": true, 169 | "type": "aggregated" 170 | }, 171 | "transcipt_counts_filtered": { 172 | "filepath": "de_analysis/filtered_transcript_counts_with_genes.tsv", 173 | "title": "Transcript counts filtered", 174 | "description": "Filtered transcript counts, used for differential transcript usage analysis. Includes a reference to the associated gene ID.", 175 | "mime-type": "text/tab-separated-values", 176 | "optional": true, 177 | "type": "aggregated" 178 | }, 179 | "transcripts_table": { 180 | "filepath": "{{ alias }}_transcripts_table.tsv", 181 | "title": "Transcript info table", 182 | "description": "This file details each isoform that was reconstructed from the input reads. It contains a subset of columns from the .tmap output from [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)", 183 | "mime-type": "text/tab-separated-values", 184 | "optional": true, 185 | "type": "per-sample" 186 | }, 187 | "final_non_redundant_transcriptome": { 188 | "filepath": "de_analysis/final_non_redundant_transcriptome.fasta", 189 | "title": "Final non redundant transcriptome", 190 | "description": "Transcripts that were used for differential expression analysis including novel transcripts with the identifiers used for DE analysis. 
Only applicable when the ref_transcriptome parameter is not provided.", 191 | "mime-type": "text/x-fasta", 192 | "optional": true, 193 | "type": "aggregated" 194 | }, 195 | "reference-index": { 196 | "filepath": "igv_reference/{{ ref_genome_file }}.fai", 197 | "title": "Index of reference FASTA file", 198 | "description": "Reference genome index of the FASTA file required for IGV config.", 199 | "mime-type": "text/tab-separated-values", 200 | "optional": true, 201 | "type": "aggregated" 202 | }, 203 | "reference-gzi-index": { 204 | "filepath": "igv_reference/{{ ref_genome_file }}.gzi", 205 | "title": "GZI index of the reference FASTA file", 206 | "description": "GZI Index of the reference FASTA file.", 207 | "mime-type": "application/octet-stream", 208 | "optional": true, 209 | "type": "aggregated" 210 | }, 211 | "igv-config": { 212 | "filepath": "igv.json", 213 | "title": "JSON configuration file for IGV browser", 214 | "description": "JSON configuration file to be loaded in IGV for visualising alignments against the reference.", 215 | "mime-type": "text/json", 216 | "optional": true, 217 | "type": "aggregated" 218 | }, 219 | "minimap2-bam": { 220 | "filepath": "BAMS/{{ alias }}.reads_aln_sorted.bam", 221 | "title": "BAM file (minimap2)", 222 | "description": "BAM file generated from mapping input reads to the reference.", 223 | "mime-type": "application/gzip", 224 | "optional": true, 225 | "type": "per-sample" 226 | }, 227 | "minimap2-index": { 228 | "filepath": "BAMS/{{ alias }}.reads_aln_sort.bam.bai", 229 | "title": "BAM index file (minimap2)", 230 | "description": "Index file generated from mapping input reads to the reference.", 231 | "mime-type": "application/octet-stream", 232 | "optional": true, 233 | "type": "per-sample" 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /subworkflows/differential_expression.nf: -------------------------------------------------------------------------------- 1 | process checkSampleSheetCondition { 2 | label "isoforms" 3 | cpus 1 4 | memory "2 GB" 5 | input: 6 | path "sample_sheet.csv" 7 | """ 8 | workflow-glue check_sample_sheet_condition "sample_sheet.csv" 9 | """ 10 | } 11 | 12 | 13 | 14 | process count_transcripts { 15 | // Count transcripts using Salmon. 16 | // library type is specified as forward stranded (-l SF) as it should have either been through pychopper or come from direct RNA reads. 
17 | label "isoforms" 18 | cpus params.threads 19 | memory "31 GB" 20 | input: 21 | tuple val(meta), path(bam), path(ref_transcriptome) 22 | output: 23 | path "*transcript_counts.tsv", emit: counts 24 | """ 25 | salmon quant --noErrorModel -p "${task.cpus}" -t "${ref_transcriptome}" -l SF -a "${bam}" -o counts 26 | mv counts/quant.sf "${meta.alias}.transcript_counts.tsv" 27 | """ 28 | } 29 | 30 | 31 | process mergeCounts { 32 | label "isoforms" 33 | cpus 1 34 | memory "2 GB" 35 | input: 36 | path counts 37 | output: 38 | path "unfiltered_transcript_counts.tsv" 39 | """ 40 | workflow-glue merge_count_tsvs -z -o unfiltered_transcript_counts.tsv -tsvs ${counts} 41 | """ 42 | } 43 | 44 | process mergeTPM { 45 | label "isoforms" 46 | cpus 1 47 | memory "2 GB" 48 | input: 49 | path counts 50 | output: 51 | path "unfiltered_tpm_transcript_counts.tsv" 52 | // Use tpm parameter with merge_counts_tsvs.py to out transcript per million file 53 | """ 54 | workflow-glue merge_count_tsvs -o unfiltered_tpm_transcript_counts.tsv -z -tpm True -tsvs $counts 55 | """ 56 | } 57 | 58 | 59 | process deAnalysis { 60 | label "isoforms" 61 | cpus 4 62 | memory "16 GB" 63 | input: 64 | path "sample_sheet.csv" 65 | path "all_counts.tsv" 66 | path "annotation.gtf" 67 | output: 68 | path "de_analysis/results_dtu_stageR.tsv", emit: stageR 69 | path "merged/filtered_transcript_counts_with_genes.tsv", emit: flt_counts 70 | path "merged/all_gene_counts.tsv", emit: gene_counts 71 | path "de_analysis/unfiltered_transcript_counts_with_genes.tsv", emit: unflt_counts 72 | path "de_analysis/results_dge.tsv", emit: dge 73 | path "de_analysis/results_dexseq.tsv", emit: dexseq 74 | path "de_analysis/results_dge.pdf", emit: dge_pdf 75 | path "de_analysis/results_dge.tsv", emit: dge_tsv 76 | path "de_analysis/results_dtu_gene.tsv", emit: dtu_gene 77 | path "de_analysis/results_dtu_transcript.tsv", emit: dtu_transcript 78 | path "de_analysis/results_dtu_stageR.tsv", emit: dtu_stageR 79 | path "de_analysis/results_dtu.pdf", emit: dtu_pdf 80 | path "de_analysis/cpm_gene_counts.tsv", emit: cpm 81 | """ 82 | de_analysis.R \ 83 | --annotation annotation.gtf \ 84 | --min_samps_gene_expr $params.min_samps_gene_expr \ 85 | --min_samps_feature_expr $params.min_samps_feature_expr \ 86 | --min_gene_expr $params.min_gene_expr \ 87 | --min_feature_expr $params.min_feature_expr \ 88 | --sample_sheet sample_sheet.csv \ 89 | --all_counts all_counts.tsv \ 90 | --de_out_dir de_analysis \ 91 | --merged_out_dir merged 92 | """ 93 | } 94 | 95 | 96 | process plotResults { 97 | label "isoforms" 98 | cpus 2 99 | memory "2 GB" 100 | input: 101 | path "filtered_transcript_counts_with_genes.tsv" 102 | path "results_dtu_stageR.tsv" 103 | path "sample_sheet.tsv" 104 | output: 105 | path "dtu_plots.pdf", emit: dtu_plots 106 | """ 107 | plot_dtu_results.R \ 108 | --counts filtered_transcript_counts_with_genes.tsv \ 109 | --results_dtu results_dtu_stageR.tsv \ 110 | --sample_sheet sample_sheet.tsv \ 111 | --pdf_out dtu_plots.pdf 112 | """ 113 | } 114 | 115 | process build_minimap_index_transcriptome{ 116 | /* 117 | Build minimap index from reference genome 118 | */ 119 | label "isoforms" 120 | cpus params.threads 121 | memory "31 GB" 122 | input: 123 | path reference 124 | output: 125 | tuple path("genome_index.mmi"), path(reference), emit: index 126 | script: 127 | """ 128 | minimap2 -t "${task.cpus}" ${params.minimap2_index_opts} -I 1000G -d "genome_index.mmi" "${reference}" 129 | 130 | """ 131 | } 132 | 133 | 134 | process map_transcriptome{ 135 | /* 136 | Map 
reads to reference using minimap2. 137 | Filter reads by mapping quality. 138 | Filter internally-primed reads. 139 | */ 140 | label "isoforms" 141 | cpus params.threads 142 | memory "16 GB" 143 | 144 | input: 145 | tuple val(meta), path (fastq_reads), path(index) 146 | output: 147 | tuple val(meta), path("${meta.alias}_reads_aln_sorted.bam"), emit: bam 148 | path("${meta.alias}.flagstat.stats"), emit: align_stats 149 | """ 150 | minimap2 -t ${task.cpus} -ax splice -uf -p 1.0 "${index}" "${fastq_reads}" \ 151 | | samtools view -Sb > "output.bam" 152 | samtools sort -@ ${task.cpus} "output.bam" -o "${meta.alias}_reads_aln_sorted.bam" 153 | samtools flagstat -O json "${meta.alias}_reads_aln_sorted.bam" > "${meta.alias}.flagstat.stats" 154 | """ 155 | } 156 | 157 | 158 | workflow differential_expression { 159 | take: 160 | ref_transcriptome 161 | full_len_reads 162 | sample_sheet 163 | ref_annotation 164 | main: 165 | sample_sheet = Channel.fromPath(sample_sheet) 166 | checkSampleSheetCondition(sample_sheet) 167 | t_index = build_minimap_index_transcriptome(ref_transcriptome) 168 | mapped = map_transcriptome(full_len_reads.combine(t_index) 169 | .map{meta, fastq, reference, transcriptome -> tuple(meta, fastq, reference) }) 170 | count_transcripts(mapped.bam.combine(t_index.map{ mmi, reference -> reference})) 171 | merged = mergeCounts(count_transcripts.out.counts.collect()) 172 | merged_TPM = mergeTPM(count_transcripts.out.counts.collect()) 173 | analysis = deAnalysis(sample_sheet, merged, ref_annotation) 174 | plotResults(analysis.flt_counts, analysis.stageR, sample_sheet) 175 | // Concat files required for making the report 176 | de_report = analysis.flt_counts.concat( 177 | analysis.gene_counts, analysis.dge, analysis.dexseq, 178 | analysis.stageR, sample_sheet, merged, ref_annotation, merged_TPM, analysis.unflt_counts).collect() 179 | // Concat files required to be output to user without any changes 180 | de_outputs_concat = analysis.cpm.concat(plotResults.out.dtu_plots, analysis.dge_pdf, analysis.dge_tsv, 181 | analysis.dtu_gene, analysis.dtu_transcript, analysis.dtu_stageR, analysis.dtu_pdf, merged_TPM).collect() 182 | collected_de_alignment_stats = mapped.align_stats.collect() 183 | emit: 184 | all_de = de_report 185 | de_alignment_stats = collected_de_alignment_stats 186 | de_outputs = de_outputs_concat 187 | } 188 | -------------------------------------------------------------------------------- /subworkflows/reference_assembly.nf: -------------------------------------------------------------------------------- 1 | process map_reads{ 2 | /* 3 | Map reads to reference using minimap2. 4 | Filter reads by mapping quality. 5 | Filter internally-primed reads. 
6 | */ 7 | label "isoforms" 8 | cpus params.threads 9 | memory "31 GB" 10 | publishDir path: "${params.out_dir}/${publish_prefix_bams}", mode: 'copy', pattern: "${sample_id}_reads_aln_sorted.bam*", overwrite: true 11 | input: 12 | tuple val(sample_id), path (fastq_reads), path(index), path(reference) 13 | val publish_prefix_bams 14 | output: 15 | tuple val(sample_id), 16 | path("${sample_id}_reads_aln_sorted.bam"), 17 | path("${sample_id}_reads_aln_sorted.bam.bai"), 18 | emit: bam 19 | tuple val(sample_id), path("${sample_id}_read_aln_stats.tsv"), emit: stats 20 | script: 21 | def ContextFilter = """AlnContext: { Ref: "${reference}", LeftShift: -${params.poly_context}, 22 | RightShift: ${params.poly_context}, RegexEnd: "[Aa]{${params.max_poly_run},}", 23 | Stranded: True,Invert: True, Tsv: "internal_priming_fail.tsv"} """ 24 | 25 | def mm2_threads = Math.max(task.cpus - 3, 1) 26 | """ 27 | minimap2 -t ${mm2_threads} -ax splice ${params.minimap2_opts} ${index} ${fastq_reads}\ 28 | | samtools view -q ${params.minimum_mapping_quality} -F 2304 -Sb -\ 29 | | seqkit bam -j 1 -x -T '${ContextFilter}' -\ 30 | | samtools sort --write-index -@ 1 -o "${sample_id}_reads_aln_sorted.bam##idx##${sample_id}_reads_aln_sorted.bam.bai" - ; 31 | ((cat "${sample_id}_reads_aln_sorted.bam" | seqkit bam -s -j 1 - 2>&1) | tee "${sample_id}_read_aln_stats.tsv" ) || true 32 | 33 | # Add sample id header and column; remove last column (File) 34 | cat "${sample_id}_read_aln_stats.tsv" \ 35 | | sed "s/^/${sample_id} /" \ 36 | | sed "1 s/^${sample_id}/sample_id/" \ 37 | | awk 'NF{NF-=1};1' \ 38 | > tmp 39 | mv tmp "${sample_id}_read_aln_stats.tsv" 40 | 41 | if [[ -s "internal_priming_fail.tsv" ]]; 42 | then 43 | tail -n +2 "internal_priming_fail.tsv" | awk '{print ">" \$1 "\\n" \$4 }' - > "context_internal_priming_fail_start.fasta" 44 | tail -n +2 "internal_priming_fail.tsv" | awk '{print ">" \$1 "\\n" \$6 }' - > "context_internal_priming_fail_end.fasta" 45 | fi 46 | """ 47 | } 48 | 49 | workflow reference_assembly { 50 | take: 51 | index 52 | reference 53 | fastq_reads 54 | publish_prefix_bams 55 | main: 56 | map_reads(fastq_reads.combine(index).combine(reference), publish_prefix_bams) 57 | emit: 58 | bam = map_reads.out.bam 59 | stats = map_reads.out.stats 60 | } 61 | -------------------------------------------------------------------------------- /test_data/SIRV_150601a.fasta.fai: -------------------------------------------------------------------------------- 1 | SIRV1 12643 7 80 81 2 | SIRV2 6911 12816 80 81 3 | SIRV3 10943 19821 80 81 4 | SIRV4 16122 30908 80 81 5 | SIRV5 14606 47239 80 81 6 | SIRV6 12837 62035 80 81 7 | SIRV7 148957 75040 80 81 8 | -------------------------------------------------------------------------------- /test_data/demultiplexed_fastq/barcode01/SIRV_E0_PCS109_51.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/demultiplexed_fastq/barcode01/SIRV_E0_PCS109_51.fq.gz -------------------------------------------------------------------------------- /test_data/demultiplexed_fastq/barcode02/SIRV_E0_PCS109_25.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/demultiplexed_fastq/barcode02/SIRV_E0_PCS109_25.fq.gz -------------------------------------------------------------------------------- 
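The `reference_assembly` subworkflow above takes a prebuilt minimap2 index, the reference FASTA, per-sample reads, and a publish prefix. As a rough illustration of how it could be exercised against the SIRV test data listed here, below is a minimal sketch of a driver script; it is not the workflow's actual `main.nf`, and the `genome_index.mmi` file and the standalone `workflow` block are assumptions made only for the example.

```groovy
// Hypothetical driver fragment, run from the repository root -- not the real main.nf.
// Assumes an index was built beforehand, e.g.:
//   minimap2 -k 14 -d genome_index.mmi test_data/SIRV_150601a.fasta
include { reference_assembly } from './subworkflows/reference_assembly'

workflow {
    index     = Channel.fromPath('genome_index.mmi')
    reference = Channel.fromPath('test_data/SIRV_150601a.fasta')
    // (sample_id, fastq) tuples matching the first two inputs of map_reads
    reads     = Channel.of(['sample01', file('test_data/fastq/SIRV_E0_PCS109_50.fq.gz')])

    // 'BAMS' is the publish prefix used for the sorted BAM outputs
    reference_assembly(index, reference, reads, 'BAMS')
    reference_assembly.out.stats.view()  // per-sample alignment stats TSVs
}
```

The subworkflow simply combines the reads channel with the index and reference before calling `map_reads`, so each sample is mapped with minimap2, filtered by mapping quality, screened for internal priming, and emitted as a sorted, indexed BAM plus an alignment stats TSV.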
/test_data/fastq/SIRV_E0_PCS109_50.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-transcriptomes/a74a34763edecdf74a7e484e2b85c6e128b8ae98/test_data/fastq/SIRV_E0_PCS109_50.fq.gz -------------------------------------------------------------------------------- /test_data/sample_sheet.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode02,sample02,sample02,control 4 | barcode03,sample03,sample03,control 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,treated 8 | -------------------------------------------------------------------------------- /test_data/workflow_glue/MSTRG.11088.gff3: -------------------------------------------------------------------------------- 1 | chr1 HAVANA transcript 11869 14409 . + . ID=ENST00000456328.2;Parent=ENSG00000290825.1;gene_id=ENSG00000290825.1;transcript_id=ENST00000456328.2;gene_type=lncRNA;gene_name=DDX11L2;transcript_type=lncRNA;transcript_name=DDX11L2-202;level=2;transcript_support_level=1;tag=basic,Ensembl_canonical;havana_transcript=OTTHUMT00000362751.1 2 | chr2 HAVANA transcript 113599036 113601261 . - . ID=ENST00000437401.1;Parent=ENSG00000236397.3;gene_id=ENSG00000236397.3;transcript_id=ENST00000437401.1;gene_type=unprocessed_pseudogene;gene_name=DDX11L2;transcript_type=unprocessed_pseudogene;transcript_name=DDX11L2-201;level=2;transcript_support_level=NA;hgnc_id=HGNC:37103;ont=PGO:0000005;tag=basic,Ensembl_canonical;havana_gene=OTTHUMG00000047823.1;havana_transcript=OTTHUMT00000109036.1 3 | -------------------------------------------------------------------------------- /test_data/workflow_glue/MSTRG.11088.gtf: -------------------------------------------------------------------------------- 1 | chr13 StringTie transcript 76990660 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 2 | chr13 StringTie exon 76990660 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "1"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 3 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 4 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "3"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 5 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636183.2"; exon_number "4"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 6 | chr13 StringTie transcript 76991729 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; 7 | chr13 StringTie exon 76991729 76991832 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "1"; 8 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "2"; 9 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "3"; 10 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "MSTRG.11088.2"; exon_number "4"; 11 | chr13 StringTie transcript 76992044 77005117 1000 + . 
gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 12 | chr13 StringTie exon 76992044 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "1"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 13 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "2"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 14 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "3"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 15 | chr13 StringTie exon 76998043 76998085 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "4"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 16 | chr13 StringTie exon 77000458 77005117 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636780.2"; exon_number "5"; gene_name "CLN5"; ref_gene_id "ENSG00000102805.16"; 17 | chr13 StringTie transcript 76992078 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 18 | chr13 StringTie exon 76992078 76992271 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 19 | chr13 StringTie exon 76995063 76995228 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 20 | chr13 StringTie exon 76995902 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "3"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 21 | chr13 StringTie exon 77075518 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "4"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 22 | chr13 StringTie exon 77076816 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000638147.2"; exon_number "5"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 23 | chr13 StringTie transcript 76995915 77129717 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 24 | chr13 StringTie exon 76995915 76996127 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "1"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 25 | chr13 StringTie exon 77109648 77110102 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "2"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 26 | chr13 StringTie exon 77129147 77129717 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000637192.1"; exon_number "3"; gene_name "ENSG00000283208"; ref_gene_id "ENSG00000283208.2"; 27 | chr13 StringTie transcript 77026767 77078025 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 28 | chr13 StringTie exon 77026767 77027122 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 29 | chr13 StringTie exon 77075518 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "2"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 30 | chr13 StringTie exon 77076816 77078025 1000 + . 
gene_id "MSTRG.11088"; transcript_id "ENST00000636737.1"; exon_number "3"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 31 | chr13 StringTie transcript 77075514 77087778 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 32 | chr13 StringTie exon 77075514 77075584 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "1"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 33 | chr13 StringTie exon 77076816 77076866 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "2"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; 34 | chr13 StringTie exon 77087552 77087778 1000 + . gene_id "MSTRG.11088"; transcript_id "ENST00000450627.6"; exon_number "3"; gene_name "MYCBP2-AS1"; ref_gene_id "ENSG00000236051.7"; -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_1.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode02,sample02,sample02,control 4 | barcode03,sample03,sample03,control 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,other -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_2.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias 2 | barcode01,sample01,sample01 3 | barcode02,sample02,sample02 4 | barcode03,sample03,sample03 5 | barcode04,sample04,sample04 6 | barcode05,sample05,sample05 7 | barcode06,sample06,sample06 -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_3.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,control 3 | barcode04,sample04,sample04,treated 4 | -------------------------------------------------------------------------------- /test_data/workflow_glue/check_sample_sheet_condition/sample_sheet_4.csv: -------------------------------------------------------------------------------- 1 | barcode,sample_id,alias,condition 2 | barcode01,sample01,sample01,untreated 3 | barcode02,sample02,sample02,untreated 4 | barcode03,sample03,sample03,untreated 5 | barcode04,sample04,sample04,treated 6 | barcode05,sample05,sample05,treated 7 | barcode06,sample06,sample06,treated 8 | --------------------------------------------------------------------------------