├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── adapter_scan_vsearch.py │ ├── assign_barcodes.py │ ├── assign_features.py │ ├── calc_saturation.py │ ├── clip_depth.py │ ├── create_matrix.py │ ├── create_shortlist.py │ ├── create_umap.py │ ├── expression_matrix.py │ ├── extract_barcode.py │ ├── format_ctat_output.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── parse_kit_metadata.py │ ├── prepare_report_data.py │ ├── process_matrix.py │ ├── report.py │ ├── sc_util.py │ ├── summarise_adapters.py │ ├── tag_bam.py │ ├── tests │ ├── __init__.py │ ├── test_adapter_scan_vsearch.py │ ├── test_assign_barcodes.py │ ├── test_assign_features.py │ ├── test_calc_saturation.py │ ├── test_cluster_umis.py │ ├── test_expression_matrix.py │ ├── test_extract_barcode.py │ ├── test_format_ctat_output.py │ └── test_tag_bam.py │ ├── util.py │ ├── variant_mex.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data ├── 3M-3pgex-may-2023.txt.gz ├── 3M-5pgex-jan-2023.txt.gz ├── 3M-february-2018.txt.gz ├── 737K-arc-v1.txt.gz ├── 737K-august-2016.txt.gz ├── OPTIONAL_FILE ├── genes_of_interest.csv ├── visium-v1.txt.gz └── visium-v1_coordinates.txt ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md ├── 11_other.md └── images │ ├── 3prime_read.png │ └── probe.png ├── kit_configs.csv ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── limitations_and_known_issues.md ├── main.nf ├── modules └── local │ └── common.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows ├── fusions.nf ├── preprocess.nf ├── process_bams.nf └── snv.nf └── test ├── conftest.py ├── test_ingress.py └── workflow_integration.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 
20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 
128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.58 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | # avoid snowballstemmer>=3.0 as it causes flake8-docstrings to stop working [CW-6098] 28 | - snowballstemmer==2.2.0 29 | args: [ 30 | "bin", 31 | "--import-order-style=google", 32 | "--statistics", 33 | "--max-line-length=88", 34 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 35 | ] 36 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Entry point for sc_tools (single_cell_tools).""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == '__main__': 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components = dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | 
except ModuleNotFoundError as e: 44 | # if imports cannot be satisfied, refuse to add the component 45 | # rather than exploding 46 | logger.warn(f"Could not load {name} due to missing module {e.name}") 47 | continue 48 | 49 | # if there's a main() and an argparser(), that's good enough for us. 50 | try: 51 | req = "main", "argparser" 52 | if all(callable(getattr(mod, x)) for x in req): 53 | components[name] = mod 54 | except Exception: 55 | pass 56 | return components 57 | 58 | 59 | def cli(): 60 | """Run workflow entry points.""" 61 | logger = get_main_logger(_package_name) 62 | logger.info("Bootstrapping CLI.") 63 | parser = argparse.ArgumentParser( 64 | 'wf-glue', 65 | parents=[_log_level()], 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | 68 | parser.add_argument( 69 | '-v', '--version', action='version', 70 | version='%(prog)s {}'.format(__version__)) 71 | 72 | subparsers = parser.add_subparsers( 73 | title='subcommands', description='valid commands', 74 | help='additional help', dest='command') 75 | subparsers.required = True 76 | 77 | # importing everything can take time, try to shortcut 78 | if len(sys.argv) > 1: 79 | components = get_components(allowed_components=[sys.argv[1]]) 80 | if not sys.argv[1] in components: 81 | logger.warn("Importing all modules, this may take some time.") 82 | components = get_components() 83 | else: 84 | components = get_components() 85 | 86 | # add all module parsers to main CLI 87 | for name, module in components.items(): 88 | p = subparsers.add_parser( 89 | name.split(".")[-1], parents=[module.argparser()]) 90 | p.set_defaults(func=module.main) 91 | 92 | args = parser.parse_args() 93 | 94 | logger.info("Starting entrypoint.") 95 | args.func(args) 96 | -------------------------------------------------------------------------------- /bin/workflow_glue/assign_barcodes.py: -------------------------------------------------------------------------------- 1 | """Assign barcodes. 2 | 3 | Given a whitelist of barcodes, assign raw barcodes to the nearest match. 4 | """ 5 | import collections 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | import rapidfuzz 10 | from rapidfuzz.process import extract 11 | 12 | from .util import get_named_logger, wf_parser # noqa: ABS101 13 | 14 | logger = get_named_logger("AsgnBrcdes") 15 | 16 | 17 | def argparser(): 18 | """Create argument parser.""" 19 | parser = wf_parser("assign_barcodes") 20 | 21 | parser.add_argument( 22 | "whitelist", type=Path, 23 | help="File containing list of expected cell barcodes.") 24 | 25 | parser.add_argument( 26 | "barcode_tags", type=Path, 27 | help="TSV file of read_id, uncorrected_barcode, qscores.") 28 | 29 | parser.add_argument( 30 | "output_tags", type=Path, 31 | help="Output TSV containing columns from `barcode_tags` \ 32 | and an additional CB (corrected barcode) column.") 33 | 34 | parser.add_argument( 35 | "output_counts", type=Path, 36 | help="Output TSV file containing counts for each of the assigned \ 37 | barcodes.") 38 | 39 | parser.add_argument( 40 | "report", type=Path, 41 | help="Path to TSV file to store reasons for barcode assignment.") 42 | 43 | parser.add_argument( 44 | "--chunksize", type=int, default=50000, 45 | help="Process the BAM in chunks no larger than this.") 46 | 47 | parser.add_argument( 48 | "--use_kmer_index", action='store_true', 49 | help="Use a kmer index to reduce the search space of fuzzy matching.") 50 | 51 | parser.add_argument( 52 | "--max_ed", type=int, default=2, 53 | help="Max.
edit distance between putative barcode \ 54 | and the matching whitelist barcode.") 55 | 56 | parser.add_argument( 57 | "--min_ed_diff", type=int, default=2, 58 | help="Min. difference in edit distance between the \ 59 | best and second best whitelist matches.") 60 | 61 | return parser 62 | 63 | 64 | def determine_barcode( 65 | bc_uncorr, whitelist, whiteset, 66 | max_ed, min_ed_diff, assignment_log, index=None): 67 | """Find barcode in a whitelist corresponding to read barcode. 68 | 69 | :param bc_uncorr: uncorrected barcode. 70 | :param whitelist: list of possible barcodes. 71 | :param whiteset: whitelist as a set. 72 | :param max_ed: max. edit distance between barcode and whitelist hit. 73 | :param min_ed_diff: min. edit distance difference between first and 74 | second best hits in order to accept the first as valid. 75 | :param assignment_log: a Counter object to store reasons for barcode assignment. 76 | :param index: a kmer index for reducing search space of fuzzy-matching. 77 | 78 | Passing the whitelist as both a list and set is for performance reasons 79 | when calling this function many times. 80 | """ 81 | # quick return 82 | if bc_uncorr in whiteset: 83 | assignment_log["bc_shortlist_exact_match"] += 1 84 | return bc_uncorr 85 | 86 | if index is not None: 87 | shortlist = set() 88 | for kmer in build_kmers(bc_uncorr): 89 | shortlist.update(index[kmer]) 90 | shortlist = list(shortlist) 91 | else: 92 | shortlist = whitelist 93 | 94 | result = extract( 95 | bc_uncorr, 96 | shortlist, 97 | scorer=rapidfuzz.distance.Levenshtein.distance, 98 | score_cutoff=max_ed + min_ed_diff + 1) 99 | 100 | corrected = "-" 101 | if len(result) > 0: 102 | # There is at least 1 initial match (ED <= max_ed + min_ed_diff + 1) 103 | bc_match = result[0][0] 104 | bc_match_ed = result[0][1] 105 | else: 106 | assignment_log['bc_no_shortlist_match'] += 1 107 | return corrected 108 | if len(result) > 1: 109 | next_match_diff = result[1][1] - bc_match_ed 110 | else: 111 | next_match_diff = len(bc_uncorr) 112 | 113 | # are we better than the second place? 114 | # This criterion is a little odd: we have (2, 2) as the defaults 115 | # for max_ed and min_ed_diff. But some true barcodes are within 116 | # an edit distance of 2 to start with, so they would be guaranteed 117 | # to be filtered out (the exact match shortcut above saves us a lot 118 | # of the time). Consider removing this? 119 | if (bc_match_ed <= max_ed) and (next_match_diff >= min_ed_diff): 120 | corrected = bc_match 121 | assignment_log['bc_corrected'] += 1 122 | elif bc_match_ed > max_ed: 123 | # There was an initial rapidfuzz match, but ED was greater than our max ED. 124 | assignment_log['bc_no_shortlist_match'] += 1 125 | elif next_match_diff < min_ed_diff: 126 | # Two or more close hits in the rapidfuzz results. 127 | assignment_log['bc_shortlist_multiple_hits'] += 1 128 | 129 | return corrected 130 | 131 | 132 | def build_index(whitelist, klen=5): 133 | """Build a kmer index of a list of sequences.""" 134 | index = collections.defaultdict(set) 135 | for seq in whitelist: 136 | for ss in build_kmers(seq, klen): 137 | index[ss].add(seq) 138 | return index 139 | 140 | 141 | def build_kmers(seq, klen=5): 142 | """Create a list of kmers in a sequence.""" 143 | return [seq[i:i+klen] for i in range(0, len(seq) - klen)] 144 | 145 | 146 | def process_records( 147 | barcode_tags, whiteset, max_ed, min_ed_diff, tags_output, 148 | chunksize=50000, use_kmer_index=False): 149 | """Process read barcodes stored in text file to find whitelist equivalents.
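Uncorrected barcodes are fuzzy-matched against the whitelist by Levenshtein edit distance (rapidfuzz); a kmer index can optionally be used to shortlist candidate whitelist entries before the fuzzy match.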
150 | 151 | :param barcode_tags: path to TSV with tag data 152 | :param whiteset: set of allowed barcodes. 153 | :param max_ed: max allowed edit distance between an uncorrected barcode 154 | and a potential corrected whiteset barcode. 155 | :param min_ed_diff: minimum allowed edit distance between top two 156 | barcode candidates. 157 | """ 158 | barcode_counter = collections.Counter() 159 | # we need a list for indexing; also, rapidfuzz appears to coerce 160 | # its input to a list on every call, so converting once saves ~10% of the time. 161 | whitelist = list(whiteset) 162 | barcode_length = len(whitelist[0]) 163 | # for 16mers with 2 mismatches we must have at least a 5mer match. 164 | # The limit is reached by distributing the mismatches evenly, any 165 | # perturbation will increase the longest match length. 166 | # 0123456789ABCDEF 167 | # | | 168 | index = None 169 | if use_kmer_index: 170 | kmer = barcode_length // (max_ed + 1) 171 | index = build_index(whitelist, klen=kmer) 172 | 173 | output_cols = [ 174 | 'read_id', 'CR', 'CY', 'UR', 'UY', 'chr', 175 | 'start', 'end', 'mapq', 'CB', 'SA'] 176 | with open(tags_output, 'w') as fh: 177 | fh.write("\t".join(output_cols)) 178 | fh.write("\n") 179 | 180 | total_reads = 0 181 | assignment_log = collections.Counter() 182 | for df_tags in pd.read_csv(barcode_tags, sep='\t', chunksize=chunksize): 183 | df_tags["CB"] = "-" 184 | selected = df_tags["CR"].str.len() >= barcode_length - max_ed 185 | df_tags.loc[selected, "CB"] = df_tags.loc[selected].apply( 186 | lambda x: determine_barcode( 187 | x.CR, whitelist, whiteset, max_ed, min_ed_diff, assignment_log, 188 | index), 189 | axis=1) 190 | total_reads += len(df_tags) 191 | logger.info(f"Processed {total_reads} reads.") 192 | # Remove reads without a corrected barcode assigned.
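# (i.e. rows where `determine_barcode` returned "-" because no whitelist entry met the edit-distance criteria, or where the uncorrected barcode was too short to be considered at all)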
193 | n_records = len(df_tags) 194 | df_tags.query('CB != "-"', inplace=True) 195 | logger.info( 196 | f"Removed {n_records - len(df_tags)} reads without a corrected barcode.") 197 | if len(df_tags) != 0: 198 | df_tags[output_cols].to_csv( 199 | tags_output, mode='a', sep='\t', header=None, index=False) 200 | barcode_counter.update(df_tags["CB"]) 201 | 202 | return barcode_counter, assignment_log 203 | 204 | 205 | def main(args): 206 | """Run main entry point.""" 207 | logger.info("Reading whitelist.") 208 | whiteset = set(pd.read_csv( 209 | args.whitelist, index_col=None, sep='\t', header=None)[0]) 210 | 211 | logger.info("Processing reads.") 212 | barcode_counter, assignment_log = process_records( 213 | args.barcode_tags, whiteset, 214 | args.max_ed, args.min_ed_diff, 215 | args.output_tags, 216 | chunksize=args.chunksize, 217 | use_kmer_index=args.use_kmer_index) 218 | 219 | with open(args.output_counts, "w") as f: 220 | for bc, n in barcode_counter.most_common(): 221 | f.write(f"{bc}\t{n}\n") 222 | 223 | df_summary = pd.DataFrame.from_dict(assignment_log, orient='index') 224 | df_summary.to_csv(args.report, sep='\t', header=False) 225 | 226 | logger.info("Finished.") 227 | -------------------------------------------------------------------------------- /bin/workflow_glue/calc_saturation.py: -------------------------------------------------------------------------------- 1 | """Calculate saturation.""" 2 | 3 | import polars as pl 4 | 5 | from .util import get_named_logger, wf_parser # noqa: ABS101 6 | 7 | 8 | def argparser(): 9 | """Create argument parser.""" 10 | parser = wf_parser("Calculate saturation") 11 | 12 | parser.add_argument( 13 | "--read_tags", 14 | help="TSV file with read_id, gene, barcode, and UMI" 15 | ) 16 | 17 | parser.add_argument( 18 | "--output", 19 | help="Output TSV file with saturation curves."
20 | ) 21 | 22 | parser.add_argument( 23 | "--sample", 24 | help="sample ID/alias" 25 | ) 26 | 27 | return parser 28 | 29 | 30 | def downsample_dataframe(df, fraction): 31 | """Downsample dataframe of read tags and tabulate genes and UMIs per cell.""" 32 | logger = get_named_logger('ClcSat') 33 | 34 | logger.info(f"Doing {fraction}") 35 | df_scaled = df.sample(fraction=fraction) 36 | n_reads = df_scaled.shape[0] 37 | 38 | # Get the unique number of reads, genes and UMIs per cell barcode 39 | gb_cell = df_scaled.group_by("barcode") 40 | gb_cell_median = gb_cell.n_unique().median() 41 | genes_per_cell = gb_cell_median['gene'][0] 42 | umis_per_cell = gb_cell_median['umi'][0] 43 | # Since polars 0.20.5 groupby.count() has been renamed groupby.len() 44 | reads_per_cell = gb_cell.count().median()['count'][0] 45 | 46 | n_deduped_reads = df_scaled.group_by(['gene', 'barcode', 'umi']).count().shape[0] 47 | if n_reads < 1: 48 | umi_saturation = 0 49 | else: 50 | umi_saturation = 1 - (n_deduped_reads / n_reads) 51 | 52 | record = ( 53 | ( 54 | fraction, 55 | n_reads, 56 | reads_per_cell, 57 | genes_per_cell, 58 | umis_per_cell, 59 | umi_saturation, 60 | ) 61 | ) 62 | logger.info(f"Done saturation calculation for fraction {fraction}") 63 | return record 64 | 65 | 66 | def run_jobs(args): 67 | """Run downsampling jobs for a series of fractions and collate the results.""" 68 | logger = get_named_logger('ClcSat') 69 | 70 | df = pl.read_csv( 71 | args.read_tags, 72 | separator='\t', 73 | columns=['corrected_barcode', 'corrected_umi', 'gene'], 74 | new_columns=['barcode', 'umi', 'gene'], 75 | low_memory=True, 76 | dtypes={ 77 | 'corrected_barcode': pl.Categorical, 78 | 'corrected_umi': pl.Categorical, 79 | 'gene': str} 80 | ) 81 | 82 | df = df.filter((df['barcode'] != '-') & (df['umi'] != '-')) 83 | 84 | logger.info("Downsampling reads for saturation curves") 85 | fractions = [ 86 | 0.01, 87 | 0.02, 88 | 0.03, 89 | 0.04, 90 | 0.05, 91 | 0.1, 92 | 0.2, 93 | 0.3, 94 | 0.4, 95 | 0.5, 96 | 0.6, 97 | 0.7, 98 | 0.8, 99 | 0.9, 100 | 1.0, 101 | ] 102 | 103 | records = [(0.0, 0, 0, 0, 0, 0.0)] 104 | for frac in fractions: 105 | records.append(downsample_dataframe(df, frac)) 106 | 107 | res = pl.from_records( 108 | data=records, 109 | schema=[ 110 | "downsamp_frac", 111 | "downsamp_reads", 112 | "reads_pc", 113 | "genes_pc", 114 | "umis_pc", 115 | "umi_sat", 116 | ] 117 | ) 118 | res = res.with_columns( 119 | pl.lit(args.sample).alias("sample"), 120 | ) 121 | res.write_csv(args.output, separator="\t") 122 | 123 | 124 | def main(args): 125 | """Entry point.""" 126 | run_jobs(args) 127 | -------------------------------------------------------------------------------- /bin/workflow_glue/clip_depth.py: -------------------------------------------------------------------------------- 1 | """Clip read depth.""" 2 | 3 | import random 4 | 5 | import pandas as pd 6 | from pysam import AlignmentFile 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Create argument parser.""" 12 | parser = wf_parser("clip_depth") 13 | 14 | parser.add_argument( 15 | "--bed", 16 | help="Regions to clip") 17 | 18 | parser.add_argument( 19 | "--bam_in", 20 | help="input bam file") 21 | 22 | parser.add_argument( 23 | "--target_depth", 24 | help="Desired read depth", 25 | type=int) 26 | 27 | parser.add_argument( 28 | "--bam_out", 29 | help="output bam file") 30 | 31 | return parser 32 | 33 | 34 | def main(args): 35 | """Run entry point.""" 36 | df_hi_cov = pd.read_csv( 37 | args.bed, sep='\t', 38 | names=['read_id', 'depth'], 39 |
index_col='read_id', 40 | dtype={ 41 | 'read_id': str, 42 | 'depth': float 43 | } 44 | ) 45 | 46 | random.seed(1889) 47 | with AlignmentFile(args.bam_in, "rb", check_sq=False) as bam: 48 | with AlignmentFile(args.bam_out, "wb", template=bam) as out_bam: 49 | for aln in bam.fetch(until_eof=True): 50 | if aln.query_name in df_hi_cov.index: 51 | window_depth = df_hi_cov.at[aln.query_name, 'depth'] 52 | # Randomly discard records to achieve target depth 53 | # random.random() returns a random floating point number between 0 and 1 54 | if random.random() > args.target_depth / window_depth: 55 | continue # Discard record 56 | out_bam.write(aln) 57 | -------------------------------------------------------------------------------- /bin/workflow_glue/create_umap.py: -------------------------------------------------------------------------------- 1 | """Umap reduce.""" 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.decomposition import PCA 7 | import umap 8 | 9 | from .util import get_named_logger, wf_parser # noqa: ABS101 10 | 11 | 12 | def argparser(): 13 | """Create argument parser.""" 14 | parser = wf_parser("umap_reduce") 15 | 16 | parser.add_argument( 17 | "matrix", 18 | help="Gene expression matrix: rows=genes/transcripts, " 19 | "columns=barcodes, values=UMIs") 20 | parser.add_argument( 21 | "output", type=Path, 22 | help="UMAP TSV output file path.") 23 | 24 | parser.add_argument( 25 | "--pcn", type=int, default=100, 26 | help="Number of principal components to generate prior to UMAP") 27 | 28 | parser.add_argument( 29 | "--dimensions", type=int, default=2, 30 | help="Number of dimensions in UMAP embedding") 31 | 32 | parser.add_argument( 33 | "--min_dist", type=float, default=0.1, 34 | help="Minimum distance parameter of UMAP") 35 | 36 | parser.add_argument( 37 | "--n_neighbors", type=int, default=15, 38 | help="Number of neighbors parameter of UMAP") 39 | 40 | return parser 41 | 42 | 43 | def main(args): 44 | """Run entry point.""" 45 | logger = get_named_logger('UmapReduce') 46 | 47 | # find the number of columns, since the numpy API doesn't allow skipping cols 48 | names = np.loadtxt( 49 | args.matrix, delimiter="\t", dtype=str, max_rows=1) 50 | names = names[1:] # first is transcript/gene 51 | n_barcodes = len(names) 52 | 53 | logger.info(f"Expression matrix has {n_barcodes} cells.") 54 | logger.info("Reading entire matrix.") 55 | mat = np.loadtxt( 56 | args.matrix, delimiter="\t", dtype=float, 57 | skiprows=1, usecols=list(range(1, n_barcodes + 1))) 58 | mat = np.atleast_2d(mat).transpose() 59 | logger.info("Finished reading matrix.") 60 | 61 | logger.info(f"Expression matrix has shape: {mat.shape}") 62 | pcn = min(args.pcn, *mat.shape) 63 | model = PCA(n_components=pcn, copy=False) 64 | mat = model.fit_transform(mat) 65 | logger.info(f"PCA output matrix has shape: {mat.shape}") 66 | 67 | mapper = umap.UMAP( 68 | n_neighbors=args.n_neighbors, 69 | min_dist=args.min_dist, 70 | n_components=args.dimensions, 71 | verbose=0) 72 | embedding = mapper.fit_transform(mat) 73 | logger.info(f"UMAP Embedding has shape: {embedding.shape}") 74 | 75 | # would be nice to avoid a copy here, but the array is fairly small 76 | cols = [f"D{i+1}" for i in range(args.dimensions)] 77 | out = pd.DataFrame(embedding, columns=cols, index=names) 78 | out.to_csv( 79 | args.output, sep="\t", index=True, index_label="barcode") 80 | -------------------------------------------------------------------------------- /bin/workflow_glue/format_ctat_output.py:
-------------------------------------------------------------------------------- 1 | """Convert ctat-LR-fusion outputs from long to short format.""" 2 | from collections import defaultdict 3 | import csv 4 | 5 | import pandas as pd 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Parse the arguments.""" 12 | parser = wf_parser( 13 | "Map fusions to cell barcodes.") 14 | parser.add_argument( 15 | "fusion_file", 16 | help="Path to the fusion output file (TSV format).") 17 | parser.add_argument( 18 | "read_info_file", 19 | help="Path to the per-read info file (TSV format).") 20 | parser.add_argument( 21 | "per_read_output", 22 | help="Path to save the output TSV file.") 23 | parser.add_argument( 24 | "per_fusion_output", 25 | help="Path to save the output TSV file.") 26 | parser.add_argument( 27 | "cell_summary_out", 28 | help="Path to save the output TSV file.") 29 | parser.add_argument( 30 | "sample_id", 31 | help="sample identifier to add to tables.") 32 | parser.add_argument( 33 | "--unmatched_reads_out", 34 | help="Path to save the output TSV file of fusions with no matching CB.", 35 | default='unmatched_reads.txt') 36 | 37 | return parser 38 | 39 | 40 | def load_fusion_data(fusion_file): 41 | """Load fusion data. Extract relevant fields along with read associations.""" 42 | logger = get_named_logger('FmtCtat') 43 | try: 44 | per_fusion_df = pd.read_csv(fusion_file, sep="\t") 45 | except pd.errors.EmptyDataError: 46 | logger.warning( 47 | f"""The fusion file {fusion_file} is empty. 48 | No candidate fusions found by ctat-LR-fusion.""") 49 | return None 50 | if len(per_fusion_df) == 0: 51 | logger.warning( 52 | f"""The fusion file {fusion_file} contained no entries. 53 | No candidate fusions passed ctat-LR-fusion filters.""") 54 | return None 55 | per_fusion_df.rename(columns={"#FusionName": "FusionName"}, inplace=True) 56 | 57 | # Convert read IDs to lists 58 | per_fusion_df["LR_accessions"] = per_fusion_df["LR_accessions"].str.split(",") 59 | 60 | # Expand multiple read IDs per fusion 61 | per_read_df = per_fusion_df.explode("LR_accessions") 62 | # Select relevant columns 63 | per_read_df = per_read_df[[ 64 | "FusionName", "LeftGene", "LeftBreakpoint", "RightGene", 65 | "RightBreakpoint", "SpliceType", "LR_accessions" 66 | ]].rename(columns={"LR_accessions": "read_id"}) 67 | 68 | # Handle duplicate read IDs using defaultdict (list) 69 | fusion_dict = defaultdict(list) 70 | for row in per_read_df.itertuples(): 71 | fusion_dict[row.read_id].append({ 72 | "FusionName": row.FusionName, 73 | "LeftGene": row.LeftGene, 74 | "LeftBreakpoint": row.LeftBreakpoint, 75 | "RightGene": row.RightGene, 76 | "RightBreakpoint": row.RightBreakpoint, 77 | "SpliceType": row.SpliceType 78 | }) 79 | 80 | logger.info(f"Total fusions processed: {per_fusion_df['FusionName'].nunique()}") 81 | logger.info(f"Total unique reads linked to fusions: {len(fusion_dict)}") 82 | 83 | return fusion_dict 84 | 85 | 86 | def process_read_info(read_info_file, fusion_dict): 87 | """Combine single-cell tags with fusion info.""" 88 | matched_results = [] 89 | unmatched_reads = set(fusion_dict.keys()) # Track reads missing barcode/UMI 90 | 91 | with open(read_info_file, newline='') as csvfile: 92 | reader = csv.DictReader(csvfile, delimiter="\t") 93 | 94 | for line in reader: 95 | read_id = line['read_id'] 96 | if read_id in fusion_dict: # Only process relevant reads 97 | for fusion in fusion_dict[read_id]: 98 | matched_results.append({ 99 | "FusionName": 
fusion["FusionName"], 100 | "LeftGene": fusion["LeftGene"], 101 | "LeftBreakpoint": fusion["LeftBreakpoint"], 102 | "RightGene": fusion["RightGene"], 103 | "RightBreakpoint": fusion["RightBreakpoint"], 104 | "SpliceType": fusion["SpliceType"], 105 | "CB": line["corrected_barcode"], 106 | "UB": line["corrected_umi"], 107 | "read_id": read_id 108 | }) 109 | unmatched_reads.discard(read_id) # Remove matched reads 110 | 111 | return pd.DataFrame(matched_results), unmatched_reads 112 | 113 | 114 | def main(args): 115 | """Run the script.""" 116 | logger = get_named_logger('FmtCtat') 117 | 118 | logger.info("Loading fusion data...") 119 | 120 | fusion_dict = load_fusion_data(args.fusion_file) 121 | 122 | if fusion_dict is None: 123 | logger.warning("No fusion data found. writing empty files.") 124 | 125 | with open(args.per_read_output, 'w') as fh1: 126 | fh1.write( 127 | "FusionName\tLeftGene\tLeftBreakpoint\tRightGene\tRightBreakpoint" 128 | "\tSpliceType\tCB\tUB\tread_id\n" 129 | ) 130 | 131 | with open(args.per_fusion_output, 'w') as fh2: 132 | fh2.write( 133 | "Fusion\tLeftGene\tLeftBreakpoint\tRightGene\tRightBreakpoint" 134 | "\tSpliceType\tcells\tUMIs\tsample_ID\n" 135 | ) 136 | ( 137 | pd.DataFrame.from_records( 138 | [[args.sample_id, 0, 0, 0, 0, 0]], 139 | columns=[ 140 | 'sample_ID', 'cells_with_fusions', 'unique_fusions', 'reads', 141 | 'mean_fusion_reads_per_cell', 'mean_unique_fusions_per_cell'], 142 | ) 143 | .to_csv(args.cell_summary_out, sep="\t", index=False) 144 | ) 145 | else: 146 | logger.info("Processing read information...") 147 | merged_df, unmatched_reads = process_read_info( 148 | args.read_info_file, fusion_dict) 149 | 150 | # Make per-fusion summary from barcode-assigned fusion + reads. 151 | ( 152 | merged_df.groupby( 153 | ['FusionName', 'LeftGene', 'LeftBreakpoint', 154 | 'RightGene', 'RightBreakpoint', 'SpliceType']) 155 | .agg( 156 | cells=('CB', 'nunique'), 157 | UMIs=('UB', 'nunique')) 158 | .reset_index() 159 | .assign( 160 | sample_ID=args.sample_id) 161 | .rename(columns={'FusionName': 'Fusion'}) 162 | .to_csv(args.per_fusion_output, sep="\t", index=False) 163 | ) 164 | 165 | # Sort reads by most commonly occuring fusion pair 166 | merged_df = (merged_df.sort_values( 167 | by="FusionName", 168 | key=lambda col: col.map(col.value_counts()), ascending=False)) 169 | 170 | logger.info("Saving merged single cell/fusion output") 171 | merged_df.to_csv(args.per_read_output, sep="\t", index=False) 172 | 173 | logger.info("Writing the per fusion summary...") 174 | # Regerate the per-fusion summary adding some cell-specific info. 
175 | ( 176 | pd.DataFrame.from_dict({ 177 | 'sample_ID': args.sample_id, 178 | 'cells_with_fusions': merged_df['CB'].nunique(), 179 | 'unique_fusions': merged_df['FusionName'].nunique(), 180 | 'reads': len(merged_df), 181 | 'mean_fusion_reads_per_cell': ( 182 | merged_df.groupby('CB')['FusionName'].count().mean()), 183 | 'mean_unique_fusions_per_cell': ( 184 | merged_df.groupby('CB')['FusionName'].nunique().mean()), 185 | }, orient='index') 186 | .T 187 | .to_csv(args.cell_summary_out, sep="\t", index=False) 188 | ) 189 | 190 | # Summary of unmatched reads 191 | logger.info( 192 | ( 193 | "\n**Summary:**" 194 | f"Matched reads with barcode/UMI: {len(merged_df)}\n" 195 | f"Reads missing barcode/UMI info: {len(unmatched_reads)}") 196 | ) 197 | 198 | if unmatched_reads: 199 | with open(args.unmatched_reads_out, "w") as uf: 200 | for read in unmatched_reads: 201 | uf.write(f"{read}\n") 202 | logger.info("Process complete!") 203 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of scripts for results models.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/parse_kit_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Read in and validate user sample data.""" 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Create argument parser.""" 12 | parser = wf_parser("read_samples") 13 | parent_parser = wf_parser("read_samples_parent") 14 | 15 | parent_parser.add_argument( 16 | "--kit_config", 17 | help="Kit-specific details CSV", 18 | type=Path, 19 | required=True 20 | ) 21 | parent_parser.add_argument( 22 | "--sample_ids", 23 | help="File with IDs from each sample", 24 | type=Path, 25 | required=True 26 | ) 27 | parent_parser.add_argument( 28 | "--output", 29 | help="Output path for merged config", 30 | type=Path, 31 | required=True 32 | ) 33 | 34 | subparsers = parser.add_subparsers(help='commands', dest="cmd") 35 | 36 | parser_sheet = subparsers.add_parser( 37 | 'from_sheet', help='Get kit metadata per sample from sample sheet', 38 | parents=[parent_parser] 39 | ) 40 | parser_sheet.add_argument( 41 | "--user_config", 42 | help="User sample metadata CSV file", 43 | type=Path, 44 | required=True 45 | ) 46 | 47 | parser_cli = subparsers.add_parser( 48 | "from_cli", 49 | help='Apply the same kit metadata to all samples from CLI variables', 50 | parents=[parent_parser], 51 | ) 52 | parser_cli.add_argument( 53 | "--kit", 54 | help="10x kit (name:version)", 55 | required=True 56 | ) 57 | parser_cli.add_argument( 58 | "--expected_cells", 59 | help="Number of expected cells", 60 | required=True 61 | ) 62 | 63 | return parser 64 | 65 | 66 | def main(args): 67 | """Entry point.""" 68 | # Single cell sample sheet expected header 69 | sc_sample_sheet_header = [ 70 | 'sample_id', 71 | 'kit', 72 | 'expected_cells' 73 | ] 74 | 75 | sample_ids = pd.read_csv(args.sample_ids, index_col=None, header=None)[0].to_list() 76 | 77 | if args.cmd == 'from_cli': 78 | # No per-sample single-cell sample sheet given by user, so we will use the 79 | # individual CLI parameters to build a CSV and apply the same parameters to 80 | # each sample 81 | entries = [ 82 | [sid.strip(), args.kit, args.expected_cells] 83 | for sid in sample_ids 
84 | ] 85 | user_df = pd.DataFrame.from_records( 86 | entries, columns=sc_sample_sheet_header 87 | ) 88 | elif args.cmd == 'from_sheet': 89 | user_df = pd.read_csv(args.user_config) 90 | 91 | # Validate sample sheet header 92 | if len(set(sc_sample_sheet_header).difference(set(user_df.columns))) != 0: 93 | raise ValueError( 94 | 'single_cell_sample_sheet should have the following column names: ' 95 | f'{sc_sample_sheet_header}') 96 | 97 | # Validate kit + version combinations. 98 | kit_df = pd.read_csv(args.kit_config) 99 | 100 | # Check if all supplied kits + version strings are supported 101 | kit_and_version_diff = set(user_df.kit).difference(kit_df.kit) 102 | if len(kit_and_version_diff) != 0: 103 | raise ValueError( 104 | 'the following are not valid kit and version combinations: ' 105 | f'{kit_and_version_diff}') 106 | 107 | # Check that ingressed IDs match sample_ids from sample_sheet 108 | if set(user_df['sample_id']) != set(sample_ids): 109 | raise ValueError( 110 | 'Sample IDs from the sc_sample_sheet must match those inferred ' 111 | 'from the input data:' 112 | f'\nSample IDs from ingressed data: {sample_ids}' 113 | f'\nSample IDs from sample sheet: {user_df.sample_id.to_list()}' 114 | f'\nSample IDs in sample sheet but not in ingressed data: ' 115 | f'{set(user_df["sample_id"]).difference(sample_ids)}' 116 | f'\nSample IDs in ingressed data but not in sample sheet: ' 117 | f'{set(sample_ids).difference(user_df["sample_id"])}' 118 | ) 119 | 120 | merged_config = user_df.merge( 121 | kit_df, on='kit', how='left', suffixes=(None, '_delete')) 122 | # Create kit name and version columns from the kit:version string 123 | merged_config[['kit_name', 'kit_version']] \ 124 | = merged_config['kit'].str.split(':', expand=True) 125 | cols_to_drop = merged_config.columns[merged_config.columns.str.contains('delete')] 126 | merged_config = merged_config.drop(cols_to_drop, axis=1) 127 | merged_config.to_csv(args.output, sep=',', index=False) 128 | -------------------------------------------------------------------------------- /bin/workflow_glue/prepare_report_data.py: -------------------------------------------------------------------------------- 1 | """Prepare data for the report.""" 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.io import mmread 7 | 8 | from .adapter_scan_vsearch import AdapterSummary # noqa: ABS101 9 | from .create_matrix import ExpressionSummary # noqa: ABS101 10 | from .util import get_named_logger, wf_parser # noqa: ABS101 11 | 12 | 13 | # TODO: Code in this script should move into the main report.py script 14 | 15 | 16 | def argparser(): 17 | """Argument parser for entrypoint.""" 18 | parser = wf_parser("prepare_report_data") 19 | 20 | parser.add_argument( 21 | "sample_id", 22 | help="ID of the sample being processed") 23 | parser.add_argument( 24 | "adapter_stats", type=Path, 25 | help="Adapter configuration summary statistics") 26 | parser.add_argument( 27 | "bam_stats", type=Path, 28 | help="Alignment summary statistics") 29 | parser.add_argument( 30 | "expression_stats", type=Path, 31 | help="Expression summary statistics") 32 | parser.add_argument( 33 | "white_list", 34 | help="Cell barcode whitelist") 35 | parser.add_argument( 36 | "survival_out", type=Path, 37 | help="Output TSV with survival data for each stage.") 38 | parser.add_argument( 39 | "bam_stats_out", type=Path, 40 | help="Output TSV with combined alignment summary stats.") 41 | parser.add_argument( 42 | "raw_gene_expression", type=Path, 43 | help="Sparse
data in MEX format.") 44 | parser.add_argument( 45 | "matrix_stats", type=Path, 46 | help="TSV file with matrix summary stats.") 47 | parser.add_argument( 48 | "genes_of_interest", type=Path, 49 | help="File listing genes of interest, one per line.") 50 | parser.add_argument( 51 | "n_input_seqs", type=int, 52 | help="Number of seqs input to the workflow after read quality filtering.") 53 | return parser 54 | 55 | 56 | def combine_bam_stats(input_dir, sample_id): 57 | """Aggregate alignment statistics.""" 58 | dfs = [] 59 | colnames = { 60 | "PrimAln": "primary", 61 | "SecAln": "secondary", 62 | "SupAln": "supplementary", 63 | "Unmapped": "unmapped", 64 | "TotalReads": "reads_aligned" 65 | 66 | } 67 | for stats in input_dir.glob('*.tsv'): 68 | dfs.append(pd.read_csv( 69 | stats, sep='\t', 70 | usecols=colnames.keys(), 71 | dtype=int 72 | )) 73 | df = pd.concat(dfs) 74 | df = pd.DataFrame(df.sum(axis=0)).T 75 | df = df.rename(columns=colnames) 76 | df.insert(0, 'sample', sample_id) 77 | df.insert(1, 'reads_aligned', df.pop('reads_aligned')) 78 | 79 | return df 80 | 81 | 82 | def combine_expression_stats(input_dir): 83 | """Summarise expression summary files.""" 84 | fnames = list(input_dir.glob("*.json")) 85 | if len(fnames) == 0: 86 | raise IOError("No summary JSON files found.") 87 | 88 | summary = ExpressionSummary.from_json(fnames[0]) 89 | if len(fnames) > 1: 90 | for other in fnames[1:]: 91 | summary += ExpressionSummary.from_json(other) 92 | return summary 93 | 94 | 95 | def combine_adapter_stats(input_dir): 96 | """Combine adapter configuration summary files.""" 97 | fnames = list(input_dir.glob("*.json")) 98 | if len(fnames) == 0: 99 | raise IOError("No summary JSON files found.") 100 | 101 | summary = AdapterSummary.from_json(fnames[0]) 102 | if len(fnames) > 1: 103 | for other in fnames[1:]: 104 | summary += AdapterSummary.from_json(other) 105 | return summary 106 | 107 | 108 | def get_total_cells(white_list): 109 | """Count the total number of cells in the whitelist.""" 110 | # ok this is a little cheesy, but consistent for ease 111 | total_cells = len(pd.read_csv(white_list, sep='\t', header=None)) 112 | return {"cells": total_cells} 113 | 114 | 115 | def get_genes_of_interest_expression(mex_dir, genes): 116 | """Get a subset of the expression data. 117 | 118 | Given a list of genes, extract corresponding expression data from the MEX format 119 | matrix.
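Barcode column names have their '-1' suffix stripped, and any requested gene absent from the features file is skipped.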
120 | """ 121 | genes_to_plot = pd.read_csv(genes, header=None)[0] 122 | matrix = mmread(mex_dir / 'matrix.mtx.gz') 123 | barcodes = pd.read_csv(mex_dir / 'barcodes.tsv.gz', header=None) 124 | # Remove '-1' suffix from barcodes 125 | barcodes = barcodes[0].str.split('-', expand=True)[0] 126 | features = pd.read_csv(mex_dir / 'features.tsv.gz', sep='\t', header=None)[1] 127 | rows = [] 128 | for gene in genes_to_plot: 129 | try: 130 | feature_idx = features[features == gene].index[0] 131 | rows.append( 132 | [gene, np.array(matrix.getrow(feature_idx).todense()).flatten()]) 133 | except IndexError: 134 | continue # no data 135 | if len(rows) > 0: 136 | return ( 137 | pd.DataFrame.from_records( 138 | [i[1] for i in rows], index=[j[0] for j in rows], columns=barcodes) 139 | ) 140 | else: 141 | return pd.DataFrame() 142 | 143 | 144 | def main(args): 145 | """Entry point for script.""" 146 | logger = get_named_logger('PrepReport') 147 | logger.info('Preparing report data.') 148 | stats = dict() 149 | stats.update(combine_expression_stats(args.expression_stats)) 150 | stats.update(combine_adapter_stats(args.adapter_stats)) 151 | stats.update(get_total_cells(args.white_list)) 152 | # n seqs after any read quality filtering 153 | n_input_reads = args.n_input_seqs 154 | stats.update({'reads': n_input_reads}) 155 | matstats = pd.read_csv( 156 | args.matrix_stats, sep='\t', header=None, names=['stat', 'val']) 157 | for _, row in matstats.iterrows(): 158 | stats[row['stat']] = row['val'] 159 | stats['mean_reads_per_cell'] = stats['reads'] / stats['cells'] 160 | 161 | survival = ( 162 | pd.DataFrame.from_dict(stats, orient="index", columns=['count']) 163 | .reset_index(names="statistic")) 164 | 165 | # this is a little nonsensical for some stats 166 | survival['pct_of_input_reads'] = 100 * survival['count'] / n_input_reads 167 | survival['pct_of_fl_reads'] = 100 * survival['count'] / stats['full_length'] 168 | survival['sample_id'] = args.sample_id 169 | 170 | survival.set_index('statistic', inplace=True, drop=True) 171 | survival.to_csv(args.survival_out, sep='\t', index=True) 172 | 173 | aln_stats = combine_bam_stats(args.bam_stats, args.sample_id) 174 | aln_stats.to_csv(args.bam_stats_out, sep='\t', index=False) 175 | 176 | goi_df = get_genes_of_interest_expression( 177 | args.raw_gene_expression, args.genes_of_interest) 178 | goi_df.to_csv( 179 | Path(f"{args.sample_id}_expression") / 'raw_goi_expression.tsv', sep='\t') 180 | -------------------------------------------------------------------------------- /bin/workflow_glue/process_matrix.py: -------------------------------------------------------------------------------- 1 | """Expression counts matrix construction.""" 2 | import argparse 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import PCA 8 | import umap 9 | 10 | from .expression_matrix import ExpressionMatrix # noqa: ABS101 11 | from .util import get_named_logger, wf_parser # noqa: ABS101 12 | 13 | 14 | def argparser(): 15 | """Create argument parser.""" 16 | parser = wf_parser("exp_mat") 17 | 18 | parser.add_argument( 19 | "input", type=Path, nargs='+', 20 | help="TSV with read tag data or batched expression matrices in HDF.") 21 | parser.add_argument( 22 | "--feature", default="gene", choices=["gene", "transcript"], 23 | help="Feature to compute matrix. 
Only used when read tag input is given.") 24 | parser.add_argument( 25 | "--raw", default="raw_feature_bc_matrix", 26 | help="Output folder for raw counts MEX data.") 27 | parser.add_argument( 28 | "--processed", default="processed_feature_bc_matrix", 29 | help="Output folder for processed counts MEX data.") 30 | parser.add_argument( 31 | "--per_cell_expr", default="expression.mean-per-cell.tsv", type=Path, 32 | help="Output TSV for per-cell mean expression level.") 33 | parser.add_argument( 34 | "--per_cell_mito", default="expression.mito-per-cell.tsv", type=Path, 35 | help="Output TSV for per-cell mean mito expression level.") 36 | parser.add_argument( 37 | "--stats", type=Path, help="Output path for stats TSV.") 38 | parser.add_argument( 39 | "--text", action="store_true", help=argparse.SUPPRESS) 40 | 41 | grp = parser.add_argument_group("Filtering") 42 | grp.add_argument( 43 | "--enable_filtering", action="store_true", 44 | help="Enable filtering of matrix.") 45 | grp.add_argument( 46 | "--min_features", type=int, default=100, 47 | help="Filter out cells that contain fewer features than this.") 48 | grp.add_argument( 49 | "--min_cells", type=int, default=3, 50 | help="Filter out features that are observed in fewer than this " 51 | "number of cells") 52 | grp.add_argument( 53 | "--max_mito", type=int, default=5, 54 | help="Filter out cells where more than this percentage of counts " 55 | "belong to mitochondrial features.") 56 | grp.add_argument( 57 | "--mito_prefixes", default=["MT-"], nargs='*', 58 | help="prefixes to identify mitochondrial features.") 59 | grp.add_argument( 60 | "--norm_count", type=int, default=10000, 61 | help="Normalize to this number of counts per cell as " 62 | "is performed in CellRanger.") 63 | grp.add_argument( 64 | "--filtered_mex", default="filtered_feature_bc_matrix", 65 | help="Output folder for raw counts MEX data.") 66 | 67 | grp = parser.add_argument_group("UMAP creation") 68 | grp.add_argument( 69 | "--enable_umap", action="store_true", 70 | help="Perform UMAP on matrix.") 71 | grp.add_argument( 72 | "--umap_tsv", default="expression.umap.tsv", type=Path, 73 | help=( 74 | "UMAP TSV output file path. If --replicates is greater than 1 " 75 | "files will be named: name.index.tsv.")) 76 | grp.add_argument( 77 | "--replicates", type=int, default=1, 78 | help="Number of UMAP replicated to perform.") 79 | grp.add_argument( 80 | "--pcn", type=int, default=100, 81 | help="Number of principal components to generate prior to UMAP") 82 | grp.add_argument( 83 | "--dimensions", type=int, default=2, 84 | help="Number of dimensions in UMAP embedding") 85 | grp.add_argument( 86 | "--min_dist", type=float, default=0.1, 87 | help="Minimum distance parameter of UMAP") 88 | grp.add_argument( 89 | "--n_neighbors", type=int, default=15, 90 | help="Number of neighbors parameter of UMAP") 91 | 92 | return parser 93 | 94 | 95 | def main(args): 96 | """Make feature x cell, UMI-deduplicated, counts matrix.""" 97 | logger = get_named_logger('AggreMatrix') 98 | logger.info('Constructing count matrices') 99 | 100 | # converting to float on fly means we can save a copy when normalizing 101 | try: 102 | matrix = ExpressionMatrix.aggregate_tags(args.input, args.feature, dtype=float) 103 | except UnicodeDecodeError: 104 | matrix = ExpressionMatrix.aggregate_hdfs(args.input, dtype=float) 105 | 106 | logger.info("Removing unknown features.") 107 | if len(matrix.cells) == 0: 108 | raise ValueError("""The expression matrix contains no cells. 
109 | This may indicate an issue with data quality or volume. 110 | Incorrectly specified 10x kits/versions and reference data can also lead 111 | to removal of all data at this point.""") 112 | 113 | # Generate statistics from the assembled matrix before any filtering. 114 | stats = {} 115 | stats['median_umis_per_cell'] = matrix.median_counts 116 | stats['median_genes_per_cell'] = matrix.median_features_per_cell 117 | 118 | with open(args.stats, 'w') as fh: 119 | for k, v in stats.items(): 120 | fh.write(f'{k}\t{v}\n') 121 | 122 | # Begin filtering 123 | matrix.remove_unknown() 124 | 125 | logger.info("Writing raw counts to file.") 126 | if args.text: 127 | matrix.to_tsv(args.raw, args.feature) 128 | else: 129 | matrix.to_mex(args.raw, dtype=int) 130 | 131 | if args.enable_filtering: 132 | logger.info("Filtering, normalizing and log-transforming matrix.") 133 | matrix = ( 134 | matrix 135 | .remove_cells_and_features(args.min_features, args.min_cells) 136 | .remove_skewed_cells( 137 | args.max_mito / 100, args.mito_prefixes, 138 | fname=args.per_cell_mito, label="mito_pct") 139 | .normalize(args.norm_count) 140 | .log_transform() 141 | ) 142 | logger.info("Writing filtered matrix.") 143 | if args.text: 144 | matrix.to_tsv(args.processed, args.feature) 145 | else: 146 | matrix.to_mex(args.processed) 147 | else: 148 | logger.info("Normalizing and log-transforming matrix.") 149 | matrix.normalize(args.norm_count).log_transform() 150 | 151 | logger.info("Writing mean expression levels.") 152 | ExpressionMatrix.write_matrix( 153 | args.per_cell_expr, 154 | matrix.mean_expression, matrix.tcells, ['mean_expression'], index_name='CB') 155 | 156 | if args.enable_umap: 157 | # note, we're going to do things in place so ExpressionMatrix will 158 | # become modified (trimmed on feature axis, and transposed) 159 | mat = matrix._matrix 160 | pcn = min(args.pcn, *mat.shape) 161 | matrix._features = np.array([f"pca_{i}" for i in range(pcn)]) 162 | logger.info(f"Performing PCA on matrix of shape: {matrix.matrix.shape}") 163 | model = PCA(n_components=pcn, copy=False) 164 | mat = model.fit_transform(mat.transpose()) # warning!
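# (after this transpose rows are cells and columns are principal components; the orientation is flipped back after the UMAP loop below via `matrix._matrix = mat.transpose()`)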
165 | logger.info(f"PCA output matrix has shape: {mat.shape}") 166 | 167 | for replicate in range(args.replicates): 168 | logger.info(f"Performing UMAP replicate {replicate + 1}.") 169 | mapper = umap.UMAP( 170 | n_neighbors=args.n_neighbors, 171 | min_dist=args.min_dist, 172 | n_components=args.dimensions, 173 | verbose=0) 174 | embedding = mapper.fit_transform(mat) 175 | logger.info(f"UMAP Embedding has shape: {embedding.shape}") 176 | 177 | # would be nice to avoid a copy here, but the array is fairly small 178 | fname = str(args.umap_tsv).replace('REPEAT', str(replicate)) 179 | logger.info(f"Writing UMAP embedding {fname}.") 180 | cols = [f"D{i+1}" for i in range(args.dimensions)] 181 | out = pd.DataFrame(embedding, columns=cols, index=matrix.tcells) 182 | out.to_csv(fname, sep="\t", index=True, index_label="CB") 183 | matrix._matrix = mat.transpose() # undo the PCA transpose 184 | 185 | logger.info("Done.") 186 | -------------------------------------------------------------------------------- /bin/workflow_glue/sc_util.py: -------------------------------------------------------------------------------- 1 | """Common code to be used across workflow scripts.""" 2 | import collections 3 | import json 4 | 5 | kit_adapters = { 6 | '3prime': { 7 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 8 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 9 | }, 10 | 'multiome': { 11 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 12 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 13 | }, 14 | 'visium': { 15 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 16 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 17 | }, 18 | '5prime': { 19 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 20 | 'adapter2': 'GTACTCTGCGTTGATACCACTGCTT' 21 | } 22 | } 23 | 24 | revcomp_map = str.maketrans("ACGTacgt", "TGCAtgca") 25 | 26 | 27 | def rev_cmp(seq): 28 | """Reverse complement a DNA sequence.""" 29 | return seq[::-1].translate(revcomp_map) 30 | 31 | 32 | class StatsSummary(collections.Counter): 33 | """Summary dictionary for storing.""" 34 | 35 | fields = {} # subclasses should fill this in 36 | 37 | def __init__(self, *args, **kwargs): 38 | """Count some numbers.""" 39 | self.update(*args, **kwargs) 40 | 41 | @classmethod 42 | def from_pandas(cls, df): 43 | """Create an instance from a pandas dataframe.""" 44 | raise NotImplementedError("This method has not been implemented.") 45 | 46 | def to_dict(self): 47 | """Create dictionary with explicit zeroes.""" 48 | return {k: self[k] for k in self} 49 | 50 | @classmethod 51 | def from_json(cls, fname): 52 | """Create and instance from a JSON file.""" 53 | with open(fname, "r") as fh: 54 | data = json.load(fh) 55 | return cls(data) 56 | 57 | def to_json(self, fname): 58 | """Save to JSON.""" 59 | with open(fname, "w") as fh: 60 | json.dump(self.to_dict(), fh, indent=4) 61 | -------------------------------------------------------------------------------- /bin/workflow_glue/summarise_adapters.py: -------------------------------------------------------------------------------- 1 | """Aggregate adapter configuration summaries.""" 2 | from pathlib import Path 3 | 4 | from .adapter_scan_vsearch import AdapterSummary # noqa: ABS101 5 | from .util import get_named_logger, wf_parser # noqa: ABS101 6 | 7 | 8 | def argparser(): 9 | """Create argument parser.""" 10 | parser = wf_parser("summarise_adapters") 11 | 12 | parser.add_argument( 13 | "input_dir", type=Path, 14 | help="Path to JSON files to aggregate.") 15 | parser.add_argument( 16 | "output", type=Path, 17 | help="Path to output JSON file") 18 | 19 | return parser 20 | 21 | 
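# An illustrative invocation via the workflow-glue entry point (the subcommand name
# is derived from this module's filename; the paths here are placeholders):
#   workflow-glue summarise_adapters adapter_summaries/ combined_adapter_summary.json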
22 | def main(args): 23 | """Aggregate multiple adapter configuration summary files.""" 24 | logger = get_named_logger('AggAdptCnf') 25 | logger.info("Aggregating adapter configurations") 26 | 27 | fnames = list(args.input_dir.glob("*.json")) 28 | if len(fnames) == 0: 29 | raise IOError("No summary JSON files found.") 30 | 31 | summary = AdapterSummary.from_json(fnames[0]) 32 | if len(fnames) > 1: 33 | for other in fnames[1:]: 34 | summary += AdapterSummary.from_json(other) 35 | summary.to_json(args.output) 36 | -------------------------------------------------------------------------------- /bin/workflow_glue/tag_bam.py: -------------------------------------------------------------------------------- 1 | """Tag BAM with workflow-derived information. 2 | 3 | Tags files are TSV files containing read_id to tags mappings (such as barcodes, UMIs, 4 | assigned features). We iterate over the BAM file by chromosome, loading the tags for 5 | each chromosome individually to avoid holding all tags in memory 6 | at once. The records are tagged and output to a tagged BAM. This process only tags 7 | primary records, or supplementary records that are on the same chromosome as their 8 | primary record. 9 | 10 | To tag supplementary records, there is another tags file input that contains 11 | read_id to tag mappings for all supplementary records, formatted identically to the 12 | primary tags file. 13 | During tagging, these are all loaded into memory regardless of chromosome they map to. 14 | This allows supplementary records that map to a different chromosome than their 15 | primary alignment to be properly tagged. 16 | """ 17 | import csv 18 | from dataclasses import dataclass 19 | import itertools 20 | from pathlib import Path 21 | 22 | import pysam 23 | from .util import get_named_logger, wf_parser # noqa: ABS101 24 | 25 | logger = get_named_logger("TagBAMs") 26 | 27 | 28 | BAM_TAGS = { 29 | "corrected_barcode": "CB", 30 | "uncorrected_barcode": "CR", 31 | "quality_barcode": "CY", 32 | "corrected_umi": "UB", 33 | "uncorrected_umi": "UR", 34 | "quality_umi": "UY", 35 | "gene": "GN", 36 | "transcript": "TR" 37 | } 38 | 39 | 40 | def argparser(): 41 | """Create argument parser.""" 42 | parser = wf_parser("tag_bams") 43 | 44 | parser.add_argument( 45 | "in_bam", type=Path, 46 | help="BAM file for tagging") 47 | 48 | parser.add_argument( 49 | "out_bam", type=Path, 50 | help="Path for tagged output BAM") 51 | 52 | parser.add_argument( 53 | "tags", type=Path, 54 | help="Read tags TSV") 55 | 56 | parser.add_argument( 57 | "sa_tags", type=Path, 58 | help="Read supplementary tags TSV") 59 | 60 | parser.add_argument( 61 | "--threads", default=2, type=int, 62 | help="Number of threads used for BAM reading/writing.") 63 | return parser 64 | 65 | 66 | # The use of a dataclass here is primarily to reduce memory: 67 | # chr1, 9517964 reads. dict: 11.6 GB, class: 8.5 GB 68 | # The overhead in creating instances of the class is small 69 | # compared to the BAM writing time, and access is similarly 70 | # fast enough. 
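# Illustrative example (values are hypothetical): after `TagStore._read_file`
# has renamed the human-readable TSV columns via BAM_TAGS, a parsed row such as
#     {"read_id": "read_1", "CB": "AAACCCAAGAAACACT", "CR": "AAACCCAAGAAACACT",
#      "CY": "?" * 16, "UB": "GACTGACTGACT", "UR": "GACTGACTGACT", "UY": "?" * 12,
#      "GN": "GENE1", "TR": "TRANSCRIPT1", "chr": "chr1"}
# is turned into a `Tags` instance by `Tags.from_dict` below, and those
# attributes are what `add_tags` later writes onto each BAM record as the
# CB/CR/CY/UB/UR/UY/GN/TR tags.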
71 | 72 | @dataclass 73 | class Tags: 74 | """Storing tag data for a read.""" 75 | 76 | CB = None 77 | CR = None 78 | CY = None 79 | UB = None 80 | UR = None 81 | UY = None 82 | GN = None 83 | TR = None 84 | chrom = None 85 | 86 | @classmethod 87 | def from_dict(cls, d): 88 | """Create instance from a dictionary.""" 89 | self = cls() 90 | for k in BAM_TAGS.values(): 91 | setattr(self, k, d[k]) 92 | setattr(self, "chrom", d["chr"]) 93 | return self 94 | 95 | 96 | class TagStore: 97 | """Proxy to tag files for retrieving per-read tag information.""" 98 | 99 | def __init__(self, tags, bam=None, sa_tags=None): 100 | """Initialize an instance.""" 101 | self._sa_tags = self._load_supplementary_tags(sa_tags) 102 | self._cur = None 103 | self._single = False 104 | if tags.is_file(): 105 | self._single = True 106 | self._tags = self._read_file(tags) 107 | elif tags.is_dir(): 108 | if bam is None: 109 | raise ValueError("`bam` should be provided when `tags` is a directory.") 110 | tags = tags.glob("*.tsv") 111 | self._index = dict() 112 | for fname in tags: 113 | d = self._read_file(fname, nrows=10) 114 | try: 115 | chrom = getattr(next(iter(d.values())), "chrom") 116 | self._index[chrom] = fname 117 | except StopIteration: 118 | logger.warning(f"{fname} appears empty.") 119 | else: 120 | logger.info(f"{fname} contains tags for reference: {chrom}.") 121 | else: 122 | raise ValueError( 123 | "`tags` should be a tags file or directory containing such files.") 124 | 125 | def _load_supplementary_tags(self, sa_tags): 126 | # Load tag info for all reads with one or more suppl. records. 127 | # These are added to self._tags later regardless of chr mapping. 128 | # This enures that supplementary records are tagged even if on a different 129 | # chr to primary record. 
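        # Note (added for clarity): `sa_tags` is expected to be a directory of
        # TSV files matched by the "*.tsv" glob below; passing None simply
        # leaves the supplementary-tag store empty.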
130 | sa_tags_files = [] 131 | if sa_tags is not None: 132 | sa_tags_files = sa_tags.glob("*.tsv") 133 | sa_tags = {} 134 | for fname in sa_tags_files: 135 | sa_tags.update(self._read_file(fname)) 136 | return sa_tags 137 | 138 | def populate(self, rname): 139 | """Populate the proxy for a given reference.""" 140 | if not self._single: 141 | self._cur = rname 142 | try: 143 | self._tags = self._read_file(self._index[self._cur]) 144 | except KeyError: 145 | # No primary records for this chr, but there may be suppl records 146 | self._tags = {} 147 | self._tags.update(self._sa_tags) 148 | 149 | def _read_file(self, fname, nrows=None, cols=None): 150 | """Read a tags file.""" 151 | # note: this is actually around 50% faster than: 152 | # pd.read_csv().to_dict(orient="index") 153 | # first find and rename the fields to be tags rather than human names 154 | fields = None 155 | with open(fname) as csvfile: 156 | iterator = csv.DictReader(csvfile, delimiter="\t") 157 | fields = iterator.fieldnames 158 | for k, v in BAM_TAGS.items(): 159 | for i in range(len(fields)): 160 | if fields[i] == k: 161 | fields[i] = v 162 | # now parse the file 163 | with open(fname) as csvfile: 164 | iterator = csv.DictReader(csvfile, delimiter="\t", fieldnames=fields) 165 | next(iterator) # setting fieldnames doesn't read header 166 | if nrows is not None: 167 | iterator = itertools.islice(iterator, nrows) 168 | data = {d["read_id"]: Tags.from_dict(d) for d in iterator} 169 | return data 170 | 171 | def __getitem__(self, read_data): 172 | """Retrieve tags for a read.""" 173 | read_id, chrom = read_data 174 | try: 175 | data = self._tags[read_id] 176 | except KeyError: 177 | exp = KeyError(f"Read '{read_id}' not found in tag data.") 178 | if chrom != self._cur: 179 | self._cur = chrom 180 | self._tags = self._read_file(self._index[self._cur]) 181 | try: 182 | data = self._tags[read_id] 183 | except KeyError: 184 | raise exp 185 | else: 186 | raise exp 187 | return data 188 | 189 | 190 | def add_tags(tags, sa_tags, in_bam, out_bam, threads): 191 | """Add all the required tags to the BAM file.""" 192 | store = TagStore(tags, bam=in_bam, sa_tags=sa_tags) 193 | 194 | skipped = 0 195 | written = 0 196 | with pysam.AlignmentFile(in_bam, "rb", threads=threads) as bam_in: 197 | with pysam.AlignmentFile( 198 | out_bam, "wb", template=bam_in, threads=threads) as bam_out: 199 | for ref in bam_in.references: 200 | logger.info(f"Processing reads from reference: {ref}.") 201 | # There may be no primary records for this reference, but we'll process 202 | # it in case there are any supplementary records 203 | store.populate(ref) 204 | logger.info("Tagging reads.") 205 | for align in bam_in.fetch(ref): 206 | read_id = align.query_name 207 | try: 208 | row = store._tags[read_id] 209 | except KeyError: 210 | skipped += 1 211 | continue # don't write reads without tags 212 | else: 213 | written += 1 214 | for tag in BAM_TAGS.values(): 215 | align.set_tag(tag, getattr(row, tag), value_type="Z") 216 | bam_out.write(align) 217 | total = skipped + written 218 | written_pct = 0 219 | skipped_pct = 0 220 | if total > 0: 221 | written_pct = 100 * written / total 222 | skipped_pct = 100 * skipped / total 223 | logger.info( 224 | f"Written: {written} ({written_pct:0.2f}%). 
" 225 | f"Skipped: {skipped} ({skipped_pct:0.2f}%).") 226 | 227 | 228 | def main(args): 229 | """Entry point.""" 230 | add_tags(args.tags, args.sa_tags, args.in_bam, args.out_bam, args.threads) 231 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_adapter_scan_vsearch.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | from pathlib import Path 3 | import tempfile 4 | 5 | import pysam 6 | import pytest 7 | from pytest import fixture 8 | from workflow_glue.adapter_scan_vsearch import ( 9 | call_vsearch, complement_trans, create_stranded_reads, 10 | parse_vsearch, write_adapters_fasta) 11 | from workflow_glue.sc_util import kit_adapters 12 | 13 | 14 | @fixture 15 | def segment(): 16 | """Random sequence to build a seq by concatenating along with adapters.""" 17 | return ( 18 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 19 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 20 | 21 | 22 | @pytest.mark.parametrize( 23 | 'adapters,expected_results', 24 | [ 25 | # Non-full length reads 26 | [[], [['*', 'no_adapters', '*']]], 27 | [['adapter1_f'], [['adapter1_f', 'single_adapter1', '+']]], 28 | [['adapter2_r'], [['adapter2_r', 'single_adapter2', '-']]], 29 | [['adapter2_r', 'adapter1_f'], [['adapter2_r-adapter1_f', 'other', '*']]], 30 | # Full length reds 31 | [['adapter1_f', 'adapter2_f'], [['adapter1_f-adapter2_f', 'full_len', '+']]], 32 | # 3 adapters with one full length segment 33 | [['adapter2_r', 'adapter1_r', 'adapter1_f'], 34 | [['adapter2_r-adapter1_r', 'full_len', '-']]], 35 | # Mutiple subreads in a read 36 | [['adapter1_f', 'adapter2_f', 'adapter2_r', 'adapter1_r'], 37 | [ 38 | ['adapter1_f-adapter2_f', 'full_len', '+'], 39 | ['adapter2_r-adapter1_r', 'full_len', '-'], 40 | ]], 41 | ] 42 | ) 43 | def test_call_vsearch(adapters, expected_results, segment): 44 | """Test call_vsearch running and parsing. 45 | 46 | This is the main function of the script that calls a bunch of other functions. 47 | """ 48 | id_ = 'read_1' 49 | kits = ['3prime', '5prime', 'multiome'] 50 | 51 | for kit in kits: 52 | # Build the read 53 | adapter_seqs = [] 54 | for a in adapters: 55 | # Get name and orientation from eg: adapter2_r 56 | adapter_name, ori = a.split('_') 57 | adap = kit_adapters[kit][adapter_name] 58 | if ori == 'r': 59 | adap = adap[::-1].translate(complement_trans) 60 | adapter_seqs.append(adap) 61 | 62 | seq = segment.join(adapter_seqs) + segment 63 | 64 | fastq = ( 65 | f"@{id_}\n" 66 | f"{seq}\n" 67 | "+\n" 68 | f"{'<' * len(seq)}") 69 | 70 | fastq_file = tempfile.NamedTemporaryFile(suffix='.fq') 71 | with open(fastq_file.name, 'w') as fh: 72 | fh.write(fastq) 73 | 74 | adapter_fasta = 'adapter_seqs.fasta' 75 | write_adapters_fasta( 76 | kit_adapters[kit]['adapter1'], kit_adapters[kit]['adapter2'], 77 | adapter_fasta) 78 | 79 | vsearch_results = tempfile.NamedTemporaryFile(suffix='.fq') 80 | call_vsearch( 81 | Path(fastq_file.name), Path(vsearch_results.name), 0.7, adapter_fasta, 4) 82 | parsed_results = parse_vsearch(vsearch_results.name) 83 | 84 | # Each result can contain 0 or more subreads - 85 | # segments with consecutive pairs of compatible adapters. 
86 | parsed_results = iter(parsed_results[id_]) 87 | for exp_result in expected_results: 88 | subread_result = next(parsed_results) 89 | 90 | assert subread_result['adapter_config'] == exp_result[0] 91 | assert subread_result['lab'] == exp_result[1] 92 | assert subread_result['orig_strand'] == exp_result[2] 93 | 94 | 95 | def test_write_stranded_fastq(): 96 | """Test that the correct stranded and trimmed fastq files are being written.""" 97 | # Build a dummy fastq file containing a single read with two subreads. 98 | 99 | # This is a 3prime read 100 | seq = 't' * 10 + 'A' * 100 + 't' * 10 + 'G' * 200 101 | fastq = ( 102 | f"@read_1\n" 103 | f"{seq}\n" 104 | "+\n" 105 | f"{'<' * len(seq)}") 106 | 107 | # This config defines one read containing two subreads. 108 | config = { 109 | 'read_1': [ 110 | { 111 | 'readlen': 100, 'read_id': 'read_1_0', 'start': 10, 112 | 'end': 110, 113 | 'fl': True, 'stranded': True, 'orig_strand': '+', 114 | 'orig_adapter_config': 115 | 'adapter1_f-adapter2_f-adapter2_r-adapter1_r', 116 | 'adapter_config': 'adapter1_f-adapter2_f', 117 | 'lab': 'full_len'}, 118 | { 119 | 'readlen': 200, 'read_id': 'read_1_1', 'start': 120, 120 | 'end': 320, 121 | 'fl': True, 'stranded': True, 'orig_strand': '-', 122 | 'orig_adapter_config': 123 | 'adapter1_f-adapter2_f-adapter2_r-adapter1_r', 124 | 'adapter_config': 'adapter1_f-adapter2_f', 125 | 'lab': 'full_len'} 126 | ] 127 | } 128 | 129 | temp_fq = tempfile.NamedTemporaryFile(suffix='.fq') 130 | 131 | temp_fq_out = tempfile.NamedTemporaryFile(mode='wt', suffix='.fq') 132 | 133 | with open(temp_fq.name, 'w') as fh: 134 | fh.write(fastq) 135 | 136 | data = create_stranded_reads(temp_fq.name, config, '3prime', fl_only=True) 137 | for read in data: 138 | temp_fq_out.write(read) 139 | temp_fq_out.flush() 140 | 141 | results = [] 142 | with pysam.FastxFile(temp_fq_out.name) as fh_res: 143 | for entry in fh_res: 144 | results.append(entry) 145 | assert len(results) == 2 146 | assert len(results[0].sequence) == 100 147 | # Subread 0 should have been reverse complemented and be all 'T' 148 | assert set(results[0].sequence) == {'T'} 149 | assert len(results[1].sequence) == 200 150 | # Subread 1 should remain unchanged 151 | assert set(results[1].sequence) == {'G'} 152 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_assign_barcodes.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | from collections import Counter 3 | 4 | import pandas as pd 5 | import pytest 6 | from workflow_glue.assign_barcodes import ( 7 | determine_barcode, process_records 8 | ) 9 | 10 | 11 | @pytest.fixture 12 | def allowed_barcodes(): 13 | """Make a small barcode whitelist.""" 14 | return set( 15 | ('AAAAAAAAAAAAAAAA', 16 | 'ttAAAAAAAAAAAAAA', 17 | 'ttttAAAAAAAAAAAA', 18 | 'AAAAAAAAAAAAAccc', 19 | 'AAAggggAAAAAAAAA')) 20 | 21 | 22 | def test_calc_ed_with_allowed_barcodes(allowed_barcodes): 23 | """Test edit distance calculation.""" 24 | bc1 = 'AAAAAAAAAAAAAAAA' 25 | bc_match = determine_barcode( 26 | bc1, list(allowed_barcodes), allowed_barcodes, 2, 2, Counter()) 27 | assert bc_match == 'AAAAAAAAAAAAAAAA' 28 | 29 | # An uncorrected BC with a nearest match ED of 7 (cutoff = 6) 30 | # return no match 31 | bc2 = 'AAAAAAAAAggggggg' 32 | bc_match = determine_barcode( 33 | bc2, list(allowed_barcodes), allowed_barcodes, 2, 2, Counter()) 34 | assert bc_match == '-' 35 | 36 | 37 | @pytest.mark.parametrize("use_kmer_index", [False, True]) 38 | 
def test_process_records(tmp_path, allowed_barcodes, use_kmer_index): 39 | """Test process_records. 40 | 41 | Check if barcodes are corrected and enumerated appropriately. 42 | """ 43 | # Build some uncorrected barcodes. 44 | # The columns used in this test are read_id and CR (uncorrected barcode). The other 45 | # Columns can be any value for now 46 | header = ('read_id', 'CR', 'CY', 'UR', 'UY', 'chr', 'start', 'end', 'mapq') 47 | rows = [ 48 | # 100% match to whitelist 49 | ('read1', 'AAAAAAAAAAAAAAAA', 'qual', 'umi', 'qual' 'chr', 0, 100, 20), 50 | # This should be corrected to AAAAAAAAAAAAAAAA 51 | ('read2', 'AAAAcAAAcAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 52 | # Not corrected due to multiple hits to whitelist. 53 | # bc_match_ed <= max_ed but next_match_diff < min_ed_diff 54 | ('read3', 'tAAAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 55 | ('read4', 'AtAAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 56 | # No matches to whitelist 57 | ('read5', 'GGGGGGGGGGGGGGGG', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 58 | ('read6', 'GCGCGCGCGCGCGCGC', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 59 | # Not corrected. A hit will have been found in the initial rapidfuzz search but 60 | # none have an ED <= max_ed (2). 61 | ('read7', 'cccAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 62 | ] 63 | 64 | tags = ( 65 | pd.DataFrame(rows, columns=header) 66 | .set_index('read_id', drop=True) 67 | .assign(SA='True')) # Add constant SA column (not used in the tested code) 68 | tags_file = tmp_path / 'tags.tsv' 69 | tags.to_csv(tags_file.name, sep='\t') 70 | tags_output = tmp_path / 'tags_out.tsv' 71 | 72 | max_ed = 2 73 | min_ed_diff = 2 74 | barcode_counter, reasons_counter = process_records( 75 | tags_file.name, allowed_barcodes, 76 | max_ed, min_ed_diff, tags_output.name, 77 | use_kmer_index=use_kmer_index) 78 | 79 | result_tags_df = pd.read_csv(tags_output.name, sep='\t', index_col=0) 80 | 81 | # Just the single corrected barcode should be present: AAAAAAAAAAAAAAAA 82 | assert len(barcode_counter) == 1 83 | assert barcode_counter['AAAAAAAAAAAAAAAA'] == 2 84 | 85 | assert result_tags_df.loc['read1', 'CB'] == 'AAAAAAAAAAAAAAAA' 86 | assert result_tags_df.loc['read2', 'CB'] == 'AAAAAAAAAAAAAAAA' 87 | # Reads without a corrected barcode should not be present in the output 88 | assert 'read3' not in result_tags_df.index 89 | assert 'read4' not in result_tags_df.index 90 | assert 'read5' not in result_tags_df.index 91 | assert 'read6' not in result_tags_df.index 92 | 93 | assert dict(reasons_counter) == \ 94 | { 95 | 'bc_shortlist_exact_match': 1, # Read1 96 | 'bc_corrected': 1, # Read2 97 | 'bc_no_shortlist_match': 3, # Read5, Read6, Read7 98 | 'bc_shortlist_multiple_hits': 2 # Read3, Read4 99 | } 100 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_assign_features.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | import subprocess 3 | import tempfile 4 | 5 | import pandas as pd 6 | from workflow_glue.assign_features import ( 7 | main 8 | ) 9 | 10 | 11 | def getbam(): 12 | """Create a synthetic bam file containing adapter and barcode sequences.""" 13 | # two refrence transcripts in the BAM 14 | # One is 70 and one is 100bp long 15 | header = ( 16 | "@SQ SN:ST001 LN:2000\n" 17 | "@SQ SN:ST002 LN:1000\n") 18 | 19 | # Define the transcripts. 
Normally we wouldn't have transcripts this small as 20 | # they would not pass the size threshold in the stringtie step, but they work for 21 | # our purposes here 22 | 23 |     alns = [ 24 |         # Uniquely-mapped read 25 |         ['read_1', 200, 100, 500, 60, 'ST001', '500M'], 26 |         # read_2 mapped to two locations. ST002 should be assigned to read 2 27 |         # as it has a higher AS and read and transcript cov > 0.4 28 |         ['read_2', 150, 100, 400, 1, 'ST001', '100H400M'], 29 |         ['read_2', 200, 500, 500, 60, 'ST002', '500M'], 30 |         # read_3 maps to two locations. The second alignment has a higher AS score 31 |         # but will not be assigned as reference coverage is < 0.4 32 |         ['read_3', 150, 500, 100, 1, 'ST001', '50M400H50M'], 33 |         ['read_3', 200, 500, 150, 60, 'ST001', '200H150M200H'] 34 |     ] 35 | 36 |     sam = header 37 |     for align in alns: 38 |         qname, a_score, start, seqlen, mapq, rname, cigar = align 39 |         # Make a sam file containing the read and a quality qscore of 60. 40 |         sam += ( 41 |             f"{qname}\t0\t{rname}\t{start}\t{mapq}\t{cigar}\t*\t0\t0\t" 42 |             f"{'A' * seqlen}\t{'?' * seqlen}\tAS:i:{a_score}\n" 43 |         ) 44 | 45 |     # Write out a test BAM 46 |     with tempfile.NamedTemporaryFile( 47 |             mode='w', suffix='.sam', delete=False) as fh_sam: 48 |         fh_sam.write(sam) 49 |         sam_file = fh_sam.name 50 | 51 |     bam = 'align.bam' 52 |     subprocess.check_output(['samtools', 'view', sam_file, '-o', 'align.bam']) 53 | 54 |     return bam 55 | 56 | 57 | def test_main(): 58 |     """Test main.""" 59 |     # gffcompare tmap dataframe. Maps stringtie transcripts (qry_id) 60 |     # to reference transcripts and reference gene IDs 61 |     df_gffcompare_tmap_rows = ( 62 |         ('ST001', 'ref_tr_1', 'gene_id_1', '='), 63 |         ('ST002', 'ref_tr_2', 'gene_id_2', '='), 64 |         ('ST003', 'ref_tr_3', 'gene_id_3', '=') 65 |     ) 66 |     df_gffcompare_tmap = pd.DataFrame( 67 |         df_gffcompare_tmap_rows, columns=[ 68 |             ['qry_id', 'ref_id', 'ref_gene_id', 'class_code']] 69 |     ) 70 |     gffcompare_file = tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv').name 71 |     df_gffcompare_tmap.to_csv(gffcompare_file, sep='\t', index=None) 72 | 73 |     # All we want from tags is the mapq alignment score 74 |     df_tags_rows = ( 75 |         ('read_1', '60'), 76 |         ('read_2', '60'), 77 |         ('read_3', '30'), 78 |     ) 79 |     df_tags = pd.DataFrame( 80 |         df_tags_rows, columns=['read_id', 'mapq'] 81 |     ) 82 |     tags_file = tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv').name 83 |     df_tags.to_csv(tags_file, index=None, sep='\t') 84 | 85 |     # GTF file is used for mapping transcript id (from the gffcompare tmap file) to 86 |     # gene name. 87 |     # Here is just a subset of the gtf. We need 'transcript' in pos [2].
gene_name and 88 | # transcript_id are grepped 89 | gtf_str = ( 90 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_1";transcript_id "ref_tr_1";\n', 91 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_2";transcript_id "ref_tr_2";\n', 92 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_3";transcript_id "ref_tr_3";\n', 93 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_4";transcript_id "ref_tr_4";', 94 | ) 95 | 96 | with tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv') as fh: 97 | fh.writelines(gtf_str) 98 | gtf_file = fh.name 99 | 100 | class Args: 101 | transcriptome_bam = getbam() 102 | gffcompare_tmap = gffcompare_file 103 | tags = tags_file 104 | gtf = gtf_file 105 | output = tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False).name 106 | min_mapq = 30 107 | min_tr_coverage = 0.4 108 | min_read_coverage = 0.4 109 | chunksize = 1 110 | 111 | args = Args() 112 | main(args) 113 | 114 | result = pd.read_csv(args.output, sep='\t', index_col=0) 115 | 116 | assert result.at['read_1', 'gene'] == 'gene_name_1' 117 | assert result.at['read_1', 'transcript'] == 'ref_tr_1' 118 | 119 | assert result.at['read_2', 'gene'] == 'gene_name_2' 120 | assert result.at['read_2', 'transcript'] == 'ref_tr_2' 121 | 122 | assert result.at['read_3', 'gene'] == 'gene_name_1' 123 | assert result.at['read_3', 'transcript'] == '-' 124 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_calc_saturation.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | from unittest.mock import Mock 3 | 4 | import polars as pl 5 | from workflow_glue.calc_saturation import ( 6 | downsample_dataframe, run_jobs 7 | ) 8 | 9 | 10 | def test_run_jobs(tmp_path): 11 | """Test_downsample_reads. 12 | 13 | Check for the correct number of downsampled dataframes are returned, each with 14 | the correct size. 15 | """ 16 | args = Mock() 17 | args.read_tags = tmp_path / 'read_tags.tsv' 18 | args.output = tmp_path / 'output.tsv' 19 | args.threads = 2 20 | args.sample = 'test' 21 | 22 | # Create df with 1000 rows of fake data. 23 | with open(args.read_tags, 'w') as fh: 24 | fh.write('read_id\tcorrected_barcode\tcorrected_umi\tgene\n') 25 | row = 'id\tagtcgatcgatcgta\tatcgtacaatct\tYFG' 26 | for i in range(1000): 27 | fh.write(f'{row}\n') 28 | 29 | run_jobs(args) 30 | result = pl.read_csv(source=args.output, separator='\t') 31 | 32 | # Simply check correct number of results are returned 33 | # and that the downsampled reads are the correct size. 34 | assert len(result) == 16 35 | for row in result.iter_rows(named=True): 36 | assert row['downsamp_reads'] == 1000 * row['downsamp_frac'] 37 | 38 | 39 | def test_downsample_dataframe(): 40 | """Test calc_saturation.""" 41 | header = ['barcode', 'umi', 'gene'] 42 | 43 | rows = ( 44 | # Cell 1: 4 reads, 2 umis with two reads each, 2 genes. 45 | ('AGATAGATAGATAGAT', 'ATAGATAGATAG', 'YFG1'), 46 | ('AGATAGATAGATAGAT', 'ATAGATAGATAG', 'YFG1'), 47 | ('AGATAGATAGATAGAT', 'ccccATAGATAG', 'YFG2'), 48 | ('AGATAGATAGATAGAT', 'ccccATAGATAG', 'YFG2'), 49 | 50 | # Cell 2: 4 reads, 3 umis, 3 genes. 
51 | ('TATATATATATATATA', 'TACTACTACTAC', 'YFG3'), 52 | ('TATATATATATATATA', 'CACTACTACTCA', 'YFG4'), 53 | ('TATATATATATATATA', 'CACTACTACTCA', 'YFG4'), 54 | ('TATATATATATATATA', 'GACGACGACGAC', 'YFG5') 55 | ) 56 | 57 | df = pl.from_records( 58 | data=rows, schema=header) 59 | 60 | ( 61 | label, 62 | n_reads, 63 | reads_per_cell, 64 | genes_per_cell, 65 | umis_per_cell, 66 | umi_saturation 67 | ) = downsample_dataframe(df, 1.0) 68 | 69 | assert n_reads == 8 70 | assert reads_per_cell == 4 71 | assert genes_per_cell == 2.5 72 | assert umis_per_cell == 2.5 73 | 74 | unique_umis = 5 75 | n_reads = 8 76 | assert umi_saturation == 1 - (unique_umis / n_reads) 77 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_cluster_umis.py: -------------------------------------------------------------------------------- 1 | """Test cluster barcodes.""" 2 | import pandas as pd 3 | import pytest 4 | from workflow_glue.create_matrix import cluster_dataframe 5 | 6 | 7 | @pytest.fixture() 8 | def umi_gene_df(): 9 | """Make read tag and feature assignment DataFrames.""" 10 | # Define 3 clusters of UMIs. 11 | # each entry contains (UMI, gene name and number of UMIs 12 | clusters = [ 13 | 14 | # Cluster1 - 3 UMIS ################### 15 | # 'true' UMI 16 | ('AAAAAAAAAAAA', 'YFG1', 20), # umi1 17 | # ED to umi1 = 2. n_true > (n_umi2 * 2) - 1 18 | ('ttAAAAAAAAAA', 'YFG1', 10), # umi2 19 | 20 | # Cluster2 - single UMI ############### 21 | # ED to umi1 = 2, n_true < (n_umi2 * 2) - 1 22 | ('ggAAAAAAAAAA', 'YFG1', 15), # umi3 23 | 24 | # Cluster3 - single UMI ############### 25 | # ED to umi1 = 3, n_true > (n_umi2 * 2) - 1 26 | ('AAAAAAAAAggg', 'YFG1', 10), # umi4 27 | 28 | # Cluster4 - single UMI ############### 29 | # ED to umi1 = 1, n_true > (n_umi2 * 2) - 1, 30 | # but has a diffrent gene assignment to UMI 1 31 | ('cAAAAAAAAAAA', 'YFG2', 10), # umi5 32 | 33 | # Cluster5 - single UMI with insertion #### 34 | # Should not be returned due to incorrect UMI length 35 | ('AAAAAAAAAAAAtttt', 'YFG2', 1), # no umi 36 | 37 | ] 38 | 39 | # The actual dataframes used in the workflow will contain more columns, 40 | # but they are not used in the clustering process so are omitted for clarity. 
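    # Worked example of the count rule quoted in the comments above: umi2
    # ('ttAAAAAAAAAA', 10 reads) is collapsed into umi1 ('AAAAAAAAAAAA', 20
    # reads) because its edit distance to umi1 (2) is within the clustering
    # threshold and 20 > (10 * 2) - 1 = 19, whereas umi3 (15 reads) stays a
    # separate cluster because 20 > (15 * 2) - 1 = 29 is not satisfied.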
41 | # CB is required, but can be any non '-' string 42 | header = ('read_id', 'UR', 'gene', 'CB') 43 | records = [] 44 | read_num = 0 45 | 46 | for umi, gene, n_molecules in clusters: 47 | for _ in range(n_molecules): 48 | records.append((f'read_{read_num}', umi, gene, 'CB')) 49 | read_num += 1 50 | 51 | df = pd.DataFrame( 52 | records, columns=header).set_index( 53 | 'read_id', drop=True) 54 | 55 | return df 56 | 57 | 58 | def test_process_records(umi_gene_df): 59 | """Check that process_records is clustering and correcting UMIs appropriately.""" 60 | result = cluster_dataframe(umi_gene_df, 1000, umi_length=12) 61 | 62 | assert 'UB' in result 63 | # Check for the correct number of clusters 64 | assert result['UB'].nunique() == 4 65 | 66 | # Check that UMI2 is corrected to the 'true' UMI of cluster1 67 | assert all( 68 | result.loc[ 69 | result['UR'] == 'ttAAAAAAAAAA'].loc[:, 'UB'] == 'AAAAAAAAAAAA') 70 | 71 | # Check that the read with a UMI insertion is set to '-' 72 | assert 'AAAAAAAAAAAAtttt' not in result.UB 73 | 74 | # Check that the rest of the UMIs map back to themselves 75 | # as they are all single-UMI clusters 76 | df_no_clust1 = result[~result.UR.isin( 77 | ['AAAAAAAAAAAA', 'ttAAAAAAAAAA', 'AAAAAAAAAAAAtttt'])] 78 | assert all(df_no_clust1.UR == df_no_clust1.UB) 79 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_expression_matrix.py: -------------------------------------------------------------------------------- 1 | """Test expression matrix construction.""" 2 | import os 3 | from pathlib import Path 4 | import tempfile 5 | from unittest.mock import Mock 6 | 7 | import h5py 8 | import numpy as np 9 | import pandas as pd 10 | import pytest 11 | from workflow_glue.expression_matrix import ExpressionMatrix 12 | from workflow_glue.process_matrix import main 13 | 14 | 15 | @pytest.fixture() 16 | def tags_df(): 17 | """Make read tag dataframe.""" 18 | # Create a DataFrame with two cells, and genes 19 | # cell TTT as only one gene (g1) and one UMI (TTT) 20 | # cell AAA has 3 genes 21 | # g1 has a single UMI (TTT) 22 | # g2 has a single UMI (AAA) 23 | # g3 has two UMIS (CCC, GGG) 24 | df = pd.DataFrame({ 25 | 'gene': ['g1', 'g2', 'g3', 'g1', 'g2', 'g3'], 26 | 'corrected_barcode': ['TTT', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA'], 27 | 'corrected_umi': ['CAT', 'AAA', 'CCC', 'TTT', 'AAA', 'GGG'] 28 | }) 29 | 30 | expected_raw_result = pd.DataFrame({ 31 | 'gene': ['g1', 'g2', 'g3'], 32 | 'AAA': [1, 1, 2], 33 | 'TTT': [1, 0, 0], 34 | }) 35 | 36 | # With a min cells per gene of two, would exclude g1 and g2 37 | # TODO add more tests 38 | 39 | # Multiply by norm count, divide by total cell counts, 40 | # np.log1p transform, then divide by log(10) to get back to base 10 41 | expected_processed_result = pd.DataFrame({ 42 | 'gene': ['g1'], 43 | 'AAA': [1.04139], 44 | 'TTT': [1.04139], # np.log1p((1 * 10) / 1) / np.log(10) 45 | }) 46 | 47 | return df, expected_raw_result, expected_processed_result 48 | 49 | 50 | def test_empty_matrix(): 51 | """Test instantiating ExpressionMatrix with empty data.""" 52 | em = ExpressionMatrix( 53 | matrix=np.ndarray(shape=(0, 2)), 54 | features=np.array([], dtype=bytes), 55 | cells=np.array([], dtype=bytes) 56 | ) 57 | 58 | assert em.matrix.shape == (0, 2) 59 | assert em.features.shape == (0,) 60 | assert em.cells.shape == (0,) 61 | 62 | 63 | @pytest.fixture 64 | def empty_matrix(): 65 | """Create an empty ExpressionMatrix.""" 66 | matrix = ExpressionMatrix( 67 | matrix=np.ndarray(shape=(0, 2)), 68 | 
features=np.array([]), 69 | cells=np.array([]) 70 | ) 71 | return matrix 72 | 73 | 74 | @pytest.fixture 75 | def small_matrix(): 76 | """Create a 2x2 ExpressionMatrix.""" 77 | matrix = ExpressionMatrix( 78 | matrix=np.array([2, 2, 2, 2]).reshape((2, 2)), 79 | features=np.array(['gene1', 'gene2'], dtype=bytes), 80 | cells=np.array(['cell1', 'cell2'], dtype=bytes) 81 | ) 82 | return matrix 83 | 84 | 85 | def test_normalize_empty_matrix(empty_matrix): 86 | """Test normalizing with empty ExpressionMatrix.""" 87 | empty_matrix.normalize(10000) 88 | 89 | 90 | def test_remove_cells_empty_matrix(empty_matrix): 91 | """Test cell filtering with empty ExpressionMatrix.""" 92 | with pytest.raises( 93 | ValueError, 94 | match="Matrix is zero-sized on entry to `remove_cells`." 95 | ): 96 | empty_matrix.remove_cells(0) 97 | 98 | 99 | def test_remove_cells_becomes_empty(small_matrix): 100 | """Test cell filtering with small matrix made empty by filtering.""" 101 | with pytest.raises( 102 | ValueError, 103 | match="All cells would be removed, try altering filter thresholds." 104 | ): 105 | small_matrix.remove_cells(3) 106 | 107 | 108 | def test_remove_features_empty_matrix(empty_matrix): 109 | """Test feature filtering with empty ExpressionMatrix.""" 110 | with pytest.raises( 111 | ValueError, 112 | match="Matrix is zero-sized on entry to `remove_features`." 113 | ): 114 | empty_matrix.remove_features(0) 115 | 116 | 117 | def test_remove_features_becomes_empty(small_matrix): 118 | """Test fature filtering with small matrix made empty by filtering.""" 119 | with pytest.raises( 120 | ValueError, 121 | match="All features would be removed, try altering filter thresholds." 122 | ): 123 | small_matrix.remove_features(3) 124 | 125 | 126 | def test_remove_cells_and_features_empty_matrix(empty_matrix): 127 | """Test cell and feature filtering with empty ExpressionMatrix.""" 128 | with pytest.raises( 129 | ValueError, 130 | match="Matrix is zero-sized on entry to `remove_cells_and_features`." 131 | ): 132 | empty_matrix.remove_cells_and_features(0, 0) 133 | 134 | 135 | def test_remove_cells_and_features_becomes_empty(small_matrix): 136 | """Test cell and feature filtering made empty by filtering.""" 137 | with pytest.raises( 138 | ValueError, 139 | match="All features would be removed, try altering filter thresholds." 140 | ): 141 | small_matrix.remove_cells_and_features(3, 3) 142 | 143 | 144 | def test_remove_skewed_cells_empty_matrix(empty_matrix): 145 | """Test skewed cell filtering with empty ExpressionMatrix.""" 146 | with pytest.raises( 147 | ValueError, 148 | match="Matrix is zero-sized on entry to `remove_skewed_cells`." 149 | ): 150 | empty_matrix.remove_skewed_cells(0, ['gene']) 151 | 152 | 153 | def test_remove_skewed_cells_becomes_empty(small_matrix): 154 | """Test skewed cell filtering with empty ExpressionMatrix.""" 155 | with pytest.raises( 156 | ValueError, 157 | match="All cells would be removed, try altering filter thresholds." 158 | ): 159 | small_matrix.remove_skewed_cells(0.05, ['gene']) 160 | 161 | 162 | def test_remove_unknown_empty_matrix(empty_matrix): 163 | """Test unknown feature filtering with empty ExpressionMatrix.""" 164 | with pytest.raises( 165 | ValueError, 166 | match="Matrix is zero-sized on entry to `remove_unknown`." 
167 | ): 168 | empty_matrix.remove_unknown('-') 169 | 170 | 171 | def test_remove_unknown_becomes_empty(): 172 | """Test unknown filtering made empty by filtering.""" 173 | matrix = ExpressionMatrix( 174 | matrix=np.array([2, 2]).reshape((1, 2)), 175 | features=np.array(['-'], dtype=bytes), 176 | cells=np.array(['cell1', 'cell2'], dtype=bytes) 177 | ) 178 | with pytest.raises( 179 | ValueError, 180 | match="All features would be removed, try altering filter thresholds." 181 | ): 182 | matrix.remove_unknown('-') 183 | 184 | 185 | def test_aggregate_hdfs(): 186 | """Test the creation of an ExpressionMatrix from multiple HDF inputs.""" 187 | hdf1 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 188 | with h5py.File(hdf1.name, 'w') as fh1: 189 | fh1['cells'] = ['cell1', 'cell2'] 190 | fh1['features'] = ['f1', 'f2'] 191 | fh1['matrix'] = np.array([ 192 | [1, 2], [3, 4] 193 | ]).reshape((2, 2)) 194 | 195 | hdf2 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 196 | with h5py.File(hdf2.name, 'w') as fh2: 197 | fh2['cells'] = ['cell3', 'cell1'] 198 | fh2['features'] = ['f2', 'f1'] 199 | fh2['matrix'] = np.array([ 200 | [1, 2], [2, 4] 201 | ]).reshape((2, 2)) 202 | 203 | hdf3 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 204 | with h5py.File(hdf3.name, 'w') as fh3: 205 | fh3['cells'] = [] 206 | fh3['features'] = [] 207 | fh3['matrix'] = np.ndarray(shape=(0, 0)) 208 | 209 | em = ExpressionMatrix.aggregate_hdfs((hdf1.name, hdf2.name, hdf3.name)) 210 | 211 | np.testing.assert_array_equal(em.tcells, np.array(['cell1', 'cell2', 'cell3'])) 212 | np.testing.assert_array_equal(em.tfeatures, np.array(['f1', 'f2'])) 213 | np.testing.assert_array_equal( 214 | em.matrix, np.array([[5, 2, 2], [5, 4, 1]]) 215 | ) 216 | 217 | 218 | def test_main(tags_df): 219 | """Test the main function. 220 | 221 | :param tags_df: fixture with input test file and expected result pd.DataFrame. 
222 | :return: 223 | """ 224 | tags_df, expected_raw_result, expected_processed_result = tags_df 225 | with tempfile.TemporaryDirectory() as fh: 226 | tmp_test_dir = Path(fh) 227 | os.chdir(tmp_test_dir) 228 | tags_df.to_csv('tags.tsv', sep='\t') 229 | 230 | args = Mock() 231 | args.input = ["tags.tsv"] 232 | args.feature = 'gene' 233 | args.raw = 'raw.tsv' 234 | args.per_cell_mito = 'per_cell_mito.tsv' 235 | args.per_cell_expr = 'per_cell_expr.tsv' 236 | args.filtered_mex = 'filtered_mex.tsv' 237 | args.min_features = 1 238 | args.min_cells = 2 239 | args.max_mito = 5 240 | args.mito_prefixes = 'MT-' 241 | args.norm_count = 10 242 | args.stats = 'stats.tsv' 243 | args.processed = 'processed.tsv' 244 | args.enable_filtering = True 245 | args.text = True 246 | args.enable_umap = False 247 | args.pcn = None 248 | 249 | main(args) 250 | 251 | counts_result_df = pd.read_csv(args.raw, sep='\t', index_col=None) 252 | pd.testing.assert_frame_equal( 253 | expected_raw_result, counts_result_df, check_like=True, check_dtype=False) 254 | 255 | procs_result_df = pd.read_csv(args.processed, sep='\t', index_col=None) 256 | pd.testing.assert_frame_equal( 257 | expected_processed_result, 258 | procs_result_df, check_like=True, check_dtype=False) 259 | 260 | mito_results_df = pd.read_csv(args.per_cell_mito, sep='\t', index_col=None) 261 | assert "CB" in mito_results_df.columns 262 | assert "mito_pct" in mito_results_df.columns 263 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_extract_barcode.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | import functools 3 | import os 4 | import tempfile 5 | from unittest.mock import Mock 6 | 7 | import pandas as pd 8 | import pytest 9 | from pytest import fixture 10 | from workflow_glue import extract_barcode 11 | from ..sc_util import rev_cmp # noqa: ABS101 12 | 13 | # prevent stdout writing in align_adapter even with `pytest -s` 14 | devnull = open(os.devnull, 'w') 15 | extract_barcode.align_adapter = functools.partial( 16 | extract_barcode.align_adapter, fastq_out=devnull) 17 | 18 | 19 | def gene(): 20 | """Get a randomly-generated gene seuence.""" 21 | return ( 22 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 23 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 24 | 25 | 26 | def make_fastq( 27 | read_adapter1='CTACACGACGCTCTTCCGATCT', 28 | read_barcode='AAACCCAAGAAACACT', 29 | read_umi='GACTGACTGACT', 30 | read_polyt='T'*12, 31 | rev=True): 32 | """Create a synthetic fastq file containing adapter and barcode sequences.""" 33 | read = \ 34 | f'{read_adapter1}{read_barcode}{read_umi}{read_polyt}{gene()}' 35 | if rev: 36 | read = rev_cmp(read) 37 | 38 | # Make a FASTQ file containing the read and a quality score of 60. 39 | fastq = ( 40 | '@test_id\n' 41 | f"{read}\n" 42 | '+\n' 43 | f"{'?' 
* len(read)}\n" 44 | ) 45 | 46 | # Write out a test FASTQ 47 | with tempfile.NamedTemporaryFile( 48 | mode='w', suffix='.fastq', delete=False) as fh_fq: 49 | fh_fq.write(fastq) 50 | fq_fn = fh_fq.name 51 | 52 | return fq_fn 53 | 54 | 55 | @fixture 56 | def make_superlist(): 57 | """Make a small superlist(whitelist) of barcodes.""" 58 | superlist = ( 59 | "AAACCCAAGAAACACT\n" 60 | "AAACCCAAGAAACCAT") 61 | 62 | with tempfile.NamedTemporaryFile( 63 | mode='w', suffix='.tsv', delete=False) as fh_sl: 64 | fh_sl.write(superlist) 65 | superlist_fname = fh_sl.name 66 | return superlist_fname 67 | 68 | 69 | @fixture() 70 | def args(make_superlist): 71 | """Mock Args with workflow defaults set.""" 72 | class Args: 73 | contig = 'chr17' 74 | fastq = make_fastq() 75 | match = 5 76 | mismatch = -1 77 | acg_to_n_match = 1 78 | t_to_n_match = 1 79 | adapter1_seq = 'CTACACGACGCTCTTCCGATCT' 80 | adapter1_suff_length = 10 81 | kit = '3prime' 82 | barcode_length = 16 83 | umi_length = 12 84 | window = 100 85 | gap_open = 2 86 | gap_extend = 4 87 | max_adapter1_ed = 3 88 | min_barcode_qv = 15 89 | polyt_length = 10 90 | superlist = make_superlist 91 | verbosity = 2 92 | 93 | return Args 94 | 95 | 96 | def test_main(args): 97 | """Test the final output from main().""" 98 | # Make temp files to store the output 99 | counts_file = tempfile.NamedTemporaryFile( 100 | mode='w', suffix='.tsv') 101 | tags_file = tempfile.NamedTemporaryFile( 102 | mode='w', suffix='.tsv') 103 | trimmed_fastq_file = tempfile.NamedTemporaryFile( 104 | mode='w', suffix='.tsv') 105 | 106 | args.fastq = make_fastq(read_adapter1=args.adapter1_seq, rev=True) 107 | args.output_barcode_counts = counts_file.name 108 | args.output_read_tags = tags_file.name 109 | args.output_trimmed_fastq = trimmed_fastq_file.name 110 | extract_barcode.main(args) 111 | 112 | # Barcode we expect to find in the input BAM 113 | # For 3prime this will be reverse 114 | expected_barcode = 'AAACCCAAGAAACACT' 115 | 116 | counts_result = pd.read_csv(counts_file.name, sep='\t') 117 | assert counts_result.shape == (1, 2) 118 | assert counts_result.iat[0, 0] == expected_barcode 119 | assert counts_result.iat[0, 1] == 1 120 | 121 | tags_result = pd.read_csv(tags_file.name, sep='\t', index_col=0) 122 | 123 | # Check we have correct number of rows returned # read_id(index), CR, CY, UR, UY 124 | assert tags_result.shape == (1, 4) 125 | assert tags_result.loc['test_id', 'CR'] == 'AAACCCAAGAAACACT' 126 | assert tags_result.loc['test_id', 'UR'] == 'GACTGACTGACT' 127 | 128 | # TODO: test if barcode missing from superlist 129 | 130 | 131 | @pytest.mark.parametrize( 132 | 'adapter1_seq,tags_results_shape,counts_results_shape', 133 | [ 134 | # Tags files should have 5 columns (read_id, CR, CY, UR, UY) , 135 | # counts results should have one column (count) with read_id index 136 | ['CTACACGACGCTCTTCCGATCT', (1, 5), (1, 1)], # ED 0 137 | ['CTACACGACGCTCTTCCGAggg', (1, 5), (1, 1)], # ED 3 138 | ['CTACACGACGCTCTTCCGgggg', (0, 5), (0, 1)] # ED 4; no results 139 | ] 140 | ) 141 | def test_align_adapter(args, adapter1_seq, tags_results_shape, counts_results_shape): 142 | """Test the identification of adapter1 sequences. 143 | 144 | algin_adapter() should return results with a max adapter1 edit distance, 145 | which defaults to 3. 
146 | """ 147 | tags_file = tempfile.NamedTemporaryFile( 148 | mode='w', suffix='.tsv') 149 | trimmed_fastq_file = tempfile.NamedTemporaryFile( 150 | mode='w', suffix='.tsv') 151 | args.fastq = make_fastq(read_adapter1=adapter1_seq, rev=True) 152 | args.output_read_tags = tags_file.name 153 | args.kit = '3prime' 154 | args.output_trimmed_fastq = trimmed_fastq_file.name 155 | df_counts = extract_barcode.align_adapter(args) 156 | assert df_counts.shape == counts_results_shape 157 | 158 | df_tags = pd.read_csv(tags_file.name, sep='\t') 159 | assert df_tags.shape == tags_results_shape 160 | 161 | 162 | def ascii_decode_qscores(string): 163 | """Convert ASCII character quality values into integers.""" 164 | return list(map(lambda x: ord(x) - 33, string)) 165 | 166 | 167 | def ascii_encode_qscores(integers): 168 | """Convert integer quality values into ASCII characters.""" 169 | return "".join(map(lambda x: chr(x + 33), integers)) 170 | 171 | 172 | @pytest.mark.parametrize( 173 | # 3 prime tests 174 | 'query,query_aln,query_ascii_q,expected_adapter1_ed', 175 | [ 176 | # Adapter 1 BC UMI polyT 177 | 178 | # 100% match of the query adapter1 and the 10bp adapter1 prefix in the ref probe 179 | ["CTACACGACGCTCTTCCGATCT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 180 | " CTTCCGATCT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 181 | "?????????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 182 | 0], 183 | 184 | # 2 bp substitution in the adapter1 query 185 | ["CTACACGACGCTCTTCCGATaa AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 186 | " CTTCCGATaa AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 187 | "?????????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 188 | 2], 189 | 190 | # 2 bp deletion in the adapter1 query 191 | ["CTACACGACGCTCTTCCGAT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 192 | " CTTCCGAT-- AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 193 | "???????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 194 | 2], 195 | 196 | ] 197 | ) 198 | def test_parse_probe_alignment(query, query_aln, query_ascii_q, expected_adapter1_ed): 199 | """Test_parse_probe_alignment. 200 | 201 | In this test, a mocked parasail alignment is created. We want to test that the 202 | correct barcode, UMI and associated quality scores are extracted from the 203 | query. 204 | 205 | :param query: read query 206 | :param query_aln: the query alignment that would result from parasail alignment to 207 | the reference probe 208 | :param query_ascii_q: the ascii-encoded qualitey string associated with the query 209 | :param: expected_adapter1_ed: the expected edit distance of the adapter1 210 | """ 211 | # Build a mock parasail alignment result. Although there would be geen sequrnce 212 | # after the polyT, we can omit it here. 213 | 214 | # This is the read including the full 22bp adapter1 probe 215 | # adapter1 BC UMI PolyT 216 | barcode, umi = query.split()[1:3] 217 | barcode_q, umi_q = query_ascii_q.split()[1:3] 218 | query = query.replace(' ', '') 219 | query_aln = query_aln.replace(' ', '') 220 | query_ascii_q = query_ascii_q.replace(' ', '') 221 | qual_ints = ascii_decode_qscores(query_ascii_q) 222 | 223 | # The parasail reference alignment. 
Contains only the 10 bp suffix of the adapter1 224 | # For 3prime kit 225 | ref_align = ( 226 | # 10 bp A1 Ns for BC Ns for UMI PolyT 227 | "CTTCCGATCT NNNNNNNNNNNNNNNN NNNNNNNNNNNN TTTTTTTTTTTT" 228 | ).replace(' ', '') 229 | 230 | p_alignment = Mock() 231 | p_alignment.traceback.query = query_aln 232 | p_alignment.traceback.ref = ref_align 233 | 234 | adapter1_probe_suffix = 'CTTCCGATCT' # Forward seq 235 | 236 | ( 237 | adapter1_editdist, barcode_result, umi_result, 238 | bc_qscores, umi_qscores 239 | ) = extract_barcode.parse_probe_alignment( 240 | p_alignment, adapter1_probe_suffix, 241 | 16, 12, qual_ints, query) 242 | 243 | # convert the return qscores to ascii-encoded values 244 | bc_qscores = ascii_encode_qscores(bc_qscores) 245 | umi_qscores = ascii_encode_qscores(umi_qscores) 246 | 247 | assert adapter1_editdist == expected_adapter1_ed 248 | assert barcode_result == barcode 249 | assert umi_result == umi 250 | assert bc_qscores == barcode_q 251 | assert umi_qscores == umi_q 252 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_format_ctat_output.py: -------------------------------------------------------------------------------- 1 | """Tests for the format_ctat_output module.""" 2 | 3 | from workflow_glue.format_ctat_output import load_fusion_data 4 | 5 | 6 | def test_load_fusion_data_valid_file(tmp_path): 7 | """Test load_fusion_data with a valid fusion file.""" 8 | # Create a temporary valid fusion file 9 | fusion_file = tmp_path / "fusions.tsv" 10 | fusion_file.write_text( 11 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 12 | "\tRightBreakpoint\tSpliceType\n" 13 | 14 | "Fusion1\tread1,read2\tGeneA\tGeneB\tchr1:100\tchr2:200\tSpliceA\n" 15 | "Fusion2\tread3\tGeneC\tGeneD\tchr3:300\tchr4:400\tSpliceB\n" 16 | ) 17 | 18 | fusion_dict = load_fusion_data(fusion_file) 19 | 20 | assert fusion_dict is not None 21 | assert len(fusion_dict) == 3 # 3 unique reads 22 | 23 | 24 | def test_load_fusion_data_empty_file(tmp_path): 25 | """Test load_fusion_data with an empty fusion file.""" 26 | fusion_file = tmp_path / "empty.tsv" 27 | fusion_file.write_text("") 28 | 29 | fusion_dict = load_fusion_data(fusion_file) 30 | 31 | assert fusion_dict is None 32 | 33 | 34 | def test_load_fusion_data_no_entries(tmp_path): 35 | """Test load_fusion_data with a fusion file containing no valid entries.""" 36 | # Create a fusion file with no valid entries 37 | fusion_file = tmp_path / "no_entries_fusion.tsv" 38 | fusion_file.write_text( 39 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 40 | "\tRightBreakpoint\tSpliceType\n" 41 | ) 42 | 43 | fusion_dict = load_fusion_data(fusion_file) 44 | 45 | assert fusion_dict is None 46 | 47 | 48 | def test_load_fusion_data_duplicate_reads(tmp_path): 49 | """Test load_fusion_data with duplicate read IDs.""" 50 | # Create a fusion file with duplicate read IDs 51 | fusion_file = tmp_path / "duplicate_reads_fusion.tsv" 52 | fusion_file.write_text( 53 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 54 | "\tRightBreakpoint\tSpliceType\n" 55 | 56 | "Fusion1\tread1,read1\tGeneA\tGeneB\tchr1:100\tchr2:200\tSpliceA\n" 57 | ) 58 | 59 | fusion_dict = load_fusion_data(fusion_file) 60 | 61 | assert fusion_dict is not None 62 | assert len(fusion_dict) == 1 # Only 1 unique read 63 | assert len(fusion_dict["read1"]) == 2 # 2 entries for the same read ID 64 | -------------------------------------------------------------------------------- 
/bin/workflow_glue/tests/test_tag_bam.py: -------------------------------------------------------------------------------- 1 | """Test tag_bam.py".""" 2 | import subprocess as sub 3 | 4 | import pandas as pd 5 | import pysam 6 | import pytest 7 | from workflow_glue import tag_bam 8 | 9 | 10 | def make_bam(tmp_path, bam_entries): 11 | """Make a BAM file.""" 12 | read = ( 13 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 14 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 15 | chrs = set([x[1] for x in bam_entries]) 16 | 17 | # Create the BAM file to be tagged 18 | header = '\n'.join([f'@SQ SN:{chr_} LN:10000000' for chr_ in chrs]) 19 | 20 | entries = [f'{header}'] 21 | 22 | for records in bam_entries: 23 | # Make a sam file containing the read and a quality qscore of 60. 24 | id_, chr_, flag, sa_tag = records 25 | entries.append( 26 | f"{id_}\t{flag}\t{chr_}\t1\t60\t{len(read)}M\t*\t0\t0\t" 27 | f"{read}\t{'?' * len(read)}\t{sa_tag}" 28 | ) 29 | sam = '\n'.join(entries) 30 | # Write out a test BAM 31 | sam_file = tmp_path / 'align.sam' 32 | with open(sam_file, 'w') as fh_sam: 33 | fh_sam.write(sam) 34 | 35 | test_bam = tmp_path / 'align.bam' 36 | sub.check_output(['samtools', 'view', sam_file, '-o', test_bam]) 37 | sub.check_output(['samtools', 'index', test_bam]) 38 | return test_bam 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "tags,prim_records,supp_records", 43 | [ 44 | ( # A single entry from a tag file representing a primary alignment. 45 | [ 46 | { 47 | 'read_id': 'read1', 48 | 'CR': 'AAAAAAAAAAAAAgAA', 49 | 'CB': 'AAAAAAAAAAAAAaAA', 50 | 'CY': '????????????????', 51 | 'UR': 'GGGGGtGGGGGG', 52 | 'UB': 'GGGGGGGGGGGG', 53 | 'UY': '????????????', 54 | 'GN': 'YFG', 55 | 'TR': 'YFT', 56 | 'chr': 'chr1', 57 | 'start': 1000, 58 | 'end': 2000 59 | }, 60 | { 61 | 'read_id': 'read2', 62 | 'CR': 'GCGCGCGCGCGCGCGc', 63 | 'CB': 'GCGCGCGCGCGCGCCC', 64 | 'CY': '????????????????', 65 | 'UR': 'TTTTTTTaTTTT', 66 | 'UB': 'TTTTTTTTTTTT', 67 | 'UY': '????????????', 68 | 'GN': 'YFG2', 69 | 'TR': 'YFT2', 70 | 'chr': 'chr9', 71 | 'start': 1000, 72 | 'end': 2000 73 | } 74 | ], 75 | # Primary records 76 | [ 77 | ('read1', 'chr1', 0, "SA:Z:chr2,10000,+,10S100M1S,60,11"), 78 | ('read2', 'chr9', 0, "")], 79 | # Supplementary records 80 | [('read1', 'chr2', 2048, "")] 81 | ) 82 | ] 83 | ) 84 | def test_add_tags(tmp_path, tags, prim_records, supp_records): 85 | """Check that the output BAMs are tagged correctly.""" 86 | bam_out = tmp_path / 'test_tags.bam' 87 | 88 | tags_to_test = ['CR', 'CB', 'CY', 'UR', 'UB', 'UY', 'GN', 'TR'] 89 | 90 | tag_rows = [] 91 | for tag_entry in tags: 92 | tag_rows.append(pd.DataFrame.from_dict(tag_entry, orient='index').T) 93 | tags_df = pd.concat(tag_rows, axis=0) 94 | tags_df.set_index('read_id', drop=True, inplace=True) 95 | 96 | # Get the SA tags; a subset of the primary tags that have a suppl record 97 | supp_read_ids = [x[0] for x in supp_records] 98 | sa_tags_df = tags_df.loc[supp_read_ids] 99 | 100 | # Create primary and supplementary tag files in temporary directories 101 | prim_tags_dir = tmp_path / 'tags' 102 | prim_tags_dir.mkdir() 103 | sa_tags_dir = tmp_path / 'sa_tags' 104 | sa_tags_dir.mkdir() 105 | 106 | # Write the per chr primary tags file 107 | for chr_, chr_df in tags_df.groupby('chr'): 108 | chr_df.to_csv(prim_tags_dir / f'{chr_}.tsv', sep='\t') 109 | 110 | sa_tags_file = sa_tags_dir / 'sa_tags.tsv' 111 | sa_tags_df.to_csv(sa_tags_file, sep='\t') 112 | 113 | test_bam = make_bam(tmp_path, prim_records + supp_records) 114 | 115 | # Run the test 116 
| tag_bam.add_tags(prim_tags_dir, sa_tags_dir, test_bam, bam_out, threads=1) 117 | 118 | # Check that the correct tags have been set on primary and supplementary 119 | # record. 120 | primary_tagged = 0 121 | supp_tagged = 0 122 | with pysam.AlignmentFile(bam_out, "rb") as bam_result: 123 | for align in bam_result: 124 | expected_tags = tags_df.loc[align.query_name] 125 | for expected_tag, expected_value in expected_tags.items(): 126 | if expected_tag in tags_to_test: 127 | assert align.get_tag(expected_tag) == expected_value 128 | if align.is_supplementary: 129 | supp_tagged += 1 130 | else: 131 | primary_tagged += 1 132 | 133 | assert primary_tagged == len(prim_records) 134 | assert supp_tagged == len(supp_records) 135 | 136 | 137 | def test_empty_file(tmp_path): 138 | """Test giving a header-only tags file, in a tags directory, to tag_bams.""" 139 | input_bam = make_bam(tmp_path, [('read1', 'chr1', 0, '')]) 140 | tags_header = ( 141 | 'read_id', 'CR', 'CB', 'CY', 'UR', 'UB', 'UY', 'chr', 'start', 'end', 'gene', 142 | 'transcript') 143 | tags_df = ( 144 | pd.DataFrame(columns=tags_header) 145 | .set_index('read_id', drop=True) 146 | .rename( 147 | columns={v: k for k, v in tag_bam.BAM_TAGS.items()}) 148 | ) 149 | tmp_test_dir = tmp_path / 'tags' 150 | tmp_test_dir.mkdir() 151 | tmp_sa_dir = tmp_path / 'sa_tags' 152 | tmp_sa_dir.mkdir() 153 | header_only_file = tmp_test_dir / 'test_tags.tsv' 154 | header_only_sa_file = tmp_sa_dir / 'test_sa_tags.tsv' 155 | tags_df.to_csv(header_only_file, sep='\t') 156 | tags_df.to_csv(header_only_sa_file, sep='\t') 157 | out_bam = tmp_path / 'out.bam' 158 | tag_bam.add_tags(tmp_test_dir, tmp_sa_dir, input_bam, out_bam, threads=1) 159 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 
26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/variant_mex.py: -------------------------------------------------------------------------------- 1 | 2 | """Make sparse snv x cell matrix. 3 | 4 | TODO: We may want the expression matrix to use this code. 5 | Or modify the expression matrix code to deal with genotype matrices 6 | """ 7 | import gzip 8 | from pathlib import Path 9 | import sys 10 | 11 | import pandas as pd 12 | import pysam 13 | 14 | 15 | from .util import wf_parser # noqa: ABS101 16 | 17 | 18 | def argparser(): 19 | """Create argument parser.""" 20 | parser = wf_parser("MEX matrix") 21 | 22 | parser.add_argument( 23 | "vcf_in", help="VCF file") 24 | parser.add_argument( 25 | "mex_out_dir", help="MEX output directory") 26 | parser.add_argument( 27 | "--report_vars", 28 | help="List of variant ot report (chr_pos_ref_alt)", default=None) 29 | 30 | return parser 31 | 32 | 33 | def main(args): 34 | """Write a matrix to disk in mtx format. 35 | 36 | :param args.matrix: Path to encoded genotype matrix file with encoding: 37 | hom ref: 0 38 | het: 1 39 | hom alt: 2 40 | no data: -1 41 | """ 42 | # Full (non-sparse) matrix variants to write to a TSV file. 
43 | for_report = [] 44 | max_report_vars = 50 45 | report_vars_written = 0 46 | 47 | if args.report_vars: 48 | # Load the list of interesting variants 49 | with open(args.report_vars, 'r') as fh: 50 | report_vars = [line.strip() for line in fh] 51 | 52 | mex_folder = Path(args.mex_out_dir) 53 | mex_folder.mkdir(parents=True, exist_ok=True) 54 | 55 | vcf = pysam.VariantFile(args.vcf_in, threads=6) 56 | samples = list(vcf.header.samples) 57 | 58 | fhf = gzip.open(mex_folder / "features.tsv.gz", 'wt') 59 | 60 | n_rows = 0 61 | n_vars = 0 62 | 63 | with fhf as fh_feat: 64 | 65 | for i, record in enumerate(vcf.fetch()): 66 | n_rows += 1 67 | write_for_report = False 68 | if args.report_vars: 69 | if f"{record.chrom}_{record.pos}" in report_vars: 70 | if report_vars_written < max_report_vars: 71 | write_for_report = True 72 | report_vars_written += 1 73 | else: 74 | # Just get the first n variants to show in the report 75 | if report_vars_written < max_report_vars: 76 | write_for_report = True 77 | report_vars_written += 1 78 | variant_id = f"{record.chrom}_{record.pos}_{record.ref}_{record.alts[0]}" 79 | fh_feat.write(variant_id + '\n') 80 | for j, sample in enumerate(samples): 81 | genotype = record.samples[sample]['GT'] # Get genotype 82 | # numerically-encode diploid genotype 83 | try: 84 | gt_val = sum(allele for allele in genotype) 85 | except TypeError: 86 | gt_val = -1 # Missing genotype 87 | if write_for_report: 88 | for_report.append( 89 | (variant_id, sample, gt_val)) 90 | if gt_val == -1: 91 | continue # Skip missing genotypes from sparse matrix 92 | n_vars += 1 93 | # 1-based indexing for mtx format 94 | sys.stdout.write(f"{i + 1} {j + 1} {gt_val}\n") 95 | with gzip.open(mex_folder / "barcodes.tsv.gz", 'wt') as fh: 96 | for col in samples: 97 | fh.write(f"{col}\n") 98 | 99 | header = (( 100 | '%%MatrixMarket matrix coordinate integer general\n' 101 | '%metadata_json:' 102 | '{"software_version": "ont-single-cell","format_version": 2}\n' 103 | f'{n_rows} {len(samples)} {n_vars} \n') 104 | ) 105 | with open('header.txt', 'w') as f: 106 | f.write(header) 107 | 108 | # Write full matrix of interesting variants to a TSV file 109 | df_top = pd.DataFrame.from_records( 110 | for_report, columns=['variant', 'sample', 'gt_val']) 111 | df_top = df_top.pivot( 112 | index='variant', columns='sample', values='gt_val').fillna(-1) 113 | df_top.to_csv("top_snvs.tsv", sep='\t', index=True) 114 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop 
over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
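# For example, the UTF-8 BOM is the three-byte prefix 0xEF 0xBB 0xBF; decoded with
# the plain "utf-8" codec it becomes "\ufeff" attached to the first header field,
# so a header beginning with "barcode" would be read as "\ufeffbarcode" and the
# column lookup below would fail.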
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
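    # variables by the calling process; the value written is, for example,
    # `HAS_VALID_INDEX=1` when the index is present and valid, or `HAS_VALID_INDEX=0`.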
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. 
'2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /data/3M-3pgex-may-2023.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-3pgex-may-2023.txt.gz -------------------------------------------------------------------------------- /data/3M-5pgex-jan-2023.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-5pgex-jan-2023.txt.gz -------------------------------------------------------------------------------- /data/3M-february-2018.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-february-2018.txt.gz -------------------------------------------------------------------------------- /data/737K-arc-v1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/737K-arc-v1.txt.gz -------------------------------------------------------------------------------- /data/737K-august-2016.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/737K-august-2016.txt.gz -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE")) 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /data/genes_of_interest.csv: -------------------------------------------------------------------------------- 1 | COX16 2 | AAGAB 3 | CD70 4 | NOGENE -------------------------------------------------------------------------------- /data/visium-v1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/visium-v1.txt.gz -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Cell barcode & UMI identification from single-cell sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | This workflow extracts cell barcodes and UMIs from [10x](https://www.10xgenomics.com/)-generated single cell libraries. 2 | It was initially created as a Nextflow port of [Sockeye](https://github.com/nanoporetech/sockeye). 
3 | 4 | In brief, the workflow does the following: 5 | 6 | + Adapter identification, fused read splitting and stranding. 7 | + Mapping of reads to genomic reference. 8 | + Gene and transcript read assignment. 9 | + Cell barcode and UMI extraction and correction. 10 | + Generation of gene and transcript count matrices for unique UMIs. 11 | + Tagging BAM files with cell barcodes and UMIs. 12 | + Calculation of library saturation. 13 | 14 | This workflow supports the following 10x kits: 15 | + 3': v2/v3 and v4 (GEM-X) 16 | + 5': v1/v2 17 | + multiome (gene expression only): v1 18 | + visium spatial transcriptomics: v1 19 | 20 | 21 | The [BLAZE](https://github.com/shimlab/BLAZE) preprint provided useful benchmarking of the original sockeye implementation. 22 | This assisted in the selection of appropriate thresholds for cell cut-off and for defining the limits of the gene x cell matrix. 23 | 24 | The isoform selection procedure used in this workflow was adapted from that found in the [FLAMES](https://github.com/LuyiTian/FLAMES) 25 | package. 26 | -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 256GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 32 9 | + Memory = 32GB 10 | 11 | Approximate run time: Approximately 8h for 120M reads with the recommended requirements. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://docs.docker.com/get-started/) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository in to the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-single-cell --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-single-cell 38 | ``` 39 | 40 | A demo dataset is provided for testing of the workflow. 
41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/wf-single-cell-demo.tar.gz 44 | tar -xzvf wf-single-cell-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-single-cell \ 49 | --expected_cells 100 \ 50 | --fastq 'wf-single-cell-demo/chr17.fq.gz' \ 51 | --kit '3prime:v3' \ 52 | --ref_genome_dir 'wf-single-cell-demo' \ 53 | --genes_of_interest 'wf-single-cell-demo/umap_plot_genes.csv' \ 54 | -profile standard 55 | ``` 56 | 57 | For further information about running a workflow on 58 | the command line see https://labs.epi2me.io/wfquickstart/ 59 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 4 | 5 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 6 | 7 | + [Library prep and sequencing protocol for the 10x 5' kit](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/ligation-sequencing-v14-single-cell-transcriptomics-with-5-cdna/v/sst_9204_v114_revd_06mar2024) 8 | + [Library prep and sequencing protocol for the 10x 3' kit](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/single-cell-transcriptomics-with-cdna-prepared-using-10x/v/sst_9198_v114_reve_06dec2023) -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | This workflow accepts either FASTQ or BAM files as input. 3 | 4 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 5 | 6 | ``` 7 | (i) (ii) (iii) 8 | input_reads.fastq ─── input_directory ─── input_directory 9 | ├── reads0.fastq ├── barcode01 10 | └── reads1.fastq │ ├── reads0.fastq 11 | │ └── reads1.fastq 12 | ├── barcode02 13 | │ ├── reads0.fastq 14 | │ ├── reads1.fastq 15 | │ └── reads2.fastq 16 | └── barcode03 17 | └── reads0.fastq 18 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. 
In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | epi2me_resource_bundle | string | Reference genome resource bundle to automatically download. | If selected, a prebuilt 10x reference genome bundle will be automatically downloaded from the EPI2ME AWS cloud. If `call_fusions` is selected, a matched ctat-LR-fusion resource directory will also be downloaded. This overrides `ref_genome_dir`, and `ctat_resourses`. The selected resources will be automatically downloaded, on the first run, into the directory defined by the `store_dir` parameter (default `wf-single-cell_resources`). Subsequent workflow runs will use the pre-downloaded resources. | | 8 | | ref_genome_dir | string | A local path to the 10x reference directory. | The workflow requires a 10x reference directory containing sequence and annotation data. The folder should contain these files: `genes/genes.gtf`, `fasta/genome.fa`, and `fasta/genome.fa.fai` as per the 10x reference folder format. 10x reference folders can be downloaded from https://www.10xgenomics.com/support/software/cell-ranger/downloads. Alternatively, the workflow can download a limited set of prebuilt 10x references using the `epi2me_resource_bundle` parameter | | 9 | | ctat_resources | string | For fusion transcript calling. A local path to ctat-LR-fusion resource directory. | The ctat-LR-fusion resource bundle must be built against the same reference genome data as is given with `ref_genome_dir`. Resource bundles can be downloaded from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/, and instructions for building your own resources bundle can be found here: https://github.com/TrinityCTAT/ctat-genome-lib-builder. Alternatively see the `epi2me_resource_bundle` option | | 10 | | kit | string | The 10x kit and version separated by a colon (eg: 3prime:v3) | 10x kits can be released with different versions, each requiring a specific whitelist that is looked-up by the workflow. If `single_cell_sample_sheet` is not defined, the 10x kit is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied. | | 11 | | expected_cells | integer | Number of expected cells in the sample. | The number of expected cells. If `single_cell_sample_sheet` is not defined, `expected_cells` is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied. | | 12 | | estimate_cell_count | boolean | Estimate cell count from the data. | If set to true, the cell count will be estimated from the read count distribution. If set to false, the top `expected_cells` cells with highest read support will be selected. | True | 13 | | single_cell_sample_sheet | string | An optional CSV file used to assign library metadata per sample. 
If all samples have the same library metadata, this can be supplied instead by using the `--kit` and `--expected_cells` parameters. | Columns should be: [sample_id, kit, exp_cells]. This must not be confused with the MinKNOW sample_sheet. `sample_id` should correspond to `sample_name` which is defined either in the `sample_sheet`, given by the `sample` parameter (for single sample runs) or if no `sample_sheet` or `sample` is given, is derived from the folder name containing the FASTQ files. | | 14 | | full_length_only | boolean | Only process full length reads. | If set to true, only process reads or subreads that are classified as full length (read segments flanked by compatible adapters in the expected orientation). | True | 15 | | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. | | 16 | | call_fusions | boolean | Use ctat-LR-fusion to call fusion reads. | ctat-LR-fusion is a tool for calling fusions from long reads. | False | 17 | 18 | 19 | ### Sample Options 20 | 21 | | Nextflow parameter name | Type | Description | Help | Default | 22 | |--------------------------|------|-------------|------|---------| 23 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | 24 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 25 | 26 | 27 | ### Output Options 28 | 29 | | Nextflow parameter name | Type | Description | Help | Default | 30 | |--------------------------|------|-------------|------|---------| 31 | | out_dir | string | Directory for output of all workflow results. | | output | 32 | 33 | 34 | ### Advanced options 35 | 36 | | Nextflow parameter name | Type | Description | Help | Default | 37 | |--------------------------|------|-------------|------|---------| 38 | | call_variants | boolean | Call cell-level single nucleotide variants (SNV). | Call single cell variants using a longshot-based workflow. This subworkflow is computationally intensive, datasets with large numbers of cells may take a long time. | False | 39 | | report_variants | string | Display information about variants of interest in the report. | A VCF file containing variants of interest. | | 40 | | kit_config | string | A file defining the configurations associated with the various supported 10x kits. | A CSV file is expected with the following headers [kit, barcode_length, umi_length]. If not specified, a default `kit_configs.csv` (found in the project directory root) will be used. This parameter does not typically need be changed. | | 41 | | threads | integer | Number of CPU threads to use in resource intensive processes. | The total CPU resource used by the workflow is constrained by the executor configuration. | 8 | 42 | | fastq_chunk | integer | Sets the maximum number of reads per chunk for the initial processing of reads. | Controls batching of reads for processing. 
| 1000000 | 43 | barcode_adapter1_suff_length | integer | Suffix length of the read1 adapter to use in creating the probe sequence for identifying barcode/UMI bases. | For example, specifying 12 would mean that the last 12 bases of the specified read1 sequence will be included in the probe sequence. | 10 | 44 | barcode_min_quality | integer | Minimum allowed nucleotide-level quality score in the extracted/uncorrected barcode sequence. | Values equal to or higher than this will be considered 'high-quality' and used for generating the barcode whitelist. | 15 | 45 | barcode_max_ed | integer | Maximum allowable edit distance between uncorrected barcode and the best matching corrected barcode from the sample whitelist. | Barcodes are corrected by searching from a list of barcodes known to exist in the dataset. A maximum edit distance of 2 between query and whitelist barcode is recommended. | 2 | 46 | barcode_min_ed_diff | integer | Minimum allowable edit distance difference between whitelist barcode candidates. | If there is more than one candidate barcode found in the whitelist, the edit distance difference between the top hit and the second-best hit (in relation to the uncorrected barcode) must be at least this value to be able to assign a barcode. If the edit distance difference is less than this, it is assumed that the barcode identity is ambiguous, and the read is not tagged with a corrected barcode. | 2 | 47 | gene_assigns_minqv | integer | Minimum MAPQ score allowed for a read to be assigned to a gene. | | 30 | 48 | matrix_min_genes | integer | Filter cells from the gene expression matrix if they contain fewer than this many genes. | | 200 | 49 | matrix_min_cells | integer | Filter genes from the gene expression matrix that are observed in fewer than this many cells. | | 3 | 50 | matrix_max_mito | integer | Filter cells from the gene expression matrix if more than this percentage of UMI counts come from mitochondrial genes. | | 20 | 51 | matrix_norm_count | integer | Normalize the expression matrix to this many counts per cell. | | 10000 | 52 | genes_of_interest | string | File containing a list of gene symbols (one symbol per line) to annotate with expression values in the UMAP projections. If doing visium spatial analysis, these genes will be used to annotate the spatial plots. | | | 53 | mito_prefix | string | Gene name prefix used to identify mitochondrial genes. | Parts of the workflow analyse mitochondrial genes separately. These genes are identified by searching for a gene name prefix. Human mitochondrial genes can be identified with the prefix 'MT-' and mouse genes with the prefix 'mt-'. If the reference genome contains data from multiple organisms with different nomenclature, multiple prefixes can be supplied like so: 'MT-,mt-' | MT- | 54 | umap_n_repeats | integer | Number of UMAP projections to repeat for each dataset. | The UMAP algorithm contains elements of randomness that can mislead users into seeing associations between cells that are not meaningful. It is recommended to view multiple plots generated with the same parameters and check that any observed structure is consistent across runs. | 3 | 55 | stringtie_opts | string | StringTie options for transcriptome assembly. | StringTie option string can be supplied at the command line as in this example: `--stringtie_opts="-c 5 -m 100 "`. StringTie options can be found here: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual.
The default option (-c 2) ensures that only transcripts with a coverage of 2 or higher are included in the generated transcriptome | -c 2 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-single-cell-report.html | Report for all samples | aggregated | 6 | | Results summaries | {{ alias }}/{{ alias }}.config_stats.json | Results summaries including adapter configuration numbers. | per-sample | 7 | | Gene expression counts | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/matrix.mtx.gz | Gene x cell expression sparse matrix values (MEX format). | per-sample | 8 | | Gene expression barcodes | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format). | per-sample | 9 | | Gene expression features | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format). | per-sample | 10 | | Transcript expression counts | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/matrix.mtx.gz | Transcript x cell expression sparse matrix values (MEX format). | per-sample | 11 | | Transcript expression MEX barcodes | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format). | per-sample | 12 | | Transcript expression MEX features | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format). | per-sample | 13 | | Processed gene expression counts | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/matrix.mtx.gz | Filtered and normalized gene x cell expression sparse matrix values (MEX format). | per-sample | 14 | | Processed gene expression barcodes | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format) for processed matrix. | per-sample | 15 | | Processed gene expression features | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format) for processed matrix. | per-sample | 16 | | Processed transcript expression counts | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/matrix.mtx.gz | Filtered and normalized transcript x cell expression sparse matrix values (MEX format). | per-sample | 17 | | Processed transcript expression MEX barcodes | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format) for processed matrix. | per-sample | 18 | | Processed transcript expression MEX features | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format) for processed matrix. | per-sample | 19 | | Mitochondrial expression levels | {{ alias }}/{{ alias }}.gene_expression_mito_per_cell.tsv | Per cell mitochondrial gene expression as percentage total of total gene expression. | per-sample | 20 | | Read summary | {{ alias }}/{{ alias }}.read_summary.tsv | Per read assigned barcodes UMIs genes and transcripts. 
| per-sample | 21 | | Whitelist | {{ alias }}/{{ alias }}.whitelist.tsv | The barcodes found in the library that remain after filtering. | per-sample | 22 | | Alignment output per sample | {{ alias }}/{{ alias }}.tagged.bam | Genomic alignment output file. | per-sample | 23 | | Alignment index per sample | {{ alias }}/{{ alias }}.tagged.bam.bai | Genomic alignment index file. | per-sample | 24 | | Transcriptome sequence | {{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample | 25 | | Transcriptome annotation | {{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample | 26 | | Gene expression umap | {{ alias }}/{{ alias }}.gene_expression_umap_*.tsv | UMAP matrix from gene expression. Varying number of files will be present based on number of umap repeats. | per-sample | 27 | | Transcript expression umap | {{ alias }}/{{ alias }}.transcript_expression_umap_*.tsv | UMAP matrix from transcript expression. Varying number of files will be present based on number of umap repeats. | per-sample | 28 | | Barcode assignment summary | {{ alias }}/{{ alias }}.bc_assignment_summary.tsv | TSV file with barcode assignment summary statistics. | per-sample | 29 | | Single cell SNVs | {{ alias }}/{{ alias }}.final_merged.vcf.gz | VCF file containing per-barcode single nucleotide variant calls. | per-sample | 30 | | Single cell SNVs index | {{ alias }}/{{ alias }}.final_merged.vcf.gz.tbi | VCF index file. | per-sample | 31 | | Genotype matrix | {{ alias }}/{{ alias }}.genotype_matrix/matrix.mtx.gz | Sparse MEX format matrix file. | per-sample | 32 | | Genotype matrix barcodes | {{ alias }}/{{ alias }}.genotype_matrix/barcodes.tsv.gz | Sparse MEX format barcode (columns) file. | per-sample | 33 | | Genotype matrix features | {{ alias }}/{{ alias }}.genotype_matrix/features.tsv.gz | Sparse MEX format SNV ID (rows) file. | per-sample | 34 | | Per-read fusion info | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.fusion_predictions_per-read.tsv | TSV file with per-read fusion information, including gene fusion pairs and cell/UMI barcodes. | per-sample | 35 | | Fusion summary | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv | Summary of each prediciton fusion gene. | per-sample | 36 | | ctat-LR-fusion output | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.tar.gz | The complete output of ctat-LR-fusion. | per-sample | 37 | -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | 2 | + If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. 3 | + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). 4 | -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-single-cell/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 
-------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Related blog posts 4 | 5 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /docs/images/3prime_read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/docs/images/3prime_read.png -------------------------------------------------------------------------------- /docs/images/probe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/docs/images/probe.png -------------------------------------------------------------------------------- /kit_configs.csv: -------------------------------------------------------------------------------- 1 | kit,barcode_length,umi_length,bc_long_list 2 | 3prime:v2,16,10,737K-august-2016.txt.gz 3 | 3prime:v3,16,12,3M-february-2018.txt.gz 4 | 3prime:v4,16,12,3M-3pgex-may-2023.txt.gz 5 | 5prime:v1,16,10,737K-august-2016.txt.gz 6 | 5prime:v2,16,10,737K-august-2016.txt.gz 7 | 5prime:v3,16,12,3M-5pgex-jan-2023.txt.gz 8 | multiome:v1,16,12,737K-arc-v1.txt.gz 9 | visium:v1,16,12,visium-v1.txt.gz 10 | -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boileplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters with default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere. 
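 *
 * A minimal illustrative sketch (the function and parameter names here are
 * hypothetical, not taken from this workflow):
 *
 *   def my_function(Map arguments) {
 *       def parser = new ArgumentParser(
 *           args: ["input"], kwargs: ["threads": 4], name: "my_function")
 *       def opts = parser.parse_args(arguments)
 *       // opts.input is required; opts.threads falls back to the default of 4
 *   }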
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 
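 *
 * An illustrative call (the key and value here are hypothetical):
 *
 *   CWUtil.mutateParam(params, "out_dir", "results/run01")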
20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // OS 52 | // TODO check version on WSL 53 | def opsys = System.properties['os.name'].toLowerCase() 54 | def opver = System.properties['os.version'] 55 | if (opver.toLowerCase().contains("wsl")){ 56 | opsys = "wsl" 57 | } 58 | 59 | // placeholder for any future okta business 60 | // for now we'll use the guest_ sent to wf.epi2me_user 61 | def user = get_meta(params.wf, "epi2me_user") 62 | 63 | // drop cruft to save some precious bytes 64 | // affects the deep copy rather than original params 65 | clean_meta(params_data, [ 66 | "schema_ignore_params", 67 | ]) 68 | def ingress_ids = [] 69 | if (params_data.containsKey("wf")) { 70 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 71 | clean_meta(params_data.wf, [ 72 | "agent", // we send this later 73 | "epi2me_instance", // we send this later 74 | "epi2me_user", // we send this later 75 | "example_cmd", 76 | "ingress.run_ids", // we will send this elsewhere 77 | ]) 78 | } 79 | 80 | // try and get runtime information 81 | def cpus = null 82 | try { 83 | cpus = Runtime.getRuntime().availableProcessors() 84 | } 85 | catch(Exception e) {} 86 | 87 | def workflow_success = null 88 | def workflow_exitcode = null 89 | if (event != "start") { 90 | workflow_success = workflow.success 91 | workflow_exitcode = workflow.exitStatus 92 | } 93 | 94 | /// build message 95 | def body_json = new JsonBuilder() 96 | body_json \ 97 | "tracking_id": [ 98 | "msg_id": UUID.randomUUID().toString(), 99 | "version": "3.0.1" 100 | ], 101 | "source": "workflow", 102 | "event": event, 103 | "params": params_data, 104 | // data will be null on start events, as ingress has not run 105 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 106 | "workflow": [ 107 | "name": workflow.manifest.name, 108 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 109 | "run_name": workflow.runName, // required to disambiguate sessions 110 | "session": workflow.sessionId, 111 | "profile": workflow.profile, 112 | "resume": workflow.resume, 113 | "error": error_message, // null if no error 114 | "success": workflow_success, 115 | "exitcode": workflow_exitcode, 116 | ], 117 | "env": [ 118 | "user": user, // placeholder for any future okta 119 | "os": [ 120 | "name": opsys, 121 | "version": opver 122 | ], 123 | "resource": [ 124 | "cpus": cpus, 125 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 126 | ], 127 | "agent": get_meta(params.wf, "agent"), // access via original params 128 | "epi2me": [ 129 | "instance": get_meta(params.wf, "epi2me_instance"), 130 | "user": user, 131 | ], 132 | "nextflow": [ 133 | "version": nextflow.version.toString(), 134 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 135 | ] 136 | ] 137 | return body_json 138 | } 139 | 140 | // Send a JSON payload to a given endpoint 141 | private static String send_ping_post(endpoint, body_json) { 142 | // Attempt to send payload and absorb any possible Exception gracefully 143 | String postResult 144 | boolean raise_exception = false 145 | try { 146 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 147 | requestMethod = 'POST' 148 | doOutput = true 149 | setConnectTimeout(5000) 150 | setReadTimeout(10000) 151 | setRequestProperty('Content-Type', 'application/json') 152 | setRequestProperty('accept', 'application/json') 153 | outputStream.withPrintWriter({printWriter -> 154 | printWriter.write(body_json.toString()) 155 | }) 156 | 157 | // Rethrow exceptions that imply we're not using this endpoint properly 158 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 159 | raise_exception = true 160 | } 161 | // Accessing inputStream.text will raise an Exception for failed requests 162 | postResult = inputStream.text 163 | }) 164 | } 165 | catch(Exception e) { 166 | if(raise_exception) { throw e } 167 | } 168 | return (postResult) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
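// In practice the `initialise` method below drives the standard entry points:
// for example `nextflow run epi2me-labs/wf-single-cell --help` prints the
// schema-derived help text and exits, and `--version` prints the manifest
// version and exits.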
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? "--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /limitations_and_known_issues.md: -------------------------------------------------------------------------------- 1 | # wf-single-cell: Limitations and known issues 2 | 3 | 4 | ## No trimmed FASTQ output 5 | Users may want to obtained FASTQ files trimmed of adapter sequences, barcodes and UMIs for their 6 | downstream analysis, but there is not currently an option to output such files. 7 | 8 | ## No ability to filter non-full-length reads 9 | Subreads are classified as being full length if flanked by two compatible adapters. 
10 | However, at the moment this classification has no effect 11 | on whether these are further processed by the workflow. A user option to control this behaviour may be desirable. 12 | 13 | ## 10x gene expression and feature barcodes discrimination 14 | A 10x barcode whitelist containing all possible barcodes is used to cross-reference the discovered barcodes for 15 | barcode error correction. For the 3prime and multiome kits, the whitelist contains ~3M gene expression barcodes that are relevant to this workflow. 16 | However, it also contains a similar number of feature barcodes, see this [10x article](https://kb.10xgenomics.com/hc/en-us/articles/360031133451-Why-is-there-a-discrepancy-in-the-3M-february-2018-txt-barcode-whitelist-). 17 | It's not currently possible to differentiate between the two types of barcode in this whitelist. 18 | Therefore, it is possible that some error-containing gene expression barcodes may be incorrectly assigned 19 | to feature barcodes. To what extent this is happening is currently unknown. 20 | 21 | ## Gene and feature assignment discrepancy 22 | In the `assign_features` process, genes are only assigned if they have a MAPQ score greater than a user-defined threshold (default 30). 23 | However, transcripts are assigned based on alignment to a transcriptome that is built during the workflow. 24 | Transcripts are not filtered by MAPQ, but by applying some alternative heuristics based on alignment scores as well as transcript and query 25 | coverages. This can lead to cases where transcripts are called, but not genes. This will be fixed in a future version. 26 | -------------------------------------------------------------------------------- /modules/local/common.nf: -------------------------------------------------------------------------------- 1 | // Merge TSVs and sum the specified column 2 | // Currently supports only headerless inputs and summing of the second column 3 | process merge_and_publish_tsv { 4 | publishDir "${params.out_dir}/${meta.alias}", mode: 'copy' 5 | label "wf_common" 6 | cpus 1 7 | memory "2GB" 8 | input: 9 | tuple val(meta), 10 | path("inputs/input*.tsv") 11 | val(output_fname) 12 | output: 13 | tuple val(meta), 14 | path("${meta.alias}.${output_fname}") 15 | script: 16 | """ 17 | find inputs -name "*.tsv" \ 18 | -exec cat {} + \ 19 | | csvtk -t summary -H -f 2:sum -g 1 \ 20 | > "${meta.alias}.${output_fname}" 21 | """ 22 | } -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file.
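// As a sketch of such an edit (values below are illustrative only), a custom
// configuration file passed on the command line with `-c custom.config` could
// raise the resources granted to one labelled process:
//
//   process {
//       withLabel:singlecell {
//           cpus = 16
//           memory = '64 GB'
//       }
//   }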
11 | 12 | params { 13 | help = false 14 | version = false 15 | fastq = null 16 | bam = null 17 | out_dir = "output" 18 | sample_sheet = null 19 | sample = null 20 | single_cell_sample_sheet = null 21 | aws_image_prefix = null 22 | aws_queue = null 23 | disable_ping = false 24 | kit_config = null 25 | kit = null 26 | threads = 8 27 | full_length_only = true 28 | min_read_qual = null 29 | 30 | fastq_chunk = 1000000 31 | barcode_adapter1_suff_length = 10 32 | barcode_min_quality = 15 33 | barcode_max_ed = 2 34 | barcode_min_ed_diff = 2 35 | gene_assigns_minqv = 30 36 | matrix_min_genes = 200 37 | matrix_min_cells = 3 38 | matrix_max_mito = 20 39 | matrix_norm_count = 10000 40 | genes_of_interest = null 41 | umap_n_repeats = 3 42 | expected_cells = null 43 | estimate_cell_count = true 44 | mito_prefix = "MT-" 45 | stringtie_opts = "-c 2" 46 | call_variants = false 47 | report_variants = null 48 | call_fusions = false 49 | 50 | ref_genome_dir = null 51 | ctat_resources = null 52 | epi2me_resource_bundle = null 53 | 54 | monochrome_logs = false 55 | validate_params = true 56 | show_hidden_params = false 57 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf,resource_bundles' 58 | store_dir = "wf-single-cell_resources" 59 | 60 | resource_bundles = [ 61 | 'gex-GRCh38-2024-A': [ 62 | '10x': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/refdata-gex-GRCh38-2024-A.tar.gz', 63 | 'ctat-lr-fusion': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/ctat_genome_lib_10x_2024.tar.gz' 64 | ], 65 | 'gex-GRCh38-2024-A_chr_20-21': [ 66 | '10x': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/refdata-gex-GRCh38-2024-A_chr20_21.tar.gz', 67 | 'ctat-lr-fusion': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/ctat_genome_lib_chr20_21_UyHq1cFI.tar.gz' 68 | ] 69 | ] 70 | 71 | wf { 72 | example_cmd = [ 73 | "--expected_cells 100", 74 | "--fastq 'wf-single-cell-demo/chr17.fq.gz'", 75 | "--kit '3prime:v3'", 76 | "--ref_genome_dir 'wf-single-cell-demo'", 77 | "--genes_of_interest 'wf-single-cell-demo/umap_plot_genes.csv'", 78 | ] 79 | merge_threads = 24 80 | fusion_threads = 12 81 | container_sha = "shab5b0dea0efc4685f74c8b4f91c979c587e23a020" 82 | common_sha = "sha1c69fd30053aad5d516e9567b3944384325a0fee" 83 | } 84 | } 85 | 86 | manifest { 87 | name = 'epi2me-labs/wf-single-cell' 88 | author = 'Oxford Nanopore Technologies' 89 | homePage = 'https://github.com/epi2me-labs/wf-single-cell' 90 | description = 'Identification of cell- and UMI barcodes from single-cell sequencing.' 91 | mainScript = 'main.nf' 92 | nextflowVersion = '>=23.04.2' 93 | version = '3.2.0' 94 | } 95 | 96 | epi2melabs { 97 | tags = 'wf-single-cell,transcriptomics,human,mouse' 98 | icon = 'faCircle' 99 | } 100 | 101 | // used by default for "standard" (docker) and singularity profiles, 102 | // other profiles may override. 
103 | process { 104 | withLabel:singlecell { 105 | container = "ontresearch/wf-single-cell:${params.wf.container_sha}" 106 | } 107 | withLabel:wf_common { 108 | container = "ontresearch/wf-common:${params.wf.common_sha}" 109 | } 110 | withLabel:ctat_lr_fusion { 111 | container = "trinityctat/ctat_lr_fusion:1.1.0" 112 | } 113 | shell = ['/bin/bash', '-euo', 'pipefail'] 114 | } 115 | 116 | 117 | profiles { 118 | // the "standard" profile is used implicitely by nextflow 119 | // if no other profile is given on the CLI 120 | standard { 121 | docker { 122 | enabled = true 123 | // this ensures container is run as host user and group, but 124 | // also adds host user to the within-container group 125 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 126 | } 127 | } 128 | 129 | // using singularity instead of docker 130 | singularity { 131 | singularity { 132 | enabled = true 133 | autoMounts = true 134 | } 135 | } 136 | 137 | conda { 138 | conda.enabled = true 139 | } 140 | 141 | // Using AWS batch. 142 | // May need to set aws.region and aws.batch.cliPath 143 | awsbatch { 144 | process { 145 | executor = 'awsbatch' 146 | queue = "${params.aws_queue}" 147 | memory = '8G' 148 | withLabel:singlecell { 149 | container = "${params.aws_image_prefix}-wf-single-cell:${params.wf.container_sha}" 150 | } 151 | withLabel:wf_common { 152 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 153 | } 154 | shell = ['/bin/bash', '-euo', 'pipefail'] 155 | } 156 | } 157 | 158 | // local profile for simplified development testing 159 | local { 160 | process.executor = 'local' 161 | } 162 | } 163 | 164 | 165 | timeline { 166 | enabled = true 167 | overwrite = true 168 | file = "${params.out_dir}/execution/timeline.html" 169 | } 170 | report { 171 | enabled = true 172 | overwrite = true 173 | file = "${params.out_dir}/execution/report.html" 174 | } 175 | trace { 176 | enabled = true 177 | overwrite = true 178 | file = "${params.out_dir}/execution/trace.txt" 179 | } 180 | 181 | env { 182 | PYTHONNOUSERSITE = 1 183 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 184 | NUMBA_CACHE_DIR = "./numba_cache_dir" 185 | } 186 | -------------------------------------------------------------------------------- /subworkflows/fusions.nf: -------------------------------------------------------------------------------- 1 | process get_ctat_data { 2 | label "wf_common" 3 | cpus 1 4 | memory "2 GB" 5 | storeDir {params.store_dir ? "${params.store_dir}/${name}" : null } 6 | input: 7 | val name 8 | val url 9 | output: 10 | path "${name}", emit: resource_dir 11 | script: 12 | """ 13 | wget -qO- $url \ 14 | | tar --no-same-owner -xzv --one-top-level=${name} --strip-component=1 15 | """ 16 | } 17 | 18 | process find_fusions { 19 | /* 20 | Run ctat-LR-fusion to find fusion reads. 
21 | */ 22 | label "ctat_lr_fusion" 23 | cpus params.wf.fusion_threads 24 | memory '16 GB' 25 | publishDir "${params.out_dir}/${meta.alias}/fusions", mode: 'copy', pattern: "${meta.alias}.ctat-LR-fusion.tar.gz" 26 | input: 27 | tuple val(meta), 28 | path("tagged.bam") 29 | path("ctat_reference_bundle") 30 | output: 31 | tuple val(meta), 32 | path("${meta.alias}.ctat-LR-fusion.tar.gz"), 33 | emit: gzipped_ctat_dir 34 | tuple val(meta), 35 | path(fusion_preds), 36 | emit: ctat_fusion_predictions 37 | stdout emit: stdout 38 | script: 39 | String ctat_outdir = "fusions" 40 | // Expected main output file 41 | // This will be present if no fusion candidates are verified (header only) 42 | // or absent if no fusion candidates are found 43 | fusion_preds = "${ctat_outdir}/ctat-LR-fusion.fusion_predictions.tsv" 44 | Integer threads = Math.max(2, task.cpus - 2) 45 | """ 46 | ctat-LR-fusion \ 47 | --LR_bam tagged.bam \ 48 | --genome_lib_dir ./ctat_reference_bundle \ 49 | --CPU ${threads} --vis --output ${ctat_outdir} 50 | 51 | if [ ! -f "${fusion_preds}" ]; then 52 | echo "No fusion candidates found for ${meta.alias}" 53 | # Create an empty file for expected output 54 | touch "${ctat_outdir}/ctat-LR-fusion.fusion_predictions.tsv" 55 | else 56 | n=\$(tail -n +2 "${fusion_preds}" | wc -l) 57 | if [ "\$n" -eq 0 ]; then 58 | echo "Fusion candidates found for ${meta.alias} but none passed filters" 59 | fi 60 | fi 61 | tar -czf "${meta.alias}.ctat-LR-fusion.tar.gz" ${ctat_outdir} 62 | """ 63 | } 64 | 65 | 66 | process format_ctat_output { 67 | label "singlecell" 68 | cpus 1 69 | memory '2 GB' 70 | publishDir "${params.out_dir}/${meta.alias}/fusions", mode: 'copy', pattern: "${meta.alias}.ctat-LR-fusion.fusion_predictions_per*" 71 | input: 72 | tuple val(meta), 73 | path("ctat-LR-fusion.fusion_predictions.tsv"), 74 | path("read_summary_tags.tsv") 75 | output: 76 | tuple val(meta), 77 | path("${meta.alias}.ctat-LR-fusion.fusion_predictions_per-read.tsv"), 78 | emit: read_summary 79 | tuple val(meta), 80 | path("${meta.alias}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv"), 81 | emit: fusion_summary 82 | tuple val(meta), 83 | path("${meta.alias}.ctat-LR-fusion.fusion_summary.tsv"), 84 | emit: cell_summary 85 | script: 86 | """ 87 | workflow-glue format_ctat_output \ 88 | ctat-LR-fusion.fusion_predictions.tsv \ 89 | read_summary_tags.tsv \ 90 | "${meta.alias}.ctat-LR-fusion.fusion_predictions_per-read.tsv" \ 91 | "${meta.alias}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv" \ 92 | "${meta.alias}.ctat-LR-fusion.fusion_summary.tsv" \ 93 | ${meta.alias} 94 | """ 95 | } 96 | 97 | 98 | 99 | workflow ctat_lr_fusion { 100 | take: 101 | tagged_bam_and_summary 102 | ctat_reference_bundle 103 | main: 104 | find_fusions( 105 | tagged_bam_and_summary.map{meta, bam, _bai, _read_summary -> [meta, bam]}, 106 | ctat_reference_bundle) 107 | 108 | find_fusions.out.stdout.map {stdout -> 109 | if (stdout) { 110 | log.warn(stdout) 111 | } 112 | } 113 | 114 | format_ctat_output( 115 | find_fusions.out.ctat_fusion_predictions 116 | .join(tagged_bam_and_summary 117 | .map {meta, _bam, _bai, read_summary -> [meta, read_summary]}) 118 | ) 119 | 120 | emit: 121 | read_summary = format_ctat_output.out.read_summary 122 | fusion_summary = format_ctat_output.out.fusion_summary 123 | cell_summary = format_ctat_output.out.cell_summary 124 | 125 | } -------------------------------------------------------------------------------- /subworkflows/preprocess.nf: -------------------------------------------------------------------------------- 
 1 | process call_paftools {
 2 |     label "singlecell"
 3 |     memory "2 GB"
 4 |     cpus 1
 5 |     input:
 6 |         path "ref_genes.gtf"
 7 |     output:
 8 |         path "ref_genes.bed", emit: ref_genes_bed
 9 |     """
10 |     paftools.js gff2bed -j ref_genes.gtf > ref_genes.bed
11 |     """
12 | }
13 | 
14 | 
15 | process build_minimap_index {
16 |     /*
17 |     Build minimap index from reference genome
18 |     */
19 |     label "singlecell"
20 |     cpus params.threads
21 |     memory '16 GB'
22 |     input:
23 |         path "reference.fa"
24 |     output:
25 |         path "genome_index.mmi", emit: index
26 |     script:
27 |     """
28 |     minimap2 -t ${task.cpus} -I 16G -d "genome_index.mmi" "reference.fa"
29 |     """
30 | }
31 | 
32 | 
33 | process call_adapter_scan {
34 |     label "singlecell"
35 |     cpus params.threads
36 |     // memory here is taken by minimap2. Having merged the three steps into one,
37 |     // we have perhaps reduced parallelism in the workflow, because in some setups
38 |     // it might be the case that multiple tasks of the first two steps cannot now run
39 |     // in parallel. The resolution to that would be to parallelise the first two
40 |     // steps better. The advantage here is not having to write to disk, stage files
41 |     // and read from disk between the steps (creating a lot of big temporary files).
42 |     //
43 |     // peak RSS for aligning this data is robustly <12.4 GB with a human reference. Set
44 |     // a little more and allow a retry.
45 |     memory {15.GB * task.attempt}
46 |     maxRetries 1
47 |     errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
48 |     input:
49 |         tuple val(meta), path(chunk, stageAs: 'chunk.fq.gz')
50 |         path "bc_longlist_dir"
51 |         path "genome_index.mmi"
52 |         path "ref_genes.bed"
53 |     output:
54 |         tuple val(meta), path("adapters.json"), emit: adapter_summary
55 |         tuple val(meta), path("read_tags.tsv"), emit: read_tags
56 |         tuple val(meta), path("high_quality_bc_counts.tsv"), emit: barcode_counts
57 |         tuple val(meta), path("sorted.bam"), path("sorted.bam.bai"), emit: bam_sort
58 |         tuple val(meta), path("bamstats.tsv"), emit: bam_stats
59 |     script:
60 |     def fl = params.full_length_only ? "--keep_fl_only": ""
61 |     // alignment is the real bottleneck here, don't worry about threads
62 |     // for sorting. Just subtract 1 thread as loose bookkeeping. Note the
63 |     // hidden call to vsearch in the first program: the pipe doesn't get
64 |     // going until that's finished. vsearch appears to use all the threads
65 |     // it can get.
66 |     // We set the -K option for minimap2 as the default appears too large to
67 |     // stream data effectively (with the defaults it just blocks, waiting for
68 |     // more data). The effectiveness of this is not clear.
69 | def mm2_threads = task.cpus - 1 70 | """ 71 | export POLARS_MAX_THREADS=$task.cpus 72 | 73 | workflow-glue adapter_scan_vsearch \ 74 | chunk.fq.gz \ 75 | --kit ${meta['kit_name']} \ 76 | --summary "adapters.json" \ 77 | ${fl} \ 78 | | workflow-glue extract_barcode \ 79 | - \ 80 | bc_longlist_dir/${meta['bc_long_list']} \ 81 | --kit ${meta["kit_name"]} \ 82 | --adapter1_suff_length $params.barcode_adapter1_suff_length \ 83 | --min_barcode_qv $params.barcode_min_quality \ 84 | --barcode_length ${meta['barcode_length']} \ 85 | --umi_length ${meta['umi_length']} \ 86 | --output_read_tags "bc_extract.tsv" \ 87 | --output_barcode_counts "high_quality_bc_counts.tsv" \ 88 | | minimap2 -ax splice -uf --MD \ 89 | -t $mm2_threads -K 10M \ 90 | --junc-bed ref_genes.bed \ 91 | --cap-kalloc 100m \ 92 | genome_index.mmi - \ 93 | | samtools view -uh --no-PG - \ 94 | | tee >(seqkit bam -s 2> bamstats.tsv ) \ 95 | | tee >(samtools view - -d SA \ 96 | | awk 'BEGIN{OFS="\t"; print "read_id", "SA"} {print \$1,"True"}' > SA_tags.tsv ) \ 97 | | samtools view -uh -F 256 - \ 98 | | tee >(samtools sort --write-index -o "sorted.bam"##idx##"sorted.bam.bai" --no-PG -) \ 99 | | seqkit bam -F - 2> bam_info.tsv 100 | 101 | # TODO: improve this with pipes? 102 | csvtk cut -tlf Read,Pos,EndPos,Ref,MapQual bam_info.tsv > bam_info_cut.tsv 103 | # Left join of barcode 104 | csvtk join -tlf 1 bam_info_cut.tsv bc_extract.tsv --left-join \ 105 | | csvtk rename -tl -f Read,Pos,EndPos,Ref,MapQual -n read_id,start,end,chr,mapq -o read_tags_interim.tsv 106 | 107 | # Merge the SA column with the read tags on read_id 108 | if [ \$(wc -l < SA_tags.tsv) -eq 1 ]; then 109 | echo "No SA tags found" 110 | # Add an empty SA column 111 | csvtk mutate2 -t -n 'SA' -e " '' " read_tags_interim.tsv > read_tags.tsv 112 | else 113 | csvtk -t uniq SA_tags.tsv | csvtk join -t --left-join --fields read_id read_tags_interim.tsv - > read_tags.tsv 114 | fi 115 | rm bam_info.tsv bam_info_cut.tsv bc_extract.tsv read_tags_interim.tsv 116 | """ 117 | } 118 | 119 | 120 | process summarize_adapter_table { 121 | label "singlecell" 122 | publishDir "${params.out_dir}/${meta.alias}", mode: 'copy' 123 | cpus 1 124 | memory "1 GB" 125 | input: 126 | tuple val(meta), path("inputs/summary*.json") 127 | output: 128 | tuple val(meta), path("${meta.alias}.config_stats.json"), emit: config_stats 129 | """ 130 | workflow-glue summarise_adapters inputs "${meta.alias}.config_stats.json" 131 | """ 132 | } 133 | 134 | 135 | // workflow module 136 | workflow preprocess { 137 | take: 138 | read_chunks 139 | bc_longlist_dir 140 | ref_genome_fasta 141 | ref_genome_idx 142 | ref_genes_gtf 143 | main: 144 | // alignment pre-requisites 145 | call_paftools(ref_genes_gtf) 146 | build_minimap_index(ref_genome_fasta) 147 | 148 | // find adapters, trim barcodes, and align 149 | call_adapter_scan( 150 | read_chunks, 151 | bc_longlist_dir, 152 | build_minimap_index.out.index, 153 | call_paftools.out.ref_genes_bed) 154 | 155 | // TODO: we don't necessarily need to merge these, they 156 | // could just be given to the final reporting 157 | // without pre-aggregating 158 | summarize_adapter_table( 159 | call_adapter_scan.out.adapter_summary.groupTuple()) 160 | 161 | emit: 162 | bam_sort = call_adapter_scan.out.bam_sort 163 | bam_stats = call_adapter_scan.out.bam_stats 164 | read_tags = call_adapter_scan.out.read_tags 165 | high_qual_bc_counts = call_adapter_scan.out.barcode_counts 166 | adapter_summary = call_adapter_scan.out.adapter_summary 167 | } 168 | 
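A note on the tag-merging step in `call_adapter_scan` above: `csvtk join --left-join` is
used so that reads without supplementary alignments are still kept in `read_tags.tsv`,
just with an empty `SA` field. A minimal sketch of those join semantics, using made-up
read IDs and a reduced set of columns (not data from the workflow):

    printf 'read_id\tchr\tmapq\nread01\tchr1\t60\nread02\tchr2\t35\n' > read_tags_interim.tsv
    printf 'read_id\tSA\nread02\tTrue\n' > SA_tags.tsv
    # the left join keeps both reads; read01 simply gets an empty SA value
    csvtk join -t --left-join --fields read_id read_tags_interim.tsv SA_tags.tsv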
-------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Pytests argument definitions.""" 3 | 4 | 5 | def pytest_addoption(parser): 6 | """Define command line arguments for pytest.""" 7 | parser.addoption( 8 | "--wf_out_dir", 9 | action='store', 10 | default='/host/wf-single-cell' 11 | ) 12 | parser.addoption( 13 | "--sample_id", 14 | action="store", 15 | default="sample1" 16 | ) 17 | -------------------------------------------------------------------------------- /test/test_ingress.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import chain 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | import util 11 | 12 | 13 | ROOT_DIR = Path(__file__).resolve().parent.parent 14 | 15 | 16 | def args(): 17 | """Parse and process input arguments. Use the workflow params for those missing.""" 18 | # get the path to the workflow output directory 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "--input", 22 | help=( 23 | "Path to input file / directory with input files / directory with " 24 | "sub-directories with input files; will take input path from workflow " 25 | "output if not provided" 26 | ), 27 | ) 28 | parser.add_argument( 29 | "--type", 30 | choices=util.INPUT_TYPES_EXTENSIONS.keys(), 31 | help="Input file type", 32 | required=True, 33 | ) 34 | parser.add_argument( 35 | "--wf-output-dir", 36 | default=ROOT_DIR / "output", 37 | help=( 38 | "path to the output directory where the workflow results have been " 39 | "published; defaults to 'output' in the root directory of the workflow if " 40 | "not provided" 41 | ), 42 | ) 43 | parser.add_argument( 44 | "--sample_sheet", 45 | help=( 46 | "Path to sample sheet CSV file. If not provided, will take sample sheet " 47 | "path from workflow params (if available)." 48 | ), 49 | ) 50 | parser.add_argument( 51 | "--chunk", type=int, 52 | help=( 53 | "Chunk size for output fastq." 54 | ) 55 | ) 56 | args = parser.parse_args() 57 | 58 | input_type = args.type 59 | wf_output_dir = Path(args.wf_output_dir) 60 | ingress_results_dir = ( 61 | wf_output_dir / f"{'xam' if input_type == 'bam' else 'fastq'}_ingress_results" 62 | ) 63 | 64 | # make sure that there are ingress results (i.e. that the workflow has been 65 | # run successfully and that the correct wf output path was provided) 66 | if not ingress_results_dir.exists(): 67 | raise ValueError( 68 | f"{ingress_results_dir} does not exist. Has `wf-template` been run?" 
69 | ) 70 | 71 | # get the workflow params 72 | with open(wf_output_dir / "params.json", "r") as f: 73 | params = json.load(f) 74 | input_path = ( 75 | Path(args.input) if args.input is not None else ROOT_DIR / params[input_type] 76 | ) 77 | sample_sheet = args.sample_sheet 78 | if sample_sheet is None and params["sample_sheet"] is not None: 79 | sample_sheet = ROOT_DIR / params["sample_sheet"] 80 | 81 | # Define output type 82 | output_type = input_type 83 | if params["wf"]["return_fastq"]: 84 | output_type = "fastq" 85 | 86 | if not input_path.exists(): 87 | raise ValueError(f"Input path '{input_path}' does not exist.") 88 | 89 | return input_path, input_type, output_type, sample_sheet, ingress_results_dir, args.chunk, params 90 | 91 | 92 | # prepare data for the tests 93 | @pytest.fixture(scope="module") 94 | def prepare(): 95 | """Prepare data for tests.""" 96 | input_path, input_type, output_type, sample_sheet, ingress_results_dir, chunk_size, params = args() 97 | valid_inputs = util.get_valid_inputs(input_path, input_type, sample_sheet, chunk_size, params) 98 | return ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params 99 | 100 | 101 | # define tests 102 | def test_result_subdirs(prepare): 103 | """ 104 | Test if workflow results dir contains all expected samples. 105 | 106 | Tests if the published sub-directories in `ingress_results_dir` contain all 107 | the samples we expect. 108 | """ 109 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 110 | files = [x for x in ingress_results_dir.iterdir() if x.is_file()] 111 | subdirs = [x.name for x in ingress_results_dir.iterdir() if x.is_dir()] 112 | assert not files, "Files found in top-level dir of ingress results" 113 | assert set(subdirs) == set([meta["alias"] for meta, _ in valid_inputs]) 114 | 115 | 116 | def test_entry_names_and_run_ids(prepare): 117 | """Test sequence names and run IDs. 118 | 119 | Tests if the concatenated sequences indeed contain all the read IDs of the target 120 | files in the valid inputs. 121 | """ 122 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 123 | # unless when run with `--bam ... --wf.return_fastq` the output type (i.e. 
the type 124 | # of the files returned by ingress) is the same as the input type 125 | for meta, path in valid_inputs: 126 | if path is None: 127 | # this sample sheet entry had no input dir (or no reads) 128 | continue 129 | # get entries in the result file produced by the workflow 130 | if chunk_size is not None: 131 | res_seqs_fname = "" 132 | elif output_type == "fastq": 133 | res_seqs_fname = "seqs.fastq.gz" 134 | elif output_type == "bam": 135 | res_seqs_fname = "reads.bam" 136 | else: 137 | raise ValueError(f"Unknown output_type: {output_type}.") 138 | 139 | entries = util.create_preliminary_meta( 140 | ingress_results_dir / meta["alias"] / res_seqs_fname, 141 | output_type, chunk_size, 142 | params["wf"]["return_fastq"]) 143 | 144 | # now collect the entries from the individual input files 145 | exp_read_names = [] 146 | exp_run_ids = [] 147 | target_files = ( 148 | util.get_target_files(path, input_type=input_type) 149 | if path.is_dir() 150 | else [path] 151 | ) 152 | for file in target_files: 153 | if ( 154 | input_type == "bam" 155 | and not params["wf"]["keep_unaligned"] 156 | and util.is_unaligned(file) 157 | ): 158 | continue 159 | curr_entries = util.create_preliminary_meta( 160 | file, input_type, chunk_size, False) 161 | exp_read_names += curr_entries["names"] 162 | exp_run_ids += curr_entries["run_ids"] 163 | assert set(entries["names"]) == set(exp_read_names) 164 | assert set(entries["run_ids"]) == set(exp_run_ids) 165 | 166 | 167 | def test_stats_present(prepare): 168 | """Tests if the `fastcat` stats are present when they should be.""" 169 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 170 | for meta, path in valid_inputs: 171 | if path is None: 172 | # this sample sheet entry had no input dir (or no reads) 173 | continue 174 | if output_type == "fastq": 175 | expect_stats = params["wf"]["fastcat_stats"] 176 | stats_dir_name = "fastcat_stats" 177 | stats_file_names = [ 178 | "per-file-stats.tsv", 179 | "per-read-stats.tsv.gz", 180 | "run_ids", 181 | "length.hist", 182 | "quality.hist" 183 | ] 184 | else: 185 | # `bamstats` we only expect when they were requested 186 | expect_stats = params["wf"]["bamstats"] 187 | stats_dir_name = "bamstats_results" 188 | stats_file_names = [ 189 | "bamstats.readstats.tsv.gz", 190 | "bamstats.flagstat.tsv", 191 | "run_ids", 192 | "accuracy.hist", 193 | "coverage.hist", 194 | "length.hist", 195 | "quality.hist" 196 | ] 197 | stats_dir = ingress_results_dir / meta["alias"] / stats_dir_name 198 | # assert that stats are there when we expect them 199 | assert expect_stats == stats_dir.exists() 200 | # make sure that the per-file stats, per-read stats, and run ID files are there 201 | if expect_stats: 202 | for fname in stats_file_names: 203 | assert ( 204 | ingress_results_dir / meta["alias"] / stats_dir_name / fname 205 | ).is_file() 206 | 207 | 208 | def test_metamap(prepare): 209 | """Test if the metamap in the ingress results is as expected.""" 210 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 211 | for meta, _ in valid_inputs: 212 | # prepare() uses a function to parse both inputs and outputs, 213 | # add in some output specific things 214 | meta = util.add_output_n_fastq(meta, output_type, chunk_size) 215 | sample_results = ingress_results_dir / meta["alias"] 216 | 217 | # if there were no stats, we can't expect run IDs in the metamap 218 | if not list(sample_results.glob("*stats*/run_ids")): 219 | meta["run_ids"] = [] 220 | # if there are not 
stats, reset extra fields to defaults 221 | # could not be stats where barcodes are in sample sheet but not in data. 222 | if output_type == "fastq": 223 | meta["n_seqs"] = None 224 | elif output_type == "bam": 225 | meta["n_primary"] = None 226 | meta["n_unmapped"] = None 227 | 228 | # read what nextflow had 229 | with open(sample_results / "metamap.json", "r") as f: 230 | metamap = json.load(f) 231 | assert meta == metamap 232 | 233 | 234 | def test_reads_sorted(prepare): 235 | """If input type is BAM, test if the emitted files were sorted.""" 236 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 237 | if input_type == "fastq": 238 | return 239 | for meta, _ in valid_inputs: 240 | stats_file = ( 241 | ingress_results_dir 242 | / meta["alias"] 243 | / "bamstats_results" 244 | / "bamstats.readstats.tsv.gz" 245 | ) 246 | if stats_file.exists(): 247 | stats_df = pd.read_csv(stats_file, sep="\t", index_col=0) 248 | # check that the start coordinates of all aligned reads are sorted within 249 | # their respective reference 250 | assert ( 251 | stats_df.query('ref != "*"') 252 | .groupby("ref")["rstart"] 253 | .is_monotonic_increasing.all() 254 | ) 255 | 256 | 257 | def test_reads_index(prepare): 258 | """If input type is BAM, check that the BAI index exists.""" 259 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 260 | if output_type == "fastq": 261 | return 262 | for meta, path in valid_inputs: 263 | if path is None: 264 | # this sample sheet entry had no input dir (or no reads) 265 | continue 266 | # Create BAI file path 267 | bai_file = ( 268 | ingress_results_dir 269 | / meta["alias"] 270 | / 'reads.bam.bai' 271 | ) 272 | if not bai_file.is_file(): 273 | raise ValueError(f"Missing index: {bai_file.as_posix()}.") 274 | 275 | 276 | if __name__ == "__main__": 277 | # trigger pytest 278 | ret_code = pytest.main([Path(__file__).resolve(), "-vv", "-s"]) 279 | sys.exit(ret_code) 280 | -------------------------------------------------------------------------------- /test/workflow_integration.py: -------------------------------------------------------------------------------- 1 | """Integration testing of the whole workflow using synthetic data.""" 2 | 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from pytest import fixture 7 | 8 | 9 | @fixture 10 | def wf_out_dir(request): 11 | """Set workflow directory.""" 12 | return request.config.getoption('--wf_out_dir') 13 | 14 | 15 | @fixture 16 | def sample_id(request): 17 | """Set sample ID.""" 18 | return request.config.getoption('--sample_id') 19 | 20 | 21 | def test_workflow(wf_out_dir, sample_id): 22 | """Test the whole Nextflow workflow.""" 23 | out_dir = Path(wf_out_dir) 24 | test_out_dir = out_dir / sample_id 25 | read_tags = test_out_dir / 'sample1.read_summary.tsv' 26 | 27 | assert read_tags.is_file() 28 | 29 | df = pd.read_csv(read_tags, sep='\t') 30 | 31 | # As all reads should be assigned a barcode and UMI, there should be the 32 | # same number of output rows as in reads in the integration test data (1850). 33 | assert len(df) == 1850 34 | 35 | # Extract the expected values from the read_id 36 | df[['true_gene', 'true_transcript', 'true_bc', 'true_umi', 'true_status', '_']] \ 37 | = df['read_id'].str.split('|', expand=True) 38 | 39 | # Check barcode and umis are correctly identified. 
Allow at most one mismatch in each.
40 |     df_barcode_mismatches = df[df.corrected_barcode != df.true_bc]
41 |     assert len(df_barcode_mismatches) < 2
42 | 
43 |     df_umi_mismatches = df[df.true_umi != df.corrected_umi]
44 |     assert len(df_umi_mismatches) < 2
45 | 
46 |     # Check gene assignment
47 |     df_gene_matches = df[df.gene == df.true_gene]
48 |     perc_correct = 100 / len(df) * len(df_gene_matches)
49 |     assert perc_correct == 100.0
50 | 
51 |     # Check transcript assignment
52 |     # We should be getting more than 85% of the transcripts correctly called,
53 |     # especially on this contrived synthetic dataset.
54 |     df_tr_matches = df[df.transcript == df.true_transcript]
55 |     perc_correct = 100 / len(df) * len(df_tr_matches)
56 |     assert perc_correct > 85.0
57 | 
--------------------------------------------------------------------------------
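A hedged usage sketch (paths and values are illustrative, not fixed): once the workflow
has been run and its outputs published, the integration test above can be pointed at the
results via the pytest options registered in `test/conftest.py`:

    # assumes results were published to ./output/sample1
    pytest test/workflow_integration.py -vv \
        --wf_out_dir output \
        --sample_id sample1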