├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── adapter_scan_vsearch.py │ ├── assign_barcodes.py │ ├── assign_features.py │ ├── calc_saturation.py │ ├── clip_depth.py │ ├── create_matrix.py │ ├── create_shortlist.py │ ├── create_umap.py │ ├── expression_matrix.py │ ├── extract_barcode.py │ ├── format_ctat_output.py │ ├── models │ ├── __init__.py │ └── common.py │ ├── parse_kit_metadata.py │ ├── prepare_report_data.py │ ├── process_matrix.py │ ├── report.py │ ├── sc_util.py │ ├── summarise_adapters.py │ ├── tag_bam.py │ ├── tests │ ├── __init__.py │ ├── test_adapter_scan_vsearch.py │ ├── test_assign_barcodes.py │ ├── test_assign_features.py │ ├── test_calc_saturation.py │ ├── test_cluster_umis.py │ ├── test_expression_matrix.py │ ├── test_extract_barcode.py │ ├── test_format_ctat_output.py │ └── test_tag_bam.py │ ├── util.py │ ├── variant_mex.py │ └── wfg_helpers │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ └── reheader_samstream.py ├── data ├── 3M-3pgex-may-2023.txt.gz ├── 3M-5pgex-jan-2023.txt.gz ├── 3M-february-2018.txt.gz ├── 737K-arc-v1.txt.gz ├── 737K-august-2016.txt.gz ├── OPTIONAL_FILE ├── genes_of_interest.csv ├── visium-v1.txt.gz └── visium-v1_coordinates.txt ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md ├── 11_other.md └── images │ ├── 3prime_read.png │ └── probe.png ├── kit_configs.csv ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── limitations_and_known_issues.md ├── main.nf ├── modules └── local │ └── common.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows ├── fusions.nf ├── preprocess.nf ├── process_bams.nf └── snv.nf └── test ├── conftest.py ├── test_ingress.py └── workflow_integration.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 
20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 
128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | .DS_store 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.58 12 | - repo: https://github.com/pycqa/flake8 13 | rev: 5.0.4 14 | hooks: 15 | - id: flake8 16 | pass_filenames: false 17 | additional_dependencies: 18 | - flake8-rst-docstrings 19 | - flake8-docstrings 20 | - flake8-import-order 21 | - flake8-forbid-visual-indent 22 | - pep8-naming 23 | - flake8-no-types 24 | - flake8-builtins 25 | - flake8-absolute-import 26 | - flake8-print 27 | # avoid snowballstemmer>=3.0 as it causes flake8-docstrings to stop working [CW-6098] 28 | - snowballstemmer==2.2.0 29 | args: [ 30 | "bin", 31 | "--import-order-style=google", 32 | "--statistics", 33 | "--max-line-length=88", 34 | "--per-file-ignores=bin/workflow_glue/models/*:NT001", 35 | ] 36 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Entry point for sc_tools (single_cell_tools).""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == '__main__': 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import itertools 6 | import os 7 | import sys 8 | 9 | from .util import _log_level, get_main_logger # noqa: ABS101 10 | 11 | 12 | __version__ = "0.0.1" 13 | _package_name = "workflow_glue" 14 | 15 | HELPERS = "wfg_helpers" 16 | 17 | 18 | def get_components(allowed_components=None): 19 | """Find a list of workflow command scripts.""" 20 | logger = get_main_logger(_package_name) 21 | 22 | # gather all python files in the current directory and the wfg_helpers 23 | home_path = os.path.dirname(os.path.abspath(__file__)) 24 | standard_lib = os.path.join(home_path, HELPERS) 25 | globs = itertools.chain.from_iterable(( 26 | glob.glob(os.path.join(path, "*.py")) 27 | for path in (home_path, standard_lib))) 28 | 29 | components = dict() 30 | for fname in globs: 31 | name = os.path.splitext(os.path.basename(fname))[0] 32 | if name in ("__init__", "util"): 33 | continue 34 | if allowed_components is not None and name not in allowed_components: 35 | continue 36 | 37 | # leniently attempt to import module 38 | try: 39 | if HELPERS in fname: 40 | mod = importlib.import_module(f"{_package_name}.{HELPERS}.{name}") 41 | else: 42 | mod = importlib.import_module(f"{_package_name}.{name}") 43 | 
except ModuleNotFoundError as e: 44 | # if imports cannot be satisfied, refuse to add the component 45 | # rather than exploding 46 | logger.warn(f"Could not load {name} due to missing module {e.name}") 47 | continue 48 | 49 | # if there's a main() and an argparser(), that's good enough for us. 50 | try: 51 | req = "main", "argparser" 52 | if all(callable(getattr(mod, x)) for x in req): 53 | components[name] = mod 54 | except Exception: 55 | pass 56 | return components 57 | 58 | 59 | def cli(): 60 | """Run workflow entry points.""" 61 | logger = get_main_logger(_package_name) 62 | logger.info("Bootstrapping CLI.") 63 | parser = argparse.ArgumentParser( 64 | 'wf-glue', 65 | parents=[_log_level()], 66 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 67 | 68 | parser.add_argument( 69 | '-v', '--version', action='version', 70 | version='%(prog)s {}'.format(__version__)) 71 | 72 | subparsers = parser.add_subparsers( 73 | title='subcommands', description='valid commands', 74 | help='additional help', dest='command') 75 | subparsers.required = True 76 | 77 | # importing everything can take time, try to shortcut 78 | if len(sys.argv) > 1: 79 | components = get_components(allowed_components=[sys.argv[1]]) 80 | if not sys.argv[1] in components: 81 | logger.warn("Importing all modules, this may take some time.") 82 | components = get_components() 83 | else: 84 | components = get_components() 85 | 86 | # add all module parsers to main CLI 87 | for name, module in components.items(): 88 | p = subparsers.add_parser( 89 | name.split(".")[-1], parents=[module.argparser()]) 90 | p.set_defaults(func=module.main) 91 | 92 | args = parser.parse_args() 93 | 94 | logger.info("Starting entrypoint.") 95 | args.func(args) 96 | -------------------------------------------------------------------------------- /bin/workflow_glue/assign_barcodes.py: -------------------------------------------------------------------------------- 1 | """Assign barcodes. 2 | 3 | Given a whitelist of barcodes, assign raw barcodes to the nearest match. 4 | """ 5 | import collections 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | import rapidfuzz 10 | from rapidfuzz.process import extract 11 | 12 | from .util import get_named_logger, wf_parser # noqa: ABS101 13 | 14 | logger = get_named_logger("AsgnBrcdes") 15 | 16 | 17 | def argparser(): 18 | """Create argument parser.""" 19 | parser = wf_parser("assign_barcodes") 20 | 21 | parser.add_argument( 22 | "whitelist", type=Path, 23 | help="File containing list of expected cell barcodes.") 24 | 25 | parser.add_argument( 26 | "barcode_tags", type=Path, 27 | help="TSV file of read_id, uncorrected_barcode, qscores.") 28 | 29 | parser.add_argument( 30 | "output_tags", type=Path, 31 | help="Output TSV containing columns from `barcode_tags` \ 32 | and an additional CB (corrected barcode) column.") 33 | 34 | parser.add_argument( 35 | "output_counts", type=Path, 36 | help="Output TSV file containing counts for each of the assigned \ 37 | barcodes.") 38 | 39 | parser.add_argument( 40 | "report", type=Path, 41 | help="Path to TSV file to store reasons for barcode assignment.") 42 | 43 | parser.add_argument( 44 | "--chunksize", type=int, default=50000, 45 | help="Process the BAM in chunks no larger than this.") 46 | 47 | parser.add_argument( 48 | "--use_kmer_index", action='store_true', 49 | help="Use a kmer index to reduce the search space of fuzzy matching.") 50 | 51 | parser.add_argument( 52 | "--max_ed", type=int, default=2, 53 | help="Max.
edit distance between putative barcode \ 54 | and the matching whitelist barcode.") 55 | 56 | parser.add_argument( 57 | "--min_ed_diff", type=int, default=2, 58 | help="Min. difference in edit distance between the \ 59 | best and second best whitelist matches.") 60 | 61 | return parser 62 | 63 | 64 | def determine_barcode( 65 | bc_uncorr, whitelist, whiteset, 66 | max_ed, min_ed_diff, assignment_log, index=None): 67 | """Find barcode in a whitelist corresponding to read barcode. 68 | 69 | :param bc_uncorr: uncorrected barcode. 70 | :param whitelist: list of possible barcodes. 71 | :param whiteset: whitelist as a set. 72 | :param max_ed: max. edit distance between barcode and whitelist hit. 73 | :param min_ed_diff: min. edit distance difference between first and 74 | second best hits in order to accept the first as valid. 75 | :param assignment_log: a Counter object to store reasons for barcode assignment. 76 | :param index: a kmer index for reducing search space of fuzzy-matching. 77 | 78 | Passing the whitelist as both a list and set is for performance reasons 79 | when calling this function many times. 80 | """ 81 | # quick return 82 | if bc_uncorr in whiteset: 83 | assignment_log["bc_shortlist_exact_match"] += 1 84 | return bc_uncorr 85 | 86 | if index is not None: 87 | shortlist = set() 88 | for kmer in build_kmers(bc_uncorr): 89 | shortlist.update(index[kmer]) 90 | shortlist = list(shortlist) 91 | else: 92 | shortlist = whitelist 93 | 94 | result = extract( 95 | bc_uncorr, 96 | shortlist, 97 | scorer=rapidfuzz.distance.Levenshtein.distance, 98 | score_cutoff=max_ed + min_ed_diff + 1) 99 | 100 | corrected = "-" 101 | if len(result) > 0: 102 | # There is at least 1 initial match (ED <= max_ed + min_ed_diff + 1) 103 | bc_match = result[0][0] 104 | bc_match_ed = result[0][1] 105 | else: 106 | assignment_log['bc_no_shortlist_match'] += 1 107 | return corrected 108 | if len(result) > 1: 109 | next_match_diff = result[1][1] - bc_match_ed 110 | else: 111 | next_match_diff = len(bc_uncorr) 112 | 113 | # are we better than the second place? 114 | # This criterion is a little odd: we have (2, 2) as the defaults 115 | # for max_ed and min_ed_diff. But some true barcodes are within 116 | # an edit distance of 2 to start with, so they would be guaranteed 117 | # to be filtered out (the exact match shortcut above saves us a lot 118 | # of the time). Consider removing this? 119 | if (bc_match_ed <= max_ed) and (next_match_diff >= min_ed_diff): 120 | corrected = bc_match 121 | assignment_log['bc_corrected'] += 1 122 | elif bc_match_ed > max_ed: 123 | # There was an initial rapidfuzz match, but ED was greater than our max ED. 124 | assignment_log['bc_no_shortlist_match'] += 1 125 | elif next_match_diff < min_ed_diff: 126 | # Two or more close hits in the rapidfuzz results. 127 | assignment_log['bc_shortlist_multiple_hits'] += 1 128 | 129 | return corrected 130 | 131 | 132 | def build_index(whitelist, klen=5): 133 | """Build a kmer index of a list of sequences.""" 134 | index = collections.defaultdict(set) 135 | for seq in whitelist: 136 | for ss in build_kmers(seq, klen): 137 | index[ss].add(seq) 138 | return index 139 | 140 | 141 | def build_kmers(seq, klen=5): 142 | """Create a list of kmers in a sequence.""" 143 | return [seq[i:i+klen] for i in range(0, len(seq) - klen)] 144 | 145 | 146 | def process_records( 147 | barcode_tags, whiteset, max_ed, min_ed_diff, tags_output, 148 | chunksize=50000, use_kmer_index=False): 149 | """Process read barcodes stored in text file to find whitelist equivalents.
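Uncorrected barcodes are fuzzy-matched against the whitelist by Levenshtein edit distance (rapidfuzz); a kmer index can optionally be used to shortlist candidate whitelist entries before the fuzzy match.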
150 | 151 | :param barcode_tags: path to TSV with tag data 152 | :param whiteset: set of allowed barcodes. 153 | :param max_ed: max allowed edit distance between an uncorrected barcode 154 | and a potential corrected whiteset barcode. 155 | :param min_ed_diff: minimum allowed edit distance between top two 156 | barcode candidates. 157 | """ 158 | barcode_counter = collections.Counter() 159 | # we need a list for indexing; also, rapidfuzz appears to coerce 160 | # its input to a list on every call, so converting once saves ~10% of the time. 161 | whitelist = list(whiteset) 162 | barcode_length = len(whitelist[0]) 163 | # for 16mers with 2 mismatches we must have at least a 5mer match. 164 | # The limit is reached by distributing the mismatches evenly, any 165 | # perturbation will increase the longest match length. 166 | # 0123456789ABCDEF 167 | # | | 168 | index = None 169 | if use_kmer_index: 170 | kmer = barcode_length // (max_ed + 1) 171 | index = build_index(whitelist, klen=kmer) 172 | 173 | output_cols = [ 174 | 'read_id', 'CR', 'CY', 'UR', 'UY', 'chr', 175 | 'start', 'end', 'mapq', 'CB', 'SA'] 176 | with open(tags_output, 'w') as fh: 177 | fh.write("\t".join(output_cols)) 178 | fh.write("\n") 179 | 180 | total_reads = 0 181 | assignment_log = collections.Counter() 182 | for df_tags in pd.read_csv(barcode_tags, sep='\t', chunksize=chunksize): 183 | df_tags["CB"] = "-" 184 | selected = df_tags["CR"].str.len() >= barcode_length - max_ed 185 | df_tags.loc[selected, "CB"] = df_tags.loc[selected].apply( 186 | lambda x: determine_barcode( 187 | x.CR, whitelist, whiteset, max_ed, min_ed_diff, assignment_log, 188 | index), 189 | axis=1) 190 | total_reads += len(df_tags) 191 | logger.info(f"Processed {total_reads} reads.") 192 | # Remove reads without a corrected barcode assigned.
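# (i.e. rows where `determine_barcode` returned "-" because no whitelist entry met the edit-distance criteria, or where the uncorrected barcode was too short to be considered at all)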
193 | n_records = len(df_tags) 194 | df_tags.query('CB != "-"', inplace=True) 195 | logger.info( 196 | f"Removed {n_records - len(df_tags)} reads without a corrected barcode.") 197 | if len(df_tags) != 0: 198 | df_tags[output_cols].to_csv( 199 | tags_output, mode='a', sep='\t', header=None, index=False) 200 | barcode_counter.update(df_tags["CB"]) 201 | 202 | return barcode_counter, assignment_log 203 | 204 | 205 | def main(args): 206 | """Run main entry point.""" 207 | logger.info("Reading whitelist.") 208 | whiteset = set(pd.read_csv( 209 | args.whitelist, index_col=None, sep='\t', header=None)[0]) 210 | 211 | logger.info("Processing reads.") 212 | barcode_counter, assignment_log = process_records( 213 | args.barcode_tags, whiteset, 214 | args.max_ed, args.min_ed_diff, 215 | args.output_tags, 216 | chunksize=args.chunksize, 217 | use_kmer_index=args.use_kmer_index) 218 | 219 | with open(args.output_counts, "w") as f: 220 | for bc, n in barcode_counter.most_common(): 221 | f.write(f"{bc}\t{n}\n") 222 | 223 | df_summary = pd.DataFrame.from_dict(assignment_log, orient='index') 224 | df_summary.to_csv(args.report, sep='\t', header=False) 225 | 226 | logger.info("Finished.") 227 | -------------------------------------------------------------------------------- /bin/workflow_glue/calc_saturation.py: -------------------------------------------------------------------------------- 1 | """Calculate saturation.""" 2 | 3 | import polars as pl 4 | 5 | from .util import get_named_logger, wf_parser # noqa: ABS101 6 | 7 | 8 | def argparser(): 9 | """Create argument parser.""" 10 | parser = wf_parser("Calculate saturation") 11 | 12 | parser.add_argument( 13 | "--read_tags", 14 | help="TSV file with read_id, gene, barcode, and UMI" 15 | ) 16 | 17 | parser.add_argument( 18 | "--output", 19 | help="Output TSV file with saturation curves."
20 | ) 21 | 22 | parser.add_argument( 23 | "--sample", 24 | help="sample ID/alias" 25 | ) 26 | 27 | return parser 28 | 29 | 30 | def downsample_dataframe(df, fraction): 31 | """Downsample dataframe of read tags and tabulate genes and UMIs per cell.""" 32 | logger = get_named_logger('ClcSat') 33 | 34 | logger.info(f"Doing {fraction}") 35 | df_scaled = df.sample(fraction=fraction) 36 | n_reads = df_scaled.shape[0] 37 | 38 | # Get the unique number of reads, genes and UMIs per cell barcode 39 | gb_cell = df_scaled.group_by("barcode") 40 | gb_cell_median = gb_cell.n_unique().median() 41 | genes_per_cell = gb_cell_median['gene'][0] 42 | umis_per_cell = gb_cell_median['umi'][0] 43 | # Since polars 0.20.5 groupby.count() has been renamed groupby.len() 44 | reads_per_cell = gb_cell.count().median()['count'][0] 45 | 46 | n_deduped_reads = df_scaled.group_by(['gene', 'barcode', 'umi']).count().shape[0] 47 | if n_reads < 1: 48 | umi_saturation = 0 49 | else: 50 | umi_saturation = 1 - (n_deduped_reads / n_reads) 51 | 52 | record = ( 53 | ( 54 | fraction, 55 | n_reads, 56 | reads_per_cell, 57 | genes_per_cell, 58 | umis_per_cell, 59 | umi_saturation, 60 | ) 61 | ) 62 | logger.info(f"Done saturation calculation for fraction {fraction}") 63 | return record 64 | 65 | 66 | def run_jobs(args): 67 | """Run downsampling jobs for a series of fractions and collate the results.""" 68 | logger = get_named_logger('ClcSat') 69 | 70 | df = pl.read_csv( 71 | args.read_tags, 72 | separator='\t', 73 | columns=['corrected_barcode', 'corrected_umi', 'gene'], 74 | new_columns=['barcode', 'umi', 'gene'], 75 | low_memory=True, 76 | dtypes={ 77 | 'corrected_barcode': pl.Categorical, 78 | 'corrected_umi': pl.Categorical, 79 | 'gene': str} 80 | ) 81 | 82 | df = df.filter((df['barcode'] != '-') & (df['umi'] != '-')) 83 | 84 | logger.info("Downsampling reads for saturation curves") 85 | fractions = [ 86 | 0.01, 87 | 0.02, 88 | 0.03, 89 | 0.04, 90 | 0.05, 91 | 0.1, 92 | 0.2, 93 | 0.3, 94 | 0.4, 95 | 0.5, 96 | 0.6, 97 | 0.7, 98 | 0.8, 99 | 0.9, 100 | 1.0, 101 | ] 102 | 103 | records = [(0.0, 0, 0, 0, 0, 0.0)] 104 | for frac in fractions: 105 | records.append(downsample_dataframe(df, frac)) 106 | 107 | res = pl.from_records( 108 | data=records, 109 | schema=[ 110 | "downsamp_frac", 111 | "downsamp_reads", 112 | "reads_pc", 113 | "genes_pc", 114 | "umis_pc", 115 | "umi_sat", 116 | ] 117 | ) 118 | res = res.with_columns( 119 | pl.lit(args.sample).alias("sample"), 120 | ) 121 | res.write_csv(args.output, separator="\t") 122 | 123 | 124 | def main(args): 125 | """Entry point.""" 126 | run_jobs(args) 127 | -------------------------------------------------------------------------------- /bin/workflow_glue/clip_depth.py: -------------------------------------------------------------------------------- 1 | """Clip read depth.""" 2 | 3 | import random 4 | 5 | import pandas as pd 6 | from pysam import AlignmentFile 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Create argument parser.""" 12 | parser = wf_parser("clip_depth") 13 | 14 | parser.add_argument( 15 | "--bed", 16 | help="Regions to clip") 17 | 18 | parser.add_argument( 19 | "--bam_in", 20 | help="input bam file") 21 | 22 | parser.add_argument( 23 | "--target_depth", 24 | help="Desired read depth", 25 | type=int) 26 | 27 | parser.add_argument( 28 | "--bam_out", 29 | help="output bam file") 30 | 31 | return parser 32 | 33 | 34 | def main(args): 35 | """Run entry point.""" 36 | df_hi_cov = pd.read_csv( 37 | args.bed, sep='\t', 38 | names=['read_id', 'depth'], 39 |
index_col='read_id', 40 | dtype={ 41 | 'read_id': str, 42 | 'depth': float 43 | } 44 | ) 45 | 46 | random.seed(1889) 47 | with AlignmentFile(args.bam_in, "rb", check_sq=False) as bam: 48 | with AlignmentFile(args.bam_out, "wb", template=bam) as out_bam: 49 | for aln in bam.fetch(until_eof=True): 50 | if aln.query_name in df_hi_cov.index: 51 | window_depth = df_hi_cov.at[aln.query_name, 'depth'] 52 | # Randomly discard records to achieve target depth 53 | # random.random() returns a random floating point number between 0 and 1 54 | if random.random() > args.target_depth / window_depth: 55 | continue # Discard record 56 | out_bam.write(aln) 57 | -------------------------------------------------------------------------------- /bin/workflow_glue/create_umap.py: -------------------------------------------------------------------------------- 1 | """Umap reduce.""" 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.decomposition import PCA 7 | import umap 8 | 9 | from .util import get_named_logger, wf_parser # noqa: ABS101 10 | 11 | 12 | def argparser(): 13 | """Create argument parser.""" 14 | parser = wf_parser("umap_reduce") 15 | 16 | parser.add_argument( 17 | "matrix", 18 | help="Gene expression matrix: rows=genes/transcripts, " 19 | "columns=barcodes, values=UMIs") 20 | parser.add_argument( 21 | "output", type=Path, 22 | help="UMAP TSV output file path.") 23 | 24 | parser.add_argument( 25 | "--pcn", type=int, default=100, 26 | help="Number of principal components to generate prior to UMAP") 27 | 28 | parser.add_argument( 29 | "--dimensions", type=int, default=2, 30 | help="Number of dimensions in UMAP embedding") 31 | 32 | parser.add_argument( 33 | "--min_dist", type=float, default=0.1, 34 | help="Minimum distance parameter of UMAP") 35 | 36 | parser.add_argument( 37 | "--n_neighbors", type=int, default=15, 38 | help="Number of neighbors parameter of UMAP") 39 | 40 | return parser 41 | 42 | 43 | def main(args): 44 | """Run entry point.""" 45 | logger = get_named_logger('UmapReduce') 46 | 47 | # find the number of columns, since the numpy API doesn't allow skipping cols 48 | names = np.loadtxt( 49 | args.matrix, delimiter="\t", dtype=str, max_rows=1) 50 | names = names[1:] # first is transcript/gene 51 | n_barcodes = len(names) 52 | 53 | logger.info(f"Expression matrix has {n_barcodes} cells.") 54 | logger.info("Reading entire matrix.") 55 | mat = np.loadtxt( 56 | args.matrix, delimiter="\t", dtype=float, 57 | skiprows=1, usecols=list(range(1, n_barcodes + 1))) 58 | mat = np.atleast_2d(mat).transpose() 59 | logger.info("Finished reading matrix.") 60 | 61 | logger.info(f"Expression matrix has shape: {mat.shape}") 62 | pcn = min(args.pcn, *mat.shape) 63 | model = PCA(n_components=pcn, copy=False) 64 | mat = model.fit_transform(mat) 65 | logger.info(f"PCA output matrix has shape: {mat.shape}") 66 | 67 | mapper = umap.UMAP( 68 | n_neighbors=args.n_neighbors, 69 | min_dist=args.min_dist, 70 | n_components=args.dimensions, 71 | verbose=0) 72 | embedding = mapper.fit_transform(mat) 73 | logger.info(f"UMAP Embedding has shape: {embedding.shape}") 74 | 75 | # would be nice to avoid a copy here, but the array is fairly small 76 | cols = [f"D{i+1}" for i in range(args.dimensions)] 77 | out = pd.DataFrame(embedding, columns=cols, index=names) 78 | out.to_csv( 79 | args.output, sep="\t", index=True, index_label="barcode") 80 | -------------------------------------------------------------------------------- /bin/workflow_glue/format_ctat_output.py:
-------------------------------------------------------------------------------- 1 | """Convert ctat-LR-fusion outputs from long to short format.""" 2 | from collections import defaultdict 3 | import csv 4 | 5 | import pandas as pd 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Parse the arguments.""" 12 | parser = wf_parser( 13 | "Map fusions to cell barcodes.") 14 | parser.add_argument( 15 | "fusion_file", 16 | help="Path to the fusion output file (TSV format).") 17 | parser.add_argument( 18 | "read_info_file", 19 | help="Path to the per-read info file (TSV format).") 20 | parser.add_argument( 21 | "per_read_output", 22 | help="Path to save the output TSV file.") 23 | parser.add_argument( 24 | "per_fusion_output", 25 | help="Path to save the output TSV file.") 26 | parser.add_argument( 27 | "cell_summary_out", 28 | help="Path to save the output TSV file.") 29 | parser.add_argument( 30 | "sample_id", 31 | help="sample identifier to add to tables.") 32 | parser.add_argument( 33 | "--unmatched_reads_out", 34 | help="Path to save the output TSV file of fusions with no matching CB.", 35 | default='unmatched_reads.txt') 36 | 37 | return parser 38 | 39 | 40 | def load_fusion_data(fusion_file): 41 | """Load fusion data. Extract relevant fields along with read associations.""" 42 | logger = get_named_logger('FmtCtat') 43 | try: 44 | per_fusion_df = pd.read_csv(fusion_file, sep="\t") 45 | except pd.errors.EmptyDataError: 46 | logger.warning( 47 | f"""The fusion file {fusion_file} is empty. 48 | No candidate fusions found by ctat-LR-fusion.""") 49 | return None 50 | if len(per_fusion_df) == 0: 51 | logger.warning( 52 | f"""The fusion file {fusion_file} contained no entries. 53 | No candidate fusions passed ctat-LR-fusion filters.""") 54 | return None 55 | per_fusion_df.rename(columns={"#FusionName": "FusionName"}, inplace=True) 56 | 57 | # Convert read IDs to lists 58 | per_fusion_df["LR_accessions"] = per_fusion_df["LR_accessions"].str.split(",") 59 | 60 | # Expand multiple read IDs per fusion 61 | per_read_df = per_fusion_df.explode("LR_accessions") 62 | # Select relevant columns 63 | per_read_df = per_read_df[[ 64 | "FusionName", "LeftGene", "LeftBreakpoint", "RightGene", 65 | "RightBreakpoint", "SpliceType", "LR_accessions" 66 | ]].rename(columns={"LR_accessions": "read_id"}) 67 | 68 | # Handle duplicate read IDs using defaultdict (list) 69 | fusion_dict = defaultdict(list) 70 | for row in per_read_df.itertuples(): 71 | fusion_dict[row.read_id].append({ 72 | "FusionName": row.FusionName, 73 | "LeftGene": row.LeftGene, 74 | "LeftBreakpoint": row.LeftBreakpoint, 75 | "RightGene": row.RightGene, 76 | "RightBreakpoint": row.RightBreakpoint, 77 | "SpliceType": row.SpliceType 78 | }) 79 | 80 | logger.info(f"Total fusions processed: {per_fusion_df['FusionName'].nunique()}") 81 | logger.info(f"Total unique reads linked to fusions: {len(fusion_dict)}") 82 | 83 | return fusion_dict 84 | 85 | 86 | def process_read_info(read_info_file, fusion_dict): 87 | """Combine single-cell tags with fusion info.""" 88 | matched_results = [] 89 | unmatched_reads = set(fusion_dict.keys()) # Track reads missing barcode/UMI 90 | 91 | with open(read_info_file, newline='') as csvfile: 92 | reader = csv.DictReader(csvfile, delimiter="\t") 93 | 94 | for line in reader: 95 | read_id = line['read_id'] 96 | if read_id in fusion_dict: # Only process relevant reads 97 | for fusion in fusion_dict[read_id]: 98 | matched_results.append({ 99 | "FusionName": 
fusion["FusionName"], 100 | "LeftGene": fusion["LeftGene"], 101 | "LeftBreakpoint": fusion["LeftBreakpoint"], 102 | "RightGene": fusion["RightGene"], 103 | "RightBreakpoint": fusion["RightBreakpoint"], 104 | "SpliceType": fusion["SpliceType"], 105 | "CB": line["corrected_barcode"], 106 | "UB": line["corrected_umi"], 107 | "read_id": read_id 108 | }) 109 | unmatched_reads.discard(read_id) # Remove matched reads 110 | 111 | return pd.DataFrame(matched_results), unmatched_reads 112 | 113 | 114 | def main(args): 115 | """Run the script.""" 116 | logger = get_named_logger('FmtCtat') 117 | 118 | logger.info("Loading fusion data...") 119 | 120 | fusion_dict = load_fusion_data(args.fusion_file) 121 | 122 | if fusion_dict is None: 123 | logger.warning("No fusion data found. writing empty files.") 124 | 125 | with open(args.per_read_output, 'w') as fh1: 126 | fh1.write( 127 | "FusionName\tLeftGene\tLeftBreakpoint\tRightGene\tRightBreakpoint" 128 | "\tSpliceType\tCB\tUB\tread_id\n" 129 | ) 130 | 131 | with open(args.per_fusion_output, 'w') as fh2: 132 | fh2.write( 133 | "Fusion\tLeftGene\tLeftBreakpoint\tRightGene\tRightBreakpoint" 134 | "\tSpliceType\tcells\tUMIs\tsample_ID\n" 135 | ) 136 | ( 137 | pd.DataFrame.from_records( 138 | [[args.sample_id, 0, 0, 0, 0, 0]], 139 | columns=[ 140 | 'sample_ID', 'cells_with_fusions', 'unique_fusions', 'reads', 141 | 'mean_fusion_reads_per_cell', 'mean_unique_fusions_per_cell'], 142 | ) 143 | .to_csv(args.cell_summary_out, sep="\t", index=False) 144 | ) 145 | else: 146 | logger.info("Processing read information...") 147 | merged_df, unmatched_reads = process_read_info( 148 | args.read_info_file, fusion_dict) 149 | 150 | # Make per-fusion summary from barcode-assigned fusion + reads. 151 | ( 152 | merged_df.groupby( 153 | ['FusionName', 'LeftGene', 'LeftBreakpoint', 154 | 'RightGene', 'RightBreakpoint', 'SpliceType']) 155 | .agg( 156 | cells=('CB', 'nunique'), 157 | UMIs=('UB', 'nunique')) 158 | .reset_index() 159 | .assign( 160 | sample_ID=args.sample_id) 161 | .rename(columns={'FusionName': 'Fusion'}) 162 | .to_csv(args.per_fusion_output, sep="\t", index=False) 163 | ) 164 | 165 | # Sort reads by most commonly occuring fusion pair 166 | merged_df = (merged_df.sort_values( 167 | by="FusionName", 168 | key=lambda col: col.map(col.value_counts()), ascending=False)) 169 | 170 | logger.info("Saving merged single cell/fusion output") 171 | merged_df.to_csv(args.per_read_output, sep="\t", index=False) 172 | 173 | logger.info("Writing the per fusion summary...") 174 | # Regerate the per-fusion summary adding some cell-specific info. 
175 | ( 176 | pd.DataFrame.from_dict({ 177 | 'sample_ID': args.sample_id, 178 | 'cells_with_fusions': merged_df['CB'].nunique(), 179 | 'unique_fusions': merged_df['FusionName'].nunique(), 180 | 'reads': len(merged_df), 181 | 'mean_fusion_reads_per_cell': ( 182 | merged_df.groupby('CB')['FusionName'].count().mean()), 183 | 'mean_unique_fusions_per_cell': ( 184 | merged_df.groupby('CB')['FusionName'].nunique().mean()), 185 | }, orient='index') 186 | .T 187 | .to_csv(args.cell_summary_out, sep="\t", index=False) 188 | ) 189 | 190 | # Summary of unmatched reads 191 | logger.info( 192 | ( 193 | "\n**Summary:**" 194 | f"Matched reads with barcode/UMI: {len(merged_df)}\n" 195 | f"Reads missing barcode/UMI info: {len(unmatched_reads)}") 196 | ) 197 | 198 | if unmatched_reads: 199 | with open(args.unmatched_reads_out, "w") as uf: 200 | for read in unmatched_reads: 201 | uf.write(f"{read}\n") 202 | logger.info("Process complete!") 203 | -------------------------------------------------------------------------------- /bin/workflow_glue/models/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of scripts for results models.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/parse_kit_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Read in and validate user sample data.""" 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .util import wf_parser # noqa: ABS101 8 | 9 | 10 | def argparser(): 11 | """Create argument parser.""" 12 | parser = wf_parser("read_samples") 13 | parent_parser = wf_parser("read_samples_parent") 14 | 15 | parent_parser.add_argument( 16 | "--kit_config", 17 | help="Kit-specific details CSV", 18 | type=Path, 19 | required=True 20 | ) 21 | parent_parser.add_argument( 22 | "--sample_ids", 23 | help="File with IDs from each sample", 24 | type=Path, 25 | required=True 26 | ) 27 | parent_parser.add_argument( 28 | "--output", 29 | help="Output path for merged config", 30 | type=Path, 31 | required=True 32 | ) 33 | 34 | subparsers = parser.add_subparsers(help='commands', dest="cmd") 35 | 36 | parser_sheet = subparsers.add_parser( 37 | 'from_sheet', help='Get kit metadata per sample from sample sheet', 38 | parents=[parent_parser] 39 | ) 40 | parser_sheet.add_argument( 41 | "--user_config", 42 | help="User sample metadata CSV file", 43 | type=Path, 44 | required=True 45 | ) 46 | 47 | parser_cli = subparsers.add_parser( 48 | "from_cli", 49 | help='Apply the same kit metadata to all samples from CLI variables', 50 | parents=[parent_parser], 51 | ) 52 | parser_cli.add_argument( 53 | "--kit", 54 | help="10x kit (name:version)", 55 | required=True 56 | ) 57 | parser_cli.add_argument( 58 | "--expected_cells", 59 | help="Number of expected cells", 60 | required=True 61 | ) 62 | 63 | return parser 64 | 65 | 66 | def main(args): 67 | """Entry point.""" 68 | # Single cell sample sheet expected header 69 | sc_sample_sheet_header = [ 70 | 'sample_id', 71 | 'kit', 72 | 'expected_cells' 73 | ] 74 | 75 | sample_ids = pd.read_csv(args.sample_ids, index_col=None, header=None)[0].to_list() 76 | 77 | if args.cmd == 'from_cli': 78 | # No per-sample single-cell sample sheet given by user, so we will use the 79 | # individual CLI parameters to build a CSV and apply the same parameters to 80 | # each sample 81 | entries = [ 82 | [sid.strip(), args.kit, args.expected_cells] 83 | for sid in sample_ids 
84 | ] 85 | user_df = pd.DataFrame.from_records( 86 | entries, columns=sc_sample_sheet_header 87 | ) 88 | elif args.cmd == 'from_sheet': 89 | user_df = pd.read_csv(args.user_config) 90 | 91 | # Validate sample sheet header 92 | if len(set(sc_sample_sheet_header).difference(set(user_df.columns))) != 0: 93 | raise ValueError( 94 | 'single_cell_sample_sheet should have the following column names: ' 95 | f'{sc_sample_sheet_header}') 96 | 97 | # Validate kit + version combinations. 98 | kit_df = pd.read_csv(args.kit_config) 99 | 100 | # Check if all supplied kits + version strings are supported 101 | kit_and_version_diff = set(user_df.kit).difference(kit_df.kit) 102 | if len(kit_and_version_diff) != 0: 103 | raise ValueError( 104 | 'the following are not valid kit and version combinations: ' 105 | f'{kit_and_version_diff}') 106 | 107 | # Check that ingressed IDs match sample_ids from sample_sheet 108 | if set(user_df['sample_id']) != set(sample_ids): 109 | raise ValueError( 110 | 'Sample IDs from the sc_sample_sheet must match those inferred ' 111 | 'from the input data:' 112 | f'\nSample IDs from ingressed data: {sample_ids}' 113 | f'\nSample IDs from sample sheet: {user_df.sample_id.to_list()}' 114 | f'\nSample IDs in sample sheet but not in ingressed data: ' 115 | f'{set(user_df["sample_id"]).difference(sample_ids)}' 116 | f'\nSample IDs in ingressed data but not in sample sheet: ' 117 | f'{set(sample_ids).difference(user_df["sample_id"])}' 118 | ) 119 | 120 | merged_config = user_df.merge( 121 | kit_df, on='kit', how='left', suffixes=(None, '_delete')) 122 | # Create kit name and version columns from the kit:version string 123 | merged_config[['kit_name', 'kit_version']] \ 124 | = merged_config['kit'].str.split(':', expand=True) 125 | cols_to_drop = merged_config.columns[merged_config.columns.str.contains('delete')] 126 | merged_config = merged_config.drop(cols_to_drop, axis=1) 127 | merged_config.to_csv(args.output, sep=',', index=False) 128 | -------------------------------------------------------------------------------- /bin/workflow_glue/prepare_report_data.py: -------------------------------------------------------------------------------- 1 | """Prepare data for the report.""" 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.io import mmread 7 | 8 | from .adapter_scan_vsearch import AdapterSummary # noqa: ABS101 9 | from .create_matrix import ExpressionSummary # noqa: ABS101 10 | from .util import get_named_logger, wf_parser # noqa: ABS101 11 | 12 | 13 | # TODO: Code in this script should move into the main report.py script 14 | 15 | 16 | def argparser(): 17 | """Argument parser for entrypoint.""" 18 | parser = wf_parser("prepare_report_data") 19 | 20 | parser.add_argument( 21 | "sample_id", 22 | help="ID of the sample being processed") 23 | parser.add_argument( 24 | "adapter_stats", type=Path, 25 | help="Adapter configuration summary statistics") 26 | parser.add_argument( 27 | "bam_stats", type=Path, 28 | help="Alignment summary statistics") 29 | parser.add_argument( 30 | "expression_stats", type=Path, 31 | help="Expression summary statistics") 32 | parser.add_argument( 33 | "white_list", 34 | help="Cell barcode whitelist") 35 | parser.add_argument( 36 | "survival_out", type=Path, 37 | help="Output TSV with survival data for each stage.") 38 | parser.add_argument( 39 | "bam_stats_out", type=Path, 40 | help="Output TSV with combined alignment summary stats.") 41 | parser.add_argument( 42 | "raw_gene_expression", type=Path, 43 | help="Sparse
data in MEX format.") 44 | parser.add_argument( 45 | "matrix_stats", type=Path, 46 | help="TSV file with matrix summary stats.") 47 | parser.add_argument( 48 | "genes_of_interest", type=Path, 49 | help="File listing genes of interest, one per line.") 50 | parser.add_argument( 51 | "n_input_seqs", type=int, 52 | help="Number of seqs input to the workflow after read quality filtering.") 53 | return parser 54 | 55 | 56 | def combine_bam_stats(input_dir, sample_id): 57 | """Aggregate alignment statistics.""" 58 | dfs = [] 59 | colnames = { 60 | "PrimAln": "primary", 61 | "SecAln": "secondary", 62 | "SupAln": "supplementary", 63 | "Unmapped": "unmapped", 64 | "TotalReads": "reads_aligned" 65 | 66 | } 67 | for stats in input_dir.glob('*.tsv'): 68 | dfs.append(pd.read_csv( 69 | stats, sep='\t', 70 | usecols=colnames.keys(), 71 | dtype=int 72 | )) 73 | df = pd.concat(dfs) 74 | df = pd.DataFrame(df.sum(axis=0)).T 75 | df = df.rename(columns=colnames) 76 | df.insert(0, 'sample', sample_id) 77 | df.insert(1, 'reads_aligned', df.pop('reads_aligned')) 78 | 79 | return df 80 | 81 | 82 | def combine_expression_stats(input_dir): 83 | """Summarise expression summary files.""" 84 | fnames = list(input_dir.glob("*.json")) 85 | if len(fnames) == 0: 86 | raise IOError("No summary JSON files found.") 87 | 88 | summary = ExpressionSummary.from_json(fnames[0]) 89 | if len(fnames) > 1: 90 | for other in fnames[1:]: 91 | summary += ExpressionSummary.from_json(other) 92 | return summary 93 | 94 | 95 | def combine_adapter_stats(input_dir): 96 | """Combine adapter configuration summary files.""" 97 | fnames = list(input_dir.glob("*.json")) 98 | if len(fnames) == 0: 99 | raise IOError("No summary JSON files found.") 100 | 101 | summary = AdapterSummary.from_json(fnames[0]) 102 | if len(fnames) > 1: 103 | for other in fnames[1:]: 104 | summary += AdapterSummary.from_json(other) 105 | return summary 106 | 107 | 108 | def get_total_cells(white_list): 109 | """Count the total number of cells in the whitelist.""" 110 | # ok this is a little cheesy, but consistent for ease 111 | total_cells = len(pd.read_csv(white_list, sep='\t', header=None)) 112 | return {"cells": total_cells} 113 | 114 | 115 | def get_genes_of_interest_expression(mex_dir, genes): 116 | """Get a subset of the expression data. 117 | 118 | Given a list of genes, extract corresponding expression data from the MEX format 119 | matrix.
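Barcode column names have their '-1' suffix stripped, and any requested gene absent from the features file is skipped.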
120 | """ 121 | genes_to_plot = pd.read_csv(genes, header=None)[0] 122 | matrix = mmread(mex_dir / 'matrix.mtx.gz') 123 | barcodes = pd.read_csv(mex_dir / 'barcodes.tsv.gz', header=None) 124 | # Remove '-1' suffix from barcodes 125 | barcodes = barcodes[0].str.split('-', expand=True)[0] 126 | features = pd.read_csv(mex_dir / 'features.tsv.gz', sep='\t', header=None)[1] 127 | rows = [] 128 | for gene in genes_to_plot: 129 | try: 130 | feature_idx = features[features == gene].index[0] 131 | rows.append( 132 | [gene, np.array(matrix.getrow(feature_idx).todense()).flatten()]) 133 | except IndexError: 134 | continue # no data 135 | if len(rows) > 0: 136 | return ( 137 | pd.DataFrame.from_records( 138 | [i[1] for i in rows], index=[j[0] for j in rows], columns=barcodes) 139 | ) 140 | else: 141 | return pd.DataFrame() 142 | 143 | 144 | def main(args): 145 | """Entry point for script.""" 146 | logger = get_named_logger('PrepReport') 147 | logger.info('Preparing report data.') 148 | stats = dict() 149 | stats.update(combine_expression_stats(args.expression_stats)) 150 | stats.update(combine_adapter_stats(args.adapter_stats)) 151 | stats.update(get_total_cells(args.white_list)) 152 | # n seqs after any read quality filtering 153 | n_input_reads = args.n_input_seqs 154 | stats.update({'reads': n_input_reads}) 155 | matstats = pd.read_csv( 156 | args.matrix_stats, sep='\t', header=None, names=['stat', 'val']) 157 | for _, row in matstats.iterrows(): 158 | stats[row['stat']] = row['val'] 159 | stats['mean_reads_per_cell'] = stats['reads'] / stats['cells'] 160 | 161 | survival = ( 162 | pd.DataFrame.from_dict(stats, orient="index", columns=['count']) 163 | .reset_index(names="statistic")) 164 | 165 | # this is a little nonsensical for some stats 166 | survival['pct_of_input_reads'] = 100 * survival['count'] / n_input_reads 167 | survival['pct_of_fl_reads'] = 100 * survival['count'] / stats['full_length'] 168 | survival['sample_id'] = args.sample_id 169 | 170 | survival.set_index('statistic', inplace=True, drop=True) 171 | survival.to_csv(args.survival_out, sep='\t', index=True) 172 | 173 | aln_stats = combine_bam_stats(args.bam_stats, args.sample_id) 174 | aln_stats.to_csv(args.bam_stats_out, sep='\t', index=False) 175 | 176 | goi_df = get_genes_of_interest_expression( 177 | args.raw_gene_expression, args.genes_of_interest) 178 | goi_df.to_csv( 179 | Path(f"{args.sample_id}_expression") / 'raw_goi_expression.tsv', sep='\t') 180 | -------------------------------------------------------------------------------- /bin/workflow_glue/process_matrix.py: -------------------------------------------------------------------------------- 1 | """Expression counts matrix construction.""" 2 | import argparse 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.decomposition import PCA 8 | import umap 9 | 10 | from .expression_matrix import ExpressionMatrix # noqa: ABS101 11 | from .util import get_named_logger, wf_parser # noqa: ABS101 12 | 13 | 14 | def argparser(): 15 | """Create argument parser.""" 16 | parser = wf_parser("exp_mat") 17 | 18 | parser.add_argument( 19 | "input", type=Path, nargs='+', 20 | help="TSV with read tag data or batched expression matrices in HDF.") 21 | parser.add_argument( 22 | "--feature", default="gene", choices=["gene", "transcript"], 23 | help="Feature to compute matrix. 
Only used when read tag input is given.") 24 | parser.add_argument( 25 | "--raw", default="raw_feature_bc_matrix", 26 | help="Output folder for raw counts MEX data.") 27 | parser.add_argument( 28 | "--processed", default="processed_feature_bc_matrix", 29 | help="Output folder for processed counts MEX data.") 30 | parser.add_argument( 31 | "--per_cell_expr", default="expression.mean-per-cell.tsv", type=Path, 32 | help="Output TSV for per-cell mean expression level.") 33 | parser.add_argument( 34 | "--per_cell_mito", default="expression.mito-per-cell.tsv", type=Path, 35 | help="Output TSV for per-cell mean mito expression level.") 36 | parser.add_argument( 37 | "--stats", type=Path, help="Output path for stats TSV.") 38 | parser.add_argument( 39 | "--text", action="store_true", help=argparse.SUPPRESS) 40 | 41 | grp = parser.add_argument_group("Filtering") 42 | grp.add_argument( 43 | "--enable_filtering", action="store_true", 44 | help="Enable filtering of matrix.") 45 | grp.add_argument( 46 | "--min_features", type=int, default=100, 47 | help="Filter out cells that contain fewer features than this.") 48 | grp.add_argument( 49 | "--min_cells", type=int, default=3, 50 | help="Filter out features that are observed in fewer than this " 51 | "number of cells") 52 | grp.add_argument( 53 | "--max_mito", type=int, default=5, 54 | help="Filter out cells where more than this percentage of counts " 55 | "belong to mitochondrial features.") 56 | grp.add_argument( 57 | "--mito_prefixes", default=["MT-"], nargs='*', 58 | help="prefixes to identify mitochondrial features.") 59 | grp.add_argument( 60 | "--norm_count", type=int, default=10000, 61 | help="Normalize to this number of counts per cell as " 62 | "is performed in CellRanger.") 63 | grp.add_argument( 64 | "--filtered_mex", default="filtered_feature_bc_matrix", 65 | help="Output folder for raw counts MEX data.") 66 | 67 | grp = parser.add_argument_group("UMAP creation") 68 | grp.add_argument( 69 | "--enable_umap", action="store_true", 70 | help="Perform UMAP on matrix.") 71 | grp.add_argument( 72 | "--umap_tsv", default="expression.umap.tsv", type=Path, 73 | help=( 74 | "UMAP TSV output file path. If --replicates is greater than 1 " 75 | "files will be named: name.index.tsv.")) 76 | grp.add_argument( 77 | "--replicates", type=int, default=1, 78 | help="Number of UMAP replicated to perform.") 79 | grp.add_argument( 80 | "--pcn", type=int, default=100, 81 | help="Number of principal components to generate prior to UMAP") 82 | grp.add_argument( 83 | "--dimensions", type=int, default=2, 84 | help="Number of dimensions in UMAP embedding") 85 | grp.add_argument( 86 | "--min_dist", type=float, default=0.1, 87 | help="Minimum distance parameter of UMAP") 88 | grp.add_argument( 89 | "--n_neighbors", type=int, default=15, 90 | help="Number of neighbors parameter of UMAP") 91 | 92 | return parser 93 | 94 | 95 | def main(args): 96 | """Make feature x cell, UMI-deduplicated, counts matrix.""" 97 | logger = get_named_logger('AggreMatrix') 98 | logger.info('Constructing count matrices') 99 | 100 | # converting to float on fly means we can save a copy when normalizing 101 | try: 102 | matrix = ExpressionMatrix.aggregate_tags(args.input, args.feature, dtype=float) 103 | except UnicodeDecodeError: 104 | matrix = ExpressionMatrix.aggregate_hdfs(args.input, dtype=float) 105 | 106 | logger.info("Removing unknown features.") 107 | if len(matrix.cells) == 0: 108 | raise ValueError("""The expression matrix contains no cells. 
109 | This may indicate an issue with data quality or volume. 110 | Incorrectly specified 10x kits/versions and reference data can also lead 111 | to removal of all data at this point.""") 112 | 113 | # Generate statistics from the assembled matrix before any filtering. 114 | stats = {} 115 | stats['median_umis_per_cell'] = matrix.median_counts 116 | stats['median_genes_per_cell'] = matrix.median_features_per_cell 117 | 118 | with open(args.stats, 'w') as fh: 119 | for k, v in stats.items(): 120 | fh.write(f'{k}\t{v}\n') 121 | 122 | # Begin filtering 123 | matrix.remove_unknown() 124 | 125 | logger.info("Writing raw counts to file.") 126 | if args.text: 127 | matrix.to_tsv(args.raw, args.feature) 128 | else: 129 | matrix.to_mex(args.raw, dtype=int) 130 | 131 | if args.enable_filtering: 132 | logger.info("Filtering, normalizing and log-transforming matrix.") 133 | matrix = ( 134 | matrix 135 | .remove_cells_and_features(args.min_features, args.min_cells) 136 | .remove_skewed_cells( 137 | args.max_mito / 100, args.mito_prefixes, 138 | fname=args.per_cell_mito, label="mito_pct") 139 | .normalize(args.norm_count) 140 | .log_transform() 141 | ) 142 | logger.info("Writing filtered matrix.") 143 | if args.text: 144 | matrix.to_tsv(args.processed, args.feature) 145 | else: 146 | matrix.to_mex(args.processed) 147 | else: 148 | logger.info("Normalizing and log-transforming matrix.") 149 | matrix.normalize(args.norm_count).log_transform() 150 | 151 | logger.info("Writing mean expression levels.") 152 | ExpressionMatrix.write_matrix( 153 | args.per_cell_expr, 154 | matrix.mean_expression, matrix.tcells, ['mean_expression'], index_name='CB') 155 | 156 | if args.enable_umap: 157 | # note, we're going to do things in place so ExpressionMatrix will 158 | # become modified (trimmed on feature axis, and transposed) 159 | mat = matrix._matrix 160 | pcn = min(args.pcn, *mat.shape) 161 | matrix._features = np.array([f"pca_{i}" for i in range(pcn)]) 162 | logger.info(f"Performing PCA on matrix of shape: {matrix.matrix.shape}") 163 | model = PCA(n_components=pcn, copy=False) 164 | mat = model.fit_transform(mat.transpose()) # warning!
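# (after this transpose rows are cells and columns are principal components; the orientation is flipped back after the UMAP loop below via `matrix._matrix = mat.transpose()`)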
165 | logger.info(f"PCA output matrix has shape: {mat.shape}") 166 | 167 | for replicate in range(args.replicates): 168 | logger.info(f"Performing UMAP replicate {replicate + 1}.") 169 | mapper = umap.UMAP( 170 | n_neighbors=args.n_neighbors, 171 | min_dist=args.min_dist, 172 | n_components=args.dimensions, 173 | verbose=0) 174 | embedding = mapper.fit_transform(mat) 175 | logger.info(f"UMAP Embedding has shape: {embedding.shape}") 176 | 177 | # would be nice to avoid a copy here, but the array is fairly small 178 | fname = str(args.umap_tsv).replace('REPEAT', str(replicate)) 179 | logger.info(f"Writing UMAP embedding {fname}.") 180 | cols = [f"D{i+1}" for i in range(args.dimensions)] 181 | out = pd.DataFrame(embedding, columns=cols, index=matrix.tcells) 182 | out.to_csv(fname, sep="\t", index=True, index_label="CB") 183 | matrix._matrix = mat.transpose() # undo the PCA transpose 184 | 185 | logger.info("Done.") 186 | -------------------------------------------------------------------------------- /bin/workflow_glue/sc_util.py: -------------------------------------------------------------------------------- 1 | """Common code to be used across workflow scripts.""" 2 | import collections 3 | import json 4 | 5 | kit_adapters = { 6 | '3prime': { 7 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 8 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 9 | }, 10 | 'multiome': { 11 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 12 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 13 | }, 14 | 'visium': { 15 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 16 | 'adapter2': 'ATGTACTCTGCGTTGATACCACTGCTT' 17 | }, 18 | '5prime': { 19 | 'adapter1': 'CTACACGACGCTCTTCCGATCT', 20 | 'adapter2': 'GTACTCTGCGTTGATACCACTGCTT' 21 | } 22 | } 23 | 24 | revcomp_map = str.maketrans("ACGTacgt", "TGCAtgca") 25 | 26 | 27 | def rev_cmp(seq): 28 | """Reverse complement a DNA sequence.""" 29 | return seq[::-1].translate(revcomp_map) 30 | 31 | 32 | class StatsSummary(collections.Counter): 33 | """Summary dictionary for storing.""" 34 | 35 | fields = {} # subclasses should fill this in 36 | 37 | def __init__(self, *args, **kwargs): 38 | """Count some numbers.""" 39 | self.update(*args, **kwargs) 40 | 41 | @classmethod 42 | def from_pandas(cls, df): 43 | """Create an instance from a pandas dataframe.""" 44 | raise NotImplementedError("This method has not been implemented.") 45 | 46 | def to_dict(self): 47 | """Create dictionary with explicit zeroes.""" 48 | return {k: self[k] for k in self} 49 | 50 | @classmethod 51 | def from_json(cls, fname): 52 | """Create and instance from a JSON file.""" 53 | with open(fname, "r") as fh: 54 | data = json.load(fh) 55 | return cls(data) 56 | 57 | def to_json(self, fname): 58 | """Save to JSON.""" 59 | with open(fname, "w") as fh: 60 | json.dump(self.to_dict(), fh, indent=4) 61 | -------------------------------------------------------------------------------- /bin/workflow_glue/summarise_adapters.py: -------------------------------------------------------------------------------- 1 | """Aggregate adapter configuration summaries.""" 2 | from pathlib import Path 3 | 4 | from .adapter_scan_vsearch import AdapterSummary # noqa: ABS101 5 | from .util import get_named_logger, wf_parser # noqa: ABS101 6 | 7 | 8 | def argparser(): 9 | """Create argument parser.""" 10 | parser = wf_parser("summarise_adapters") 11 | 12 | parser.add_argument( 13 | "input_dir", type=Path, 14 | help="Path to JSON files to aggregate.") 15 | parser.add_argument( 16 | "output", type=Path, 17 | help="Path to output JSON file") 18 | 19 | return parser 20 | 21 | 
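# An illustrative invocation via the workflow-glue entry point (the subcommand name
# is derived from this module's filename; the paths here are placeholders):
#   workflow-glue summarise_adapters adapter_summaries/ combined_adapter_summary.json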
22 | def main(args): 23 | """Aggregate multiple adapter configuration summary files.""" 24 | logger = get_named_logger('AggAdptCnf') 25 | logger.info("Aggregating adapter configurations") 26 | 27 | fnames = list(args.input_dir.glob("*.json")) 28 | if len(fnames) == 0: 29 | raise IOError("No summary JSON files found.") 30 | 31 | summary = AdapterSummary.from_json(fnames[0]) 32 | if len(fnames) > 1: 33 | for other in fnames[1:]: 34 | summary += AdapterSummary.from_json(other) 35 | summary.to_json(args.output) 36 | -------------------------------------------------------------------------------- /bin/workflow_glue/tag_bam.py: -------------------------------------------------------------------------------- 1 | """Tag BAM with workflow-derived information. 2 | 3 | Tags files are TSV files containing read_id to tags mappings (such as barcodes, UMIs, 4 | assigned features). We iterate over the BAM file by chromosome, loading the tags for 5 | each chromosome individually to avoid holding all tags in memory 6 | at once. The records are tagged and output to a tagged BAM. This process only tags 7 | primary records, or supplementary records that are on the same chromosome as their 8 | primary record. 9 | 10 | To tag supplementary records, there is another tags file input that contains 11 | read_id to tag mappings for all supplementary records, formatted identically to the 12 | primary tags file. 13 | During tagging, these are all loaded into memory regardless of chromosome they map to. 14 | This allows supplementary records that map to a different chromosome than their 15 | primary alignment to be properly tagged. 16 | """ 17 | import csv 18 | from dataclasses import dataclass 19 | import itertools 20 | from pathlib import Path 21 | 22 | import pysam 23 | from .util import get_named_logger, wf_parser # noqa: ABS101 24 | 25 | logger = get_named_logger("TagBAMs") 26 | 27 | 28 | BAM_TAGS = { 29 | "corrected_barcode": "CB", 30 | "uncorrected_barcode": "CR", 31 | "quality_barcode": "CY", 32 | "corrected_umi": "UB", 33 | "uncorrected_umi": "UR", 34 | "quality_umi": "UY", 35 | "gene": "GN", 36 | "transcript": "TR" 37 | } 38 | 39 | 40 | def argparser(): 41 | """Create argument parser.""" 42 | parser = wf_parser("tag_bams") 43 | 44 | parser.add_argument( 45 | "in_bam", type=Path, 46 | help="BAM file for tagging") 47 | 48 | parser.add_argument( 49 | "out_bam", type=Path, 50 | help="Path for tagged output BAM") 51 | 52 | parser.add_argument( 53 | "tags", type=Path, 54 | help="Read tags TSV") 55 | 56 | parser.add_argument( 57 | "sa_tags", type=Path, 58 | help="Read supplementary tags TSV") 59 | 60 | parser.add_argument( 61 | "--threads", default=2, type=int, 62 | help="Number of threads used for BAM reading/writing.") 63 | return parser 64 | 65 | 66 | # The use of a dataclass here is primarily to reduce memory: 67 | # chr1, 9517964 reads. dict: 11.6 GB, class: 8.5 GB 68 | # The overhead in creating instances of the class is small 69 | # compared to the BAM writing time, and access is similarly 70 | # fast enough. 
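# Illustrative example (values are hypothetical): after `TagStore._read_file`
# has renamed the human-readable TSV columns via BAM_TAGS, a parsed row such as
#     {"read_id": "read_1", "CB": "AAACCCAAGAAACACT", "CR": "AAACCCAAGAAACACT",
#      "CY": "?" * 16, "UB": "GACTGACTGACT", "UR": "GACTGACTGACT", "UY": "?" * 12,
#      "GN": "GENE1", "TR": "TRANSCRIPT1", "chr": "chr1"}
# is turned into a `Tags` instance by `Tags.from_dict` below, and those
# attributes are what `add_tags` later writes onto each BAM record as the
# CB/CR/CY/UB/UR/UY/GN/TR tags.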
71 | 72 | @dataclass 73 | class Tags: 74 | """Storing tag data for a read.""" 75 | 76 | CB = None 77 | CR = None 78 | CY = None 79 | UB = None 80 | UR = None 81 | UY = None 82 | GN = None 83 | TR = None 84 | chrom = None 85 | 86 | @classmethod 87 | def from_dict(cls, d): 88 | """Create instance from a dictionary.""" 89 | self = cls() 90 | for k in BAM_TAGS.values(): 91 | setattr(self, k, d[k]) 92 | setattr(self, "chrom", d["chr"]) 93 | return self 94 | 95 | 96 | class TagStore: 97 | """Proxy to tag files for retrieving per-read tag information.""" 98 | 99 | def __init__(self, tags, bam=None, sa_tags=None): 100 | """Initialize an instance.""" 101 | self._sa_tags = self._load_supplementary_tags(sa_tags) 102 | self._cur = None 103 | self._single = False 104 | if tags.is_file(): 105 | self._single = True 106 | self._tags = self._read_file(tags) 107 | elif tags.is_dir(): 108 | if bam is None: 109 | raise ValueError("`bam` should be provided when `tags` is a directory.") 110 | tags = tags.glob("*.tsv") 111 | self._index = dict() 112 | for fname in tags: 113 | d = self._read_file(fname, nrows=10) 114 | try: 115 | chrom = getattr(next(iter(d.values())), "chrom") 116 | self._index[chrom] = fname 117 | except StopIteration: 118 | logger.warning(f"{fname} appears empty.") 119 | else: 120 | logger.info(f"{fname} contains tags for reference: {chrom}.") 121 | else: 122 | raise ValueError( 123 | "`tags` should be a tags file or directory containing such files.") 124 | 125 | def _load_supplementary_tags(self, sa_tags): 126 | # Load tag info for all reads with one or more suppl. records. 127 | # These are added to self._tags later regardless of chr mapping. 128 | # This enures that supplementary records are tagged even if on a different 129 | # chr to primary record. 
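        # Note (added for clarity): `sa_tags` is expected to be a directory of
        # TSV files matched by the "*.tsv" glob below; passing None simply
        # leaves the supplementary-tag store empty.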
130 | sa_tags_files = [] 131 | if sa_tags is not None: 132 | sa_tags_files = sa_tags.glob("*.tsv") 133 | sa_tags = {} 134 | for fname in sa_tags_files: 135 | sa_tags.update(self._read_file(fname)) 136 | return sa_tags 137 | 138 | def populate(self, rname): 139 | """Populate the proxy for a given reference.""" 140 | if not self._single: 141 | self._cur = rname 142 | try: 143 | self._tags = self._read_file(self._index[self._cur]) 144 | except KeyError: 145 | # No primary records for this chr, but there may be suppl records 146 | self._tags = {} 147 | self._tags.update(self._sa_tags) 148 | 149 | def _read_file(self, fname, nrows=None, cols=None): 150 | """Read a tags file.""" 151 | # note: this is actually around 50% faster than: 152 | # pd.read_csv().to_dict(orient="index") 153 | # first find and rename the fields to be tags rather than human names 154 | fields = None 155 | with open(fname) as csvfile: 156 | iterator = csv.DictReader(csvfile, delimiter="\t") 157 | fields = iterator.fieldnames 158 | for k, v in BAM_TAGS.items(): 159 | for i in range(len(fields)): 160 | if fields[i] == k: 161 | fields[i] = v 162 | # now parse the file 163 | with open(fname) as csvfile: 164 | iterator = csv.DictReader(csvfile, delimiter="\t", fieldnames=fields) 165 | next(iterator) # setting fieldnames doesn't read header 166 | if nrows is not None: 167 | iterator = itertools.islice(iterator, nrows) 168 | data = {d["read_id"]: Tags.from_dict(d) for d in iterator} 169 | return data 170 | 171 | def __getitem__(self, read_data): 172 | """Retrieve tags for a read.""" 173 | read_id, chrom = read_data 174 | try: 175 | data = self._tags[read_id] 176 | except KeyError: 177 | exp = KeyError(f"Read '{read_id}' not found in tag data.") 178 | if chrom != self._cur: 179 | self._cur = chrom 180 | self._tags = self._read_file(self._index[self._cur]) 181 | try: 182 | data = self._tags[read_id] 183 | except KeyError: 184 | raise exp 185 | else: 186 | raise exp 187 | return data 188 | 189 | 190 | def add_tags(tags, sa_tags, in_bam, out_bam, threads): 191 | """Add all the required tags to the BAM file.""" 192 | store = TagStore(tags, bam=in_bam, sa_tags=sa_tags) 193 | 194 | skipped = 0 195 | written = 0 196 | with pysam.AlignmentFile(in_bam, "rb", threads=threads) as bam_in: 197 | with pysam.AlignmentFile( 198 | out_bam, "wb", template=bam_in, threads=threads) as bam_out: 199 | for ref in bam_in.references: 200 | logger.info(f"Processing reads from reference: {ref}.") 201 | # There may be no primary records for this reference, but we'll process 202 | # it in case there are any supplementary records 203 | store.populate(ref) 204 | logger.info("Tagging reads.") 205 | for align in bam_in.fetch(ref): 206 | read_id = align.query_name 207 | try: 208 | row = store._tags[read_id] 209 | except KeyError: 210 | skipped += 1 211 | continue # don't write reads without tags 212 | else: 213 | written += 1 214 | for tag in BAM_TAGS.values(): 215 | align.set_tag(tag, getattr(row, tag), value_type="Z") 216 | bam_out.write(align) 217 | total = skipped + written 218 | written_pct = 0 219 | skipped_pct = 0 220 | if total > 0: 221 | written_pct = 100 * written / total 222 | skipped_pct = 100 * skipped / total 223 | logger.info( 224 | f"Written: {written} ({written_pct:0.2f}%). 
" 225 | f"Skipped: {skipped} ({skipped_pct:0.2f}%).") 226 | 227 | 228 | def main(args): 229 | """Entry point.""" 230 | add_tags(args.tags, args.sa_tags, args.in_bam, args.out_bam, args.threads) 231 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_adapter_scan_vsearch.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | from pathlib import Path 3 | import tempfile 4 | 5 | import pysam 6 | import pytest 7 | from pytest import fixture 8 | from workflow_glue.adapter_scan_vsearch import ( 9 | call_vsearch, complement_trans, create_stranded_reads, 10 | parse_vsearch, write_adapters_fasta) 11 | from workflow_glue.sc_util import kit_adapters 12 | 13 | 14 | @fixture 15 | def segment(): 16 | """Random sequence to build a seq by concatenating along with adapters.""" 17 | return ( 18 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 19 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 20 | 21 | 22 | @pytest.mark.parametrize( 23 | 'adapters,expected_results', 24 | [ 25 | # Non-full length reads 26 | [[], [['*', 'no_adapters', '*']]], 27 | [['adapter1_f'], [['adapter1_f', 'single_adapter1', '+']]], 28 | [['adapter2_r'], [['adapter2_r', 'single_adapter2', '-']]], 29 | [['adapter2_r', 'adapter1_f'], [['adapter2_r-adapter1_f', 'other', '*']]], 30 | # Full length reds 31 | [['adapter1_f', 'adapter2_f'], [['adapter1_f-adapter2_f', 'full_len', '+']]], 32 | # 3 adapters with one full length segment 33 | [['adapter2_r', 'adapter1_r', 'adapter1_f'], 34 | [['adapter2_r-adapter1_r', 'full_len', '-']]], 35 | # Mutiple subreads in a read 36 | [['adapter1_f', 'adapter2_f', 'adapter2_r', 'adapter1_r'], 37 | [ 38 | ['adapter1_f-adapter2_f', 'full_len', '+'], 39 | ['adapter2_r-adapter1_r', 'full_len', '-'], 40 | ]], 41 | ] 42 | ) 43 | def test_call_vsearch(adapters, expected_results, segment): 44 | """Test call_vsearch running and parsing. 45 | 46 | This is the main function of the script that calls a bunch of other functions. 47 | """ 48 | id_ = 'read_1' 49 | kits = ['3prime', '5prime', 'multiome'] 50 | 51 | for kit in kits: 52 | # Build the read 53 | adapter_seqs = [] 54 | for a in adapters: 55 | # Get name and orientation from eg: adapter2_r 56 | adapter_name, ori = a.split('_') 57 | adap = kit_adapters[kit][adapter_name] 58 | if ori == 'r': 59 | adap = adap[::-1].translate(complement_trans) 60 | adapter_seqs.append(adap) 61 | 62 | seq = segment.join(adapter_seqs) + segment 63 | 64 | fastq = ( 65 | f"@{id_}\n" 66 | f"{seq}\n" 67 | "+\n" 68 | f"{'<' * len(seq)}") 69 | 70 | fastq_file = tempfile.NamedTemporaryFile(suffix='.fq') 71 | with open(fastq_file.name, 'w') as fh: 72 | fh.write(fastq) 73 | 74 | adapter_fasta = 'adapter_seqs.fasta' 75 | write_adapters_fasta( 76 | kit_adapters[kit]['adapter1'], kit_adapters[kit]['adapter2'], 77 | adapter_fasta) 78 | 79 | vsearch_results = tempfile.NamedTemporaryFile(suffix='.fq') 80 | call_vsearch( 81 | Path(fastq_file.name), Path(vsearch_results.name), 0.7, adapter_fasta, 4) 82 | parsed_results = parse_vsearch(vsearch_results.name) 83 | 84 | # Each result can contain 0 or more subreads - 85 | # segments with consecutive pairs of compatible adapters. 
86 | parsed_results = iter(parsed_results[id_]) 87 | for exp_result in expected_results: 88 | subread_result = next(parsed_results) 89 | 90 | assert subread_result['adapter_config'] == exp_result[0] 91 | assert subread_result['lab'] == exp_result[1] 92 | assert subread_result['orig_strand'] == exp_result[2] 93 | 94 | 95 | def test_write_stranded_fastq(): 96 | """Test that the correct stranded and trimmed fastq files are being written.""" 97 | # Build a dummy fastq file containing a single read with two subreads. 98 | 99 | # This is a 3prime read 100 | seq = 't' * 10 + 'A' * 100 + 't' * 10 + 'G' * 200 101 | fastq = ( 102 | f"@read_1\n" 103 | f"{seq}\n" 104 | "+\n" 105 | f"{'<' * len(seq)}") 106 | 107 | # This config defines one read containing two subreads. 108 | config = { 109 | 'read_1': [ 110 | { 111 | 'readlen': 100, 'read_id': 'read_1_0', 'start': 10, 112 | 'end': 110, 113 | 'fl': True, 'stranded': True, 'orig_strand': '+', 114 | 'orig_adapter_config': 115 | 'adapter1_f-adapter2_f-adapter2_r-adapter1_r', 116 | 'adapter_config': 'adapter1_f-adapter2_f', 117 | 'lab': 'full_len'}, 118 | { 119 | 'readlen': 200, 'read_id': 'read_1_1', 'start': 120, 120 | 'end': 320, 121 | 'fl': True, 'stranded': True, 'orig_strand': '-', 122 | 'orig_adapter_config': 123 | 'adapter1_f-adapter2_f-adapter2_r-adapter1_r', 124 | 'adapter_config': 'adapter1_f-adapter2_f', 125 | 'lab': 'full_len'} 126 | ] 127 | } 128 | 129 | temp_fq = tempfile.NamedTemporaryFile(suffix='.fq') 130 | 131 | temp_fq_out = tempfile.NamedTemporaryFile(mode='wt', suffix='.fq') 132 | 133 | with open(temp_fq.name, 'w') as fh: 134 | fh.write(fastq) 135 | 136 | data = create_stranded_reads(temp_fq.name, config, '3prime', fl_only=True) 137 | for read in data: 138 | temp_fq_out.write(read) 139 | temp_fq_out.flush() 140 | 141 | results = [] 142 | with pysam.FastxFile(temp_fq_out.name) as fh_res: 143 | for entry in fh_res: 144 | results.append(entry) 145 | assert len(results) == 2 146 | assert len(results[0].sequence) == 100 147 | # Subread 0 should have been reverse complemented and be all 'T' 148 | assert set(results[0].sequence) == {'T'} 149 | assert len(results[1].sequence) == 200 150 | # Subread 1 should remain unchanged 151 | assert set(results[1].sequence) == {'G'} 152 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_assign_barcodes.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | from collections import Counter 3 | 4 | import pandas as pd 5 | import pytest 6 | from workflow_glue.assign_barcodes import ( 7 | determine_barcode, process_records 8 | ) 9 | 10 | 11 | @pytest.fixture 12 | def allowed_barcodes(): 13 | """Make a small barcode whitelist.""" 14 | return set( 15 | ('AAAAAAAAAAAAAAAA', 16 | 'ttAAAAAAAAAAAAAA', 17 | 'ttttAAAAAAAAAAAA', 18 | 'AAAAAAAAAAAAAccc', 19 | 'AAAggggAAAAAAAAA')) 20 | 21 | 22 | def test_calc_ed_with_allowed_barcodes(allowed_barcodes): 23 | """Test edit distance calculation.""" 24 | bc1 = 'AAAAAAAAAAAAAAAA' 25 | bc_match = determine_barcode( 26 | bc1, list(allowed_barcodes), allowed_barcodes, 2, 2, Counter()) 27 | assert bc_match == 'AAAAAAAAAAAAAAAA' 28 | 29 | # An uncorrected BC with a nearest match ED of 7 (cutoff = 6) 30 | # return no match 31 | bc2 = 'AAAAAAAAAggggggg' 32 | bc_match = determine_barcode( 33 | bc2, list(allowed_barcodes), allowed_barcodes, 2, 2, Counter()) 34 | assert bc_match == '-' 35 | 36 | 37 | @pytest.mark.parametrize("use_kmer_index", [False, True]) 38 | 
def test_process_records(tmp_path, allowed_barcodes, use_kmer_index): 39 | """Test process_records. 40 | 41 | Check if barcodes are corrected and enumerated appropriately. 42 | """ 43 | # Build some uncorrected barcodes. 44 | # The columns used in this test are read_id and CR (uncorrected barcode). The other 45 | # Columns can be any value for now 46 | header = ('read_id', 'CR', 'CY', 'UR', 'UY', 'chr', 'start', 'end', 'mapq') 47 | rows = [ 48 | # 100% match to whitelist 49 | ('read1', 'AAAAAAAAAAAAAAAA', 'qual', 'umi', 'qual' 'chr', 0, 100, 20), 50 | # This should be corrected to AAAAAAAAAAAAAAAA 51 | ('read2', 'AAAAcAAAcAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 52 | # Not corrected due to multiple hits to whitelist. 53 | # bc_match_ed <= max_ed but next_match_diff < min_ed_diff 54 | ('read3', 'tAAAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 55 | ('read4', 'AtAAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 56 | # No matches to whitelist 57 | ('read5', 'GGGGGGGGGGGGGGGG', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 58 | ('read6', 'GCGCGCGCGCGCGCGC', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 59 | # Not corrected. A hit will have been found in the initial rapidfuzz search but 60 | # none have an ED <= max_ed (2). 61 | ('read7', 'cccAAAAAAAAAAAAA', 'qual', 'umi', 'qual', 'chr', 0, 100, 20), 62 | ] 63 | 64 | tags = ( 65 | pd.DataFrame(rows, columns=header) 66 | .set_index('read_id', drop=True) 67 | .assign(SA='True')) # Add constant SA column (not used in the tested code) 68 | tags_file = tmp_path / 'tags.tsv' 69 | tags.to_csv(tags_file.name, sep='\t') 70 | tags_output = tmp_path / 'tags_out.tsv' 71 | 72 | max_ed = 2 73 | min_ed_diff = 2 74 | barcode_counter, reasons_counter = process_records( 75 | tags_file.name, allowed_barcodes, 76 | max_ed, min_ed_diff, tags_output.name, 77 | use_kmer_index=use_kmer_index) 78 | 79 | result_tags_df = pd.read_csv(tags_output.name, sep='\t', index_col=0) 80 | 81 | # Just the single corrected barcode should be present: AAAAAAAAAAAAAAAA 82 | assert len(barcode_counter) == 1 83 | assert barcode_counter['AAAAAAAAAAAAAAAA'] == 2 84 | 85 | assert result_tags_df.loc['read1', 'CB'] == 'AAAAAAAAAAAAAAAA' 86 | assert result_tags_df.loc['read2', 'CB'] == 'AAAAAAAAAAAAAAAA' 87 | # Reads without a corrected barcode should not be present in the output 88 | assert 'read3' not in result_tags_df.index 89 | assert 'read4' not in result_tags_df.index 90 | assert 'read5' not in result_tags_df.index 91 | assert 'read6' not in result_tags_df.index 92 | 93 | assert dict(reasons_counter) == \ 94 | { 95 | 'bc_shortlist_exact_match': 1, # Read1 96 | 'bc_corrected': 1, # Read2 97 | 'bc_no_shortlist_match': 3, # Read5, Read6, Read7 98 | 'bc_shortlist_multiple_hits': 2 # Read3, Read4 99 | } 100 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_assign_features.py: -------------------------------------------------------------------------------- 1 | """Test assign_barcodes.""" 2 | import subprocess 3 | import tempfile 4 | 5 | import pandas as pd 6 | from workflow_glue.assign_features import ( 7 | main 8 | ) 9 | 10 | 11 | def getbam(): 12 | """Create a synthetic bam file containing adapter and barcode sequences.""" 13 | # two refrence transcripts in the BAM 14 | # One is 70 and one is 100bp long 15 | header = ( 16 | "@SQ SN:ST001 LN:2000\n" 17 | "@SQ SN:ST002 LN:1000\n") 18 | 19 | # Define the transcripts. 
Normally we wouldn't have transcripts this small as 20 | # they would not pass the size threshold in the stringtie step, but they work for 21 | # our purposes here 22 | 23 |     alns = [ 24 |         # Uniquely-mapped read 25 |         ['read_1', 200, 100, 500, 60, 'ST001', '500M'], 26 |         # read_2 mapped to two locations. ST002 should be assigned to read 2 27 |         # as it has a higher AS and read and transcript cov > 0.4 28 |         ['read_2', 150, 100, 400, 1, 'ST001', '100H400M'], 29 |         ['read_2', 200, 500, 500, 60, 'ST002', '500M'], 30 |         # read_3 maps to two locations. The second alignment has a higher AS score 31 |         # but will not be assigned as reference coverage is < 0.4 32 |         ['read_3', 150, 500, 100, 1, 'ST001', '50M400H50M'], 33 |         ['read_3', 200, 500, 150, 60, 'ST001', '200H150M200H'] 34 |     ] 35 | 36 |     sam = header 37 |     for align in alns: 38 |         qname, a_score, start, seqlen, mapq, rname, cigar = align 39 |         # Make a sam file containing the read and a quality qscore of 60. 40 |         sam += ( 41 |             f"{qname}\t0\t{rname}\t{start}\t{mapq}\t{cigar}\t*\t0\t0\t" 42 |             f"{'A' * seqlen}\t{'?' * seqlen}\tAS:i:{a_score}\n" 43 |         ) 44 | 45 |     # Write out a test BAM 46 |     with tempfile.NamedTemporaryFile( 47 |             mode='w', suffix='.sam', delete=False) as fh_sam: 48 |         fh_sam.write(sam) 49 |         sam_file = fh_sam.name 50 | 51 |     bam = 'align.bam' 52 |     subprocess.check_output(['samtools', 'view', sam_file, '-o', 'align.bam']) 53 | 54 |     return bam 55 | 56 | 57 | def test_main(): 58 |     """Test main.""" 59 |     # gffcompare tmap dataframe. Maps stringtie transcripts (qry_id) 60 |     # to reference transcripts and reference gene IDs 61 |     df_gffcompare_tmap_rows = ( 62 |         ('ST001', 'ref_tr_1', 'gene_id_1', '='), 63 |         ('ST002', 'ref_tr_2', 'gene_id_2', '='), 64 |         ('ST003', 'ref_tr_3', 'gene_id_3', '=') 65 |     ) 66 |     df_gffcompare_tmap = pd.DataFrame( 67 |         df_gffcompare_tmap_rows, columns=[ 68 |             ['qry_id', 'ref_id', 'ref_gene_id', 'class_code']] 69 |     ) 70 |     gffcompare_file = tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv').name 71 |     df_gffcompare_tmap.to_csv(gffcompare_file, sep='\t', index=None) 72 | 73 |     # All we want from tags is the mapq alignment score 74 |     df_tags_rows = ( 75 |         ('read_1', '60'), 76 |         ('read_2', '60'), 77 |         ('read_3', '30'), 78 |     ) 79 |     df_tags = pd.DataFrame( 80 |         df_tags_rows, columns=['read_id', 'mapq'] 81 |     ) 82 |     tags_file = tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv').name 83 |     df_tags.to_csv(tags_file, index=None, sep='\t') 84 | 85 |     # GTF file is used for mapping transcript id (from the gffcompare tmap file) to 86 |     # gene name. 87 |     # Here is just a subset of the gtf. We need 'transcript' in pos [2].
gene_name and 88 | # transcript_id are grepped 89 | gtf_str = ( 90 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_1";transcript_id "ref_tr_1";\n', 91 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_2";transcript_id "ref_tr_2";\n', 92 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_3";transcript_id "ref_tr_3";\n', 93 | 'chr1\tHAVANA\ttranscript\tgene_name "gene_name_4";transcript_id "ref_tr_4";', 94 | ) 95 | 96 | with tempfile.NamedTemporaryFile('w', delete=False, suffix='.tsv') as fh: 97 | fh.writelines(gtf_str) 98 | gtf_file = fh.name 99 | 100 | class Args: 101 | transcriptome_bam = getbam() 102 | gffcompare_tmap = gffcompare_file 103 | tags = tags_file 104 | gtf = gtf_file 105 | output = tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False).name 106 | min_mapq = 30 107 | min_tr_coverage = 0.4 108 | min_read_coverage = 0.4 109 | chunksize = 1 110 | 111 | args = Args() 112 | main(args) 113 | 114 | result = pd.read_csv(args.output, sep='\t', index_col=0) 115 | 116 | assert result.at['read_1', 'gene'] == 'gene_name_1' 117 | assert result.at['read_1', 'transcript'] == 'ref_tr_1' 118 | 119 | assert result.at['read_2', 'gene'] == 'gene_name_2' 120 | assert result.at['read_2', 'transcript'] == 'ref_tr_2' 121 | 122 | assert result.at['read_3', 'gene'] == 'gene_name_1' 123 | assert result.at['read_3', 'transcript'] == '-' 124 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_calc_saturation.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | from unittest.mock import Mock 3 | 4 | import polars as pl 5 | from workflow_glue.calc_saturation import ( 6 | downsample_dataframe, run_jobs 7 | ) 8 | 9 | 10 | def test_run_jobs(tmp_path): 11 | """Test_downsample_reads. 12 | 13 | Check for the correct number of downsampled dataframes are returned, each with 14 | the correct size. 15 | """ 16 | args = Mock() 17 | args.read_tags = tmp_path / 'read_tags.tsv' 18 | args.output = tmp_path / 'output.tsv' 19 | args.threads = 2 20 | args.sample = 'test' 21 | 22 | # Create df with 1000 rows of fake data. 23 | with open(args.read_tags, 'w') as fh: 24 | fh.write('read_id\tcorrected_barcode\tcorrected_umi\tgene\n') 25 | row = 'id\tagtcgatcgatcgta\tatcgtacaatct\tYFG' 26 | for i in range(1000): 27 | fh.write(f'{row}\n') 28 | 29 | run_jobs(args) 30 | result = pl.read_csv(source=args.output, separator='\t') 31 | 32 | # Simply check correct number of results are returned 33 | # and that the downsampled reads are the correct size. 34 | assert len(result) == 16 35 | for row in result.iter_rows(named=True): 36 | assert row['downsamp_reads'] == 1000 * row['downsamp_frac'] 37 | 38 | 39 | def test_downsample_dataframe(): 40 | """Test calc_saturation.""" 41 | header = ['barcode', 'umi', 'gene'] 42 | 43 | rows = ( 44 | # Cell 1: 4 reads, 2 umis with two reads each, 2 genes. 45 | ('AGATAGATAGATAGAT', 'ATAGATAGATAG', 'YFG1'), 46 | ('AGATAGATAGATAGAT', 'ATAGATAGATAG', 'YFG1'), 47 | ('AGATAGATAGATAGAT', 'ccccATAGATAG', 'YFG2'), 48 | ('AGATAGATAGATAGAT', 'ccccATAGATAG', 'YFG2'), 49 | 50 | # Cell 2: 4 reads, 3 umis, 3 genes. 
51 | ('TATATATATATATATA', 'TACTACTACTAC', 'YFG3'), 52 | ('TATATATATATATATA', 'CACTACTACTCA', 'YFG4'), 53 | ('TATATATATATATATA', 'CACTACTACTCA', 'YFG4'), 54 | ('TATATATATATATATA', 'GACGACGACGAC', 'YFG5') 55 | ) 56 | 57 | df = pl.from_records( 58 | data=rows, schema=header) 59 | 60 | ( 61 | label, 62 | n_reads, 63 | reads_per_cell, 64 | genes_per_cell, 65 | umis_per_cell, 66 | umi_saturation 67 | ) = downsample_dataframe(df, 1.0) 68 | 69 | assert n_reads == 8 70 | assert reads_per_cell == 4 71 | assert genes_per_cell == 2.5 72 | assert umis_per_cell == 2.5 73 | 74 | unique_umis = 5 75 | n_reads = 8 76 | assert umi_saturation == 1 - (unique_umis / n_reads) 77 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_cluster_umis.py: -------------------------------------------------------------------------------- 1 | """Test cluster barcodes.""" 2 | import pandas as pd 3 | import pytest 4 | from workflow_glue.create_matrix import cluster_dataframe 5 | 6 | 7 | @pytest.fixture() 8 | def umi_gene_df(): 9 | """Make read tag and feature assignment DataFrames.""" 10 | # Define 3 clusters of UMIs. 11 | # each entry contains (UMI, gene name and number of UMIs 12 | clusters = [ 13 | 14 | # Cluster1 - 3 UMIS ################### 15 | # 'true' UMI 16 | ('AAAAAAAAAAAA', 'YFG1', 20), # umi1 17 | # ED to umi1 = 2. n_true > (n_umi2 * 2) - 1 18 | ('ttAAAAAAAAAA', 'YFG1', 10), # umi2 19 | 20 | # Cluster2 - single UMI ############### 21 | # ED to umi1 = 2, n_true < (n_umi2 * 2) - 1 22 | ('ggAAAAAAAAAA', 'YFG1', 15), # umi3 23 | 24 | # Cluster3 - single UMI ############### 25 | # ED to umi1 = 3, n_true > (n_umi2 * 2) - 1 26 | ('AAAAAAAAAggg', 'YFG1', 10), # umi4 27 | 28 | # Cluster4 - single UMI ############### 29 | # ED to umi1 = 1, n_true > (n_umi2 * 2) - 1, 30 | # but has a diffrent gene assignment to UMI 1 31 | ('cAAAAAAAAAAA', 'YFG2', 10), # umi5 32 | 33 | # Cluster5 - single UMI with insertion #### 34 | # Should not be returned due to incorrect UMI length 35 | ('AAAAAAAAAAAAtttt', 'YFG2', 1), # no umi 36 | 37 | ] 38 | 39 | # The actual dataframes used in the workflow will contain more columns, 40 | # but they are not used in the clustering process so are omitted for clarity. 
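    # Worked example of the count rule quoted in the comments above: umi2
    # ('ttAAAAAAAAAA', 10 reads) is collapsed into umi1 ('AAAAAAAAAAAA', 20
    # reads) because its edit distance to umi1 (2) is within the clustering
    # threshold and 20 > (10 * 2) - 1 = 19, whereas umi3 (15 reads) stays a
    # separate cluster because 20 > (15 * 2) - 1 = 29 is not satisfied.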
41 | # CB is required, but can be any non '-' string 42 | header = ('read_id', 'UR', 'gene', 'CB') 43 | records = [] 44 | read_num = 0 45 | 46 | for umi, gene, n_molecules in clusters: 47 | for _ in range(n_molecules): 48 | records.append((f'read_{read_num}', umi, gene, 'CB')) 49 | read_num += 1 50 | 51 | df = pd.DataFrame( 52 | records, columns=header).set_index( 53 | 'read_id', drop=True) 54 | 55 | return df 56 | 57 | 58 | def test_process_records(umi_gene_df): 59 | """Check that process_records is clustering and correcting UMIs appropriately.""" 60 | result = cluster_dataframe(umi_gene_df, 1000, umi_length=12) 61 | 62 | assert 'UB' in result 63 | # Check for the correct number of clusters 64 | assert result['UB'].nunique() == 4 65 | 66 | # Check that UMI2 is corrected to the 'true' UMI of cluster1 67 | assert all( 68 | result.loc[ 69 | result['UR'] == 'ttAAAAAAAAAA'].loc[:, 'UB'] == 'AAAAAAAAAAAA') 70 | 71 | # Check that the read with a UMI insertion is set to '-' 72 | assert 'AAAAAAAAAAAAtttt' not in result.UB 73 | 74 | # Check that the rest of the UMIs map back to themselves 75 | # as they are all single-UMI clusters 76 | df_no_clust1 = result[~result.UR.isin( 77 | ['AAAAAAAAAAAA', 'ttAAAAAAAAAA', 'AAAAAAAAAAAAtttt'])] 78 | assert all(df_no_clust1.UR == df_no_clust1.UB) 79 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_expression_matrix.py: -------------------------------------------------------------------------------- 1 | """Test expression matrix construction.""" 2 | import os 3 | from pathlib import Path 4 | import tempfile 5 | from unittest.mock import Mock 6 | 7 | import h5py 8 | import numpy as np 9 | import pandas as pd 10 | import pytest 11 | from workflow_glue.expression_matrix import ExpressionMatrix 12 | from workflow_glue.process_matrix import main 13 | 14 | 15 | @pytest.fixture() 16 | def tags_df(): 17 | """Make read tag dataframe.""" 18 | # Create a DataFrame with two cells, and genes 19 | # cell TTT as only one gene (g1) and one UMI (TTT) 20 | # cell AAA has 3 genes 21 | # g1 has a single UMI (TTT) 22 | # g2 has a single UMI (AAA) 23 | # g3 has two UMIS (CCC, GGG) 24 | df = pd.DataFrame({ 25 | 'gene': ['g1', 'g2', 'g3', 'g1', 'g2', 'g3'], 26 | 'corrected_barcode': ['TTT', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA'], 27 | 'corrected_umi': ['CAT', 'AAA', 'CCC', 'TTT', 'AAA', 'GGG'] 28 | }) 29 | 30 | expected_raw_result = pd.DataFrame({ 31 | 'gene': ['g1', 'g2', 'g3'], 32 | 'AAA': [1, 1, 2], 33 | 'TTT': [1, 0, 0], 34 | }) 35 | 36 | # With a min cells per gene of two, would exclude g1 and g2 37 | # TODO add more tests 38 | 39 | # Multiply by norm count, divide by total cell counts, 40 | # np.log1p transform, then divide by log(10) to get back to base 10 41 | expected_processed_result = pd.DataFrame({ 42 | 'gene': ['g1'], 43 | 'AAA': [1.04139], 44 | 'TTT': [1.04139], # np.log1p((1 * 10) / 1) / np.log(10) 45 | }) 46 | 47 | return df, expected_raw_result, expected_processed_result 48 | 49 | 50 | def test_empty_matrix(): 51 | """Test instantiating ExpressionMatrix with empty data.""" 52 | em = ExpressionMatrix( 53 | matrix=np.ndarray(shape=(0, 2)), 54 | features=np.array([], dtype=bytes), 55 | cells=np.array([], dtype=bytes) 56 | ) 57 | 58 | assert em.matrix.shape == (0, 2) 59 | assert em.features.shape == (0,) 60 | assert em.cells.shape == (0,) 61 | 62 | 63 | @pytest.fixture 64 | def empty_matrix(): 65 | """Create an empty ExpressionMatrix.""" 66 | matrix = ExpressionMatrix( 67 | matrix=np.ndarray(shape=(0, 2)), 68 | 
features=np.array([]), 69 | cells=np.array([]) 70 | ) 71 | return matrix 72 | 73 | 74 | @pytest.fixture 75 | def small_matrix(): 76 | """Create a 2x2 ExpressionMatrix.""" 77 | matrix = ExpressionMatrix( 78 | matrix=np.array([2, 2, 2, 2]).reshape((2, 2)), 79 | features=np.array(['gene1', 'gene2'], dtype=bytes), 80 | cells=np.array(['cell1', 'cell2'], dtype=bytes) 81 | ) 82 | return matrix 83 | 84 | 85 | def test_normalize_empty_matrix(empty_matrix): 86 | """Test normalizing with empty ExpressionMatrix.""" 87 | empty_matrix.normalize(10000) 88 | 89 | 90 | def test_remove_cells_empty_matrix(empty_matrix): 91 | """Test cell filtering with empty ExpressionMatrix.""" 92 | with pytest.raises( 93 | ValueError, 94 | match="Matrix is zero-sized on entry to `remove_cells`." 95 | ): 96 | empty_matrix.remove_cells(0) 97 | 98 | 99 | def test_remove_cells_becomes_empty(small_matrix): 100 | """Test cell filtering with small matrix made empty by filtering.""" 101 | with pytest.raises( 102 | ValueError, 103 | match="All cells would be removed, try altering filter thresholds." 104 | ): 105 | small_matrix.remove_cells(3) 106 | 107 | 108 | def test_remove_features_empty_matrix(empty_matrix): 109 | """Test feature filtering with empty ExpressionMatrix.""" 110 | with pytest.raises( 111 | ValueError, 112 | match="Matrix is zero-sized on entry to `remove_features`." 113 | ): 114 | empty_matrix.remove_features(0) 115 | 116 | 117 | def test_remove_features_becomes_empty(small_matrix): 118 | """Test fature filtering with small matrix made empty by filtering.""" 119 | with pytest.raises( 120 | ValueError, 121 | match="All features would be removed, try altering filter thresholds." 122 | ): 123 | small_matrix.remove_features(3) 124 | 125 | 126 | def test_remove_cells_and_features_empty_matrix(empty_matrix): 127 | """Test cell and feature filtering with empty ExpressionMatrix.""" 128 | with pytest.raises( 129 | ValueError, 130 | match="Matrix is zero-sized on entry to `remove_cells_and_features`." 131 | ): 132 | empty_matrix.remove_cells_and_features(0, 0) 133 | 134 | 135 | def test_remove_cells_and_features_becomes_empty(small_matrix): 136 | """Test cell and feature filtering made empty by filtering.""" 137 | with pytest.raises( 138 | ValueError, 139 | match="All features would be removed, try altering filter thresholds." 140 | ): 141 | small_matrix.remove_cells_and_features(3, 3) 142 | 143 | 144 | def test_remove_skewed_cells_empty_matrix(empty_matrix): 145 | """Test skewed cell filtering with empty ExpressionMatrix.""" 146 | with pytest.raises( 147 | ValueError, 148 | match="Matrix is zero-sized on entry to `remove_skewed_cells`." 149 | ): 150 | empty_matrix.remove_skewed_cells(0, ['gene']) 151 | 152 | 153 | def test_remove_skewed_cells_becomes_empty(small_matrix): 154 | """Test skewed cell filtering with empty ExpressionMatrix.""" 155 | with pytest.raises( 156 | ValueError, 157 | match="All cells would be removed, try altering filter thresholds." 158 | ): 159 | small_matrix.remove_skewed_cells(0.05, ['gene']) 160 | 161 | 162 | def test_remove_unknown_empty_matrix(empty_matrix): 163 | """Test unknown feature filtering with empty ExpressionMatrix.""" 164 | with pytest.raises( 165 | ValueError, 166 | match="Matrix is zero-sized on entry to `remove_unknown`." 
167 | ): 168 | empty_matrix.remove_unknown('-') 169 | 170 | 171 | def test_remove_unknown_becomes_empty(): 172 | """Test unknown filtering made empty by filtering.""" 173 | matrix = ExpressionMatrix( 174 | matrix=np.array([2, 2]).reshape((1, 2)), 175 | features=np.array(['-'], dtype=bytes), 176 | cells=np.array(['cell1', 'cell2'], dtype=bytes) 177 | ) 178 | with pytest.raises( 179 | ValueError, 180 | match="All features would be removed, try altering filter thresholds." 181 | ): 182 | matrix.remove_unknown('-') 183 | 184 | 185 | def test_aggregate_hdfs(): 186 | """Test the creation of an ExpressionMatrix from multiple HDF inputs.""" 187 | hdf1 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 188 | with h5py.File(hdf1.name, 'w') as fh1: 189 | fh1['cells'] = ['cell1', 'cell2'] 190 | fh1['features'] = ['f1', 'f2'] 191 | fh1['matrix'] = np.array([ 192 | [1, 2], [3, 4] 193 | ]).reshape((2, 2)) 194 | 195 | hdf2 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 196 | with h5py.File(hdf2.name, 'w') as fh2: 197 | fh2['cells'] = ['cell3', 'cell1'] 198 | fh2['features'] = ['f2', 'f1'] 199 | fh2['matrix'] = np.array([ 200 | [1, 2], [2, 4] 201 | ]).reshape((2, 2)) 202 | 203 | hdf3 = tempfile.NamedTemporaryFile(suffix='.hdf5', mode='w') 204 | with h5py.File(hdf3.name, 'w') as fh3: 205 | fh3['cells'] = [] 206 | fh3['features'] = [] 207 | fh3['matrix'] = np.ndarray(shape=(0, 0)) 208 | 209 | em = ExpressionMatrix.aggregate_hdfs((hdf1.name, hdf2.name, hdf3.name)) 210 | 211 | np.testing.assert_array_equal(em.tcells, np.array(['cell1', 'cell2', 'cell3'])) 212 | np.testing.assert_array_equal(em.tfeatures, np.array(['f1', 'f2'])) 213 | np.testing.assert_array_equal( 214 | em.matrix, np.array([[5, 2, 2], [5, 4, 1]]) 215 | ) 216 | 217 | 218 | def test_main(tags_df): 219 | """Test the main function. 220 | 221 | :param tags_df: fixture with input test file and expected result pd.DataFrame. 
222 | :return: 223 | """ 224 | tags_df, expected_raw_result, expected_processed_result = tags_df 225 | with tempfile.TemporaryDirectory() as fh: 226 | tmp_test_dir = Path(fh) 227 | os.chdir(tmp_test_dir) 228 | tags_df.to_csv('tags.tsv', sep='\t') 229 | 230 | args = Mock() 231 | args.input = ["tags.tsv"] 232 | args.feature = 'gene' 233 | args.raw = 'raw.tsv' 234 | args.per_cell_mito = 'per_cell_mito.tsv' 235 | args.per_cell_expr = 'per_cell_expr.tsv' 236 | args.filtered_mex = 'filtered_mex.tsv' 237 | args.min_features = 1 238 | args.min_cells = 2 239 | args.max_mito = 5 240 | args.mito_prefixes = 'MT-' 241 | args.norm_count = 10 242 | args.stats = 'stats.tsv' 243 | args.processed = 'processed.tsv' 244 | args.enable_filtering = True 245 | args.text = True 246 | args.enable_umap = False 247 | args.pcn = None 248 | 249 | main(args) 250 | 251 | counts_result_df = pd.read_csv(args.raw, sep='\t', index_col=None) 252 | pd.testing.assert_frame_equal( 253 | expected_raw_result, counts_result_df, check_like=True, check_dtype=False) 254 | 255 | procs_result_df = pd.read_csv(args.processed, sep='\t', index_col=None) 256 | pd.testing.assert_frame_equal( 257 | expected_processed_result, 258 | procs_result_df, check_like=True, check_dtype=False) 259 | 260 | mito_results_df = pd.read_csv(args.per_cell_mito, sep='\t', index_col=None) 261 | assert "CB" in mito_results_df.columns 262 | assert "mito_pct" in mito_results_df.columns 263 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_extract_barcode.py: -------------------------------------------------------------------------------- 1 | """Test adapter_scan_vsearch.""" 2 | import functools 3 | import os 4 | import tempfile 5 | from unittest.mock import Mock 6 | 7 | import pandas as pd 8 | import pytest 9 | from pytest import fixture 10 | from workflow_glue import extract_barcode 11 | from ..sc_util import rev_cmp # noqa: ABS101 12 | 13 | # prevent stdout writing in align_adapter even with `pytest -s` 14 | devnull = open(os.devnull, 'w') 15 | extract_barcode.align_adapter = functools.partial( 16 | extract_barcode.align_adapter, fastq_out=devnull) 17 | 18 | 19 | def gene(): 20 | """Get a randomly-generated gene seuence.""" 21 | return ( 22 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 23 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 24 | 25 | 26 | def make_fastq( 27 | read_adapter1='CTACACGACGCTCTTCCGATCT', 28 | read_barcode='AAACCCAAGAAACACT', 29 | read_umi='GACTGACTGACT', 30 | read_polyt='T'*12, 31 | rev=True): 32 | """Create a synthetic fastq file containing adapter and barcode sequences.""" 33 | read = \ 34 | f'{read_adapter1}{read_barcode}{read_umi}{read_polyt}{gene()}' 35 | if rev: 36 | read = rev_cmp(read) 37 | 38 | # Make a FASTQ file containing the read and a quality score of 60. 39 | fastq = ( 40 | '@test_id\n' 41 | f"{read}\n" 42 | '+\n' 43 | f"{'?' 
* len(read)}\n" 44 | ) 45 | 46 | # Write out a test FASTQ 47 | with tempfile.NamedTemporaryFile( 48 | mode='w', suffix='.fastq', delete=False) as fh_fq: 49 | fh_fq.write(fastq) 50 | fq_fn = fh_fq.name 51 | 52 | return fq_fn 53 | 54 | 55 | @fixture 56 | def make_superlist(): 57 | """Make a small superlist(whitelist) of barcodes.""" 58 | superlist = ( 59 | "AAACCCAAGAAACACT\n" 60 | "AAACCCAAGAAACCAT") 61 | 62 | with tempfile.NamedTemporaryFile( 63 | mode='w', suffix='.tsv', delete=False) as fh_sl: 64 | fh_sl.write(superlist) 65 | superlist_fname = fh_sl.name 66 | return superlist_fname 67 | 68 | 69 | @fixture() 70 | def args(make_superlist): 71 | """Mock Args with workflow defaults set.""" 72 | class Args: 73 | contig = 'chr17' 74 | fastq = make_fastq() 75 | match = 5 76 | mismatch = -1 77 | acg_to_n_match = 1 78 | t_to_n_match = 1 79 | adapter1_seq = 'CTACACGACGCTCTTCCGATCT' 80 | adapter1_suff_length = 10 81 | kit = '3prime' 82 | barcode_length = 16 83 | umi_length = 12 84 | window = 100 85 | gap_open = 2 86 | gap_extend = 4 87 | max_adapter1_ed = 3 88 | min_barcode_qv = 15 89 | polyt_length = 10 90 | superlist = make_superlist 91 | verbosity = 2 92 | 93 | return Args 94 | 95 | 96 | def test_main(args): 97 | """Test the final output from main().""" 98 | # Make temp files to store the output 99 | counts_file = tempfile.NamedTemporaryFile( 100 | mode='w', suffix='.tsv') 101 | tags_file = tempfile.NamedTemporaryFile( 102 | mode='w', suffix='.tsv') 103 | trimmed_fastq_file = tempfile.NamedTemporaryFile( 104 | mode='w', suffix='.tsv') 105 | 106 | args.fastq = make_fastq(read_adapter1=args.adapter1_seq, rev=True) 107 | args.output_barcode_counts = counts_file.name 108 | args.output_read_tags = tags_file.name 109 | args.output_trimmed_fastq = trimmed_fastq_file.name 110 | extract_barcode.main(args) 111 | 112 | # Barcode we expect to find in the input BAM 113 | # For 3prime this will be reverse 114 | expected_barcode = 'AAACCCAAGAAACACT' 115 | 116 | counts_result = pd.read_csv(counts_file.name, sep='\t') 117 | assert counts_result.shape == (1, 2) 118 | assert counts_result.iat[0, 0] == expected_barcode 119 | assert counts_result.iat[0, 1] == 1 120 | 121 | tags_result = pd.read_csv(tags_file.name, sep='\t', index_col=0) 122 | 123 | # Check we have correct number of rows returned # read_id(index), CR, CY, UR, UY 124 | assert tags_result.shape == (1, 4) 125 | assert tags_result.loc['test_id', 'CR'] == 'AAACCCAAGAAACACT' 126 | assert tags_result.loc['test_id', 'UR'] == 'GACTGACTGACT' 127 | 128 | # TODO: test if barcode missing from superlist 129 | 130 | 131 | @pytest.mark.parametrize( 132 | 'adapter1_seq,tags_results_shape,counts_results_shape', 133 | [ 134 | # Tags files should have 5 columns (read_id, CR, CY, UR, UY) , 135 | # counts results should have one column (count) with read_id index 136 | ['CTACACGACGCTCTTCCGATCT', (1, 5), (1, 1)], # ED 0 137 | ['CTACACGACGCTCTTCCGAggg', (1, 5), (1, 1)], # ED 3 138 | ['CTACACGACGCTCTTCCGgggg', (0, 5), (0, 1)] # ED 4; no results 139 | ] 140 | ) 141 | def test_align_adapter(args, adapter1_seq, tags_results_shape, counts_results_shape): 142 | """Test the identification of adapter1 sequences. 143 | 144 | algin_adapter() should return results with a max adapter1 edit distance, 145 | which defaults to 3. 
146 | """ 147 | tags_file = tempfile.NamedTemporaryFile( 148 | mode='w', suffix='.tsv') 149 | trimmed_fastq_file = tempfile.NamedTemporaryFile( 150 | mode='w', suffix='.tsv') 151 | args.fastq = make_fastq(read_adapter1=adapter1_seq, rev=True) 152 | args.output_read_tags = tags_file.name 153 | args.kit = '3prime' 154 | args.output_trimmed_fastq = trimmed_fastq_file.name 155 | df_counts = extract_barcode.align_adapter(args) 156 | assert df_counts.shape == counts_results_shape 157 | 158 | df_tags = pd.read_csv(tags_file.name, sep='\t') 159 | assert df_tags.shape == tags_results_shape 160 | 161 | 162 | def ascii_decode_qscores(string): 163 | """Convert ASCII character quality values into integers.""" 164 | return list(map(lambda x: ord(x) - 33, string)) 165 | 166 | 167 | def ascii_encode_qscores(integers): 168 | """Convert integer quality values into ASCII characters.""" 169 | return "".join(map(lambda x: chr(x + 33), integers)) 170 | 171 | 172 | @pytest.mark.parametrize( 173 | # 3 prime tests 174 | 'query,query_aln,query_ascii_q,expected_adapter1_ed', 175 | [ 176 | # Adapter 1 BC UMI polyT 177 | 178 | # 100% match of the query adapter1 and the 10bp adapter1 prefix in the ref probe 179 | ["CTACACGACGCTCTTCCGATCT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 180 | " CTTCCGATCT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 181 | "?????????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 182 | 0], 183 | 184 | # 2 bp substitution in the adapter1 query 185 | ["CTACACGACGCTCTTCCGATaa AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 186 | " CTTCCGATaa AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 187 | "?????????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 188 | 2], 189 | 190 | # 2 bp deletion in the adapter1 query 191 | ["CTACACGACGCTCTTCCGAT AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 192 | " CTTCCGAT-- AAACCCAAGAAACACT GACTGACTGACT TTTTTTTTTTTT", 193 | "???????????????????? ()*+,-./01234567 89:;<=>?@ABC ????????????", 194 | 2], 195 | 196 | ] 197 | ) 198 | def test_parse_probe_alignment(query, query_aln, query_ascii_q, expected_adapter1_ed): 199 | """Test_parse_probe_alignment. 200 | 201 | In this test, a mocked parasail alignment is created. We want to test that the 202 | correct barcode, UMI and associated quality scores are extracted from the 203 | query. 204 | 205 | :param query: read query 206 | :param query_aln: the query alignment that would result from parasail alignment to 207 | the reference probe 208 | :param query_ascii_q: the ascii-encoded qualitey string associated with the query 209 | :param: expected_adapter1_ed: the expected edit distance of the adapter1 210 | """ 211 | # Build a mock parasail alignment result. Although there would be geen sequrnce 212 | # after the polyT, we can omit it here. 213 | 214 | # This is the read including the full 22bp adapter1 probe 215 | # adapter1 BC UMI PolyT 216 | barcode, umi = query.split()[1:3] 217 | barcode_q, umi_q = query_ascii_q.split()[1:3] 218 | query = query.replace(' ', '') 219 | query_aln = query_aln.replace(' ', '') 220 | query_ascii_q = query_ascii_q.replace(' ', '') 221 | qual_ints = ascii_decode_qscores(query_ascii_q) 222 | 223 | # The parasail reference alignment. 
Contains only the 10 bp suffix of the adapter1 224 | # For 3prime kit 225 | ref_align = ( 226 | # 10 bp A1 Ns for BC Ns for UMI PolyT 227 | "CTTCCGATCT NNNNNNNNNNNNNNNN NNNNNNNNNNNN TTTTTTTTTTTT" 228 | ).replace(' ', '') 229 | 230 | p_alignment = Mock() 231 | p_alignment.traceback.query = query_aln 232 | p_alignment.traceback.ref = ref_align 233 | 234 | adapter1_probe_suffix = 'CTTCCGATCT' # Forward seq 235 | 236 | ( 237 | adapter1_editdist, barcode_result, umi_result, 238 | bc_qscores, umi_qscores 239 | ) = extract_barcode.parse_probe_alignment( 240 | p_alignment, adapter1_probe_suffix, 241 | 16, 12, qual_ints, query) 242 | 243 | # convert the return qscores to ascii-encoded values 244 | bc_qscores = ascii_encode_qscores(bc_qscores) 245 | umi_qscores = ascii_encode_qscores(umi_qscores) 246 | 247 | assert adapter1_editdist == expected_adapter1_ed 248 | assert barcode_result == barcode 249 | assert umi_result == umi 250 | assert bc_qscores == barcode_q 251 | assert umi_qscores == umi_q 252 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_format_ctat_output.py: -------------------------------------------------------------------------------- 1 | """Tests for the format_ctat_output module.""" 2 | 3 | from workflow_glue.format_ctat_output import load_fusion_data 4 | 5 | 6 | def test_load_fusion_data_valid_file(tmp_path): 7 | """Test load_fusion_data with a valid fusion file.""" 8 | # Create a temporary valid fusion file 9 | fusion_file = tmp_path / "fusions.tsv" 10 | fusion_file.write_text( 11 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 12 | "\tRightBreakpoint\tSpliceType\n" 13 | 14 | "Fusion1\tread1,read2\tGeneA\tGeneB\tchr1:100\tchr2:200\tSpliceA\n" 15 | "Fusion2\tread3\tGeneC\tGeneD\tchr3:300\tchr4:400\tSpliceB\n" 16 | ) 17 | 18 | fusion_dict = load_fusion_data(fusion_file) 19 | 20 | assert fusion_dict is not None 21 | assert len(fusion_dict) == 3 # 3 unique reads 22 | 23 | 24 | def test_load_fusion_data_empty_file(tmp_path): 25 | """Test load_fusion_data with an empty fusion file.""" 26 | fusion_file = tmp_path / "empty.tsv" 27 | fusion_file.write_text("") 28 | 29 | fusion_dict = load_fusion_data(fusion_file) 30 | 31 | assert fusion_dict is None 32 | 33 | 34 | def test_load_fusion_data_no_entries(tmp_path): 35 | """Test load_fusion_data with a fusion file containing no valid entries.""" 36 | # Create a fusion file with no valid entries 37 | fusion_file = tmp_path / "no_entries_fusion.tsv" 38 | fusion_file.write_text( 39 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 40 | "\tRightBreakpoint\tSpliceType\n" 41 | ) 42 | 43 | fusion_dict = load_fusion_data(fusion_file) 44 | 45 | assert fusion_dict is None 46 | 47 | 48 | def test_load_fusion_data_duplicate_reads(tmp_path): 49 | """Test load_fusion_data with duplicate read IDs.""" 50 | # Create a fusion file with duplicate read IDs 51 | fusion_file = tmp_path / "duplicate_reads_fusion.tsv" 52 | fusion_file.write_text( 53 | "#FusionName\tLR_accessions\tLeftGene\tRightGene\tLeftBreakpoint" 54 | "\tRightBreakpoint\tSpliceType\n" 55 | 56 | "Fusion1\tread1,read1\tGeneA\tGeneB\tchr1:100\tchr2:200\tSpliceA\n" 57 | ) 58 | 59 | fusion_dict = load_fusion_data(fusion_file) 60 | 61 | assert fusion_dict is not None 62 | assert len(fusion_dict) == 1 # Only 1 unique read 63 | assert len(fusion_dict["read1"]) == 2 # 2 entries for the same read ID 64 | -------------------------------------------------------------------------------- 
/bin/workflow_glue/tests/test_tag_bam.py: -------------------------------------------------------------------------------- 1 | """Test tag_bam.py".""" 2 | import subprocess as sub 3 | 4 | import pandas as pd 5 | import pysam 6 | import pytest 7 | from workflow_glue import tag_bam 8 | 9 | 10 | def make_bam(tmp_path, bam_entries): 11 | """Make a BAM file.""" 12 | read = ( 13 | "ATTCAGCGCTGGAGACCGAGCGCCCCGCAAAGGGCCTGATCT" 14 | "ATCGCGCACGGGACTACTCATTGGGACTGCGGCAATAGGGGAGGGGCCTAACAACGTT") 15 | chrs = set([x[1] for x in bam_entries]) 16 | 17 | # Create the BAM file to be tagged 18 | header = '\n'.join([f'@SQ SN:{chr_} LN:10000000' for chr_ in chrs]) 19 | 20 | entries = [f'{header}'] 21 | 22 | for records in bam_entries: 23 | # Make a sam file containing the read and a quality qscore of 60. 24 | id_, chr_, flag, sa_tag = records 25 | entries.append( 26 | f"{id_}\t{flag}\t{chr_}\t1\t60\t{len(read)}M\t*\t0\t0\t" 27 | f"{read}\t{'?' * len(read)}\t{sa_tag}" 28 | ) 29 | sam = '\n'.join(entries) 30 | # Write out a test BAM 31 | sam_file = tmp_path / 'align.sam' 32 | with open(sam_file, 'w') as fh_sam: 33 | fh_sam.write(sam) 34 | 35 | test_bam = tmp_path / 'align.bam' 36 | sub.check_output(['samtools', 'view', sam_file, '-o', test_bam]) 37 | sub.check_output(['samtools', 'index', test_bam]) 38 | return test_bam 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "tags,prim_records,supp_records", 43 | [ 44 | ( # A single entry from a tag file representing a primary alignment. 45 | [ 46 | { 47 | 'read_id': 'read1', 48 | 'CR': 'AAAAAAAAAAAAAgAA', 49 | 'CB': 'AAAAAAAAAAAAAaAA', 50 | 'CY': '????????????????', 51 | 'UR': 'GGGGGtGGGGGG', 52 | 'UB': 'GGGGGGGGGGGG', 53 | 'UY': '????????????', 54 | 'GN': 'YFG', 55 | 'TR': 'YFT', 56 | 'chr': 'chr1', 57 | 'start': 1000, 58 | 'end': 2000 59 | }, 60 | { 61 | 'read_id': 'read2', 62 | 'CR': 'GCGCGCGCGCGCGCGc', 63 | 'CB': 'GCGCGCGCGCGCGCCC', 64 | 'CY': '????????????????', 65 | 'UR': 'TTTTTTTaTTTT', 66 | 'UB': 'TTTTTTTTTTTT', 67 | 'UY': '????????????', 68 | 'GN': 'YFG2', 69 | 'TR': 'YFT2', 70 | 'chr': 'chr9', 71 | 'start': 1000, 72 | 'end': 2000 73 | } 74 | ], 75 | # Primary records 76 | [ 77 | ('read1', 'chr1', 0, "SA:Z:chr2,10000,+,10S100M1S,60,11"), 78 | ('read2', 'chr9', 0, "")], 79 | # Supplementary records 80 | [('read1', 'chr2', 2048, "")] 81 | ) 82 | ] 83 | ) 84 | def test_add_tags(tmp_path, tags, prim_records, supp_records): 85 | """Check that the output BAMs are tagged correctly.""" 86 | bam_out = tmp_path / 'test_tags.bam' 87 | 88 | tags_to_test = ['CR', 'CB', 'CY', 'UR', 'UB', 'UY', 'GN', 'TR'] 89 | 90 | tag_rows = [] 91 | for tag_entry in tags: 92 | tag_rows.append(pd.DataFrame.from_dict(tag_entry, orient='index').T) 93 | tags_df = pd.concat(tag_rows, axis=0) 94 | tags_df.set_index('read_id', drop=True, inplace=True) 95 | 96 | # Get the SA tags; a subset of the primary tags that have a suppl record 97 | supp_read_ids = [x[0] for x in supp_records] 98 | sa_tags_df = tags_df.loc[supp_read_ids] 99 | 100 | # Create primary and supplementary tag files in temporary directories 101 | prim_tags_dir = tmp_path / 'tags' 102 | prim_tags_dir.mkdir() 103 | sa_tags_dir = tmp_path / 'sa_tags' 104 | sa_tags_dir.mkdir() 105 | 106 | # Write the per chr primary tags file 107 | for chr_, chr_df in tags_df.groupby('chr'): 108 | chr_df.to_csv(prim_tags_dir / f'{chr_}.tsv', sep='\t') 109 | 110 | sa_tags_file = sa_tags_dir / 'sa_tags.tsv' 111 | sa_tags_df.to_csv(sa_tags_file, sep='\t') 112 | 113 | test_bam = make_bam(tmp_path, prim_records + supp_records) 114 | 115 | # Run the test 116 
| tag_bam.add_tags(prim_tags_dir, sa_tags_dir, test_bam, bam_out, threads=1) 117 | 118 | # Check that the correct tags have been set on primary and supplementary 119 | # record. 120 | primary_tagged = 0 121 | supp_tagged = 0 122 | with pysam.AlignmentFile(bam_out, "rb") as bam_result: 123 | for align in bam_result: 124 | expected_tags = tags_df.loc[align.query_name] 125 | for expected_tag, expected_value in expected_tags.items(): 126 | if expected_tag in tags_to_test: 127 | assert align.get_tag(expected_tag) == expected_value 128 | if align.is_supplementary: 129 | supp_tagged += 1 130 | else: 131 | primary_tagged += 1 132 | 133 | assert primary_tagged == len(prim_records) 134 | assert supp_tagged == len(supp_records) 135 | 136 | 137 | def test_empty_file(tmp_path): 138 | """Test giving a header-only tags file, in a tags directory, to tag_bams.""" 139 | input_bam = make_bam(tmp_path, [('read1', 'chr1', 0, '')]) 140 | tags_header = ( 141 | 'read_id', 'CR', 'CB', 'CY', 'UR', 'UB', 'UY', 'chr', 'start', 'end', 'gene', 142 | 'transcript') 143 | tags_df = ( 144 | pd.DataFrame(columns=tags_header) 145 | .set_index('read_id', drop=True) 146 | .rename( 147 | columns={v: k for k, v in tag_bam.BAM_TAGS.items()}) 148 | ) 149 | tmp_test_dir = tmp_path / 'tags' 150 | tmp_test_dir.mkdir() 151 | tmp_sa_dir = tmp_path / 'sa_tags' 152 | tmp_sa_dir.mkdir() 153 | header_only_file = tmp_test_dir / 'test_tags.tsv' 154 | header_only_sa_file = tmp_sa_dir / 'test_sa_tags.tsv' 155 | tags_df.to_csv(header_only_file, sep='\t') 156 | tags_df.to_csv(header_only_sa_file, sep='\t') 157 | out_bam = tmp_path / 'out.bam' 158 | tag_bam.add_tags(tmp_test_dir, tmp_sa_dir, input_bam, out_bam, threads=1) 159 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function. 2 | 3 | Be careful what you place in here. This file is imported into all glue. 4 | """ 5 | import argparse 6 | import logging 7 | 8 | 9 | _log_name = None 10 | 11 | 12 | def get_main_logger(name): 13 | """Create the top-level logger.""" 14 | global _log_name 15 | _log_name = name 16 | logging.basicConfig( 17 | format='[%(asctime)s - %(name)s] %(message)s', 18 | datefmt='%H:%M:%S', level=logging.INFO) 19 | return logging.getLogger(name) 20 | 21 | 22 | def get_named_logger(name): 23 | """Create a logger with a name. 24 | 25 | :param name: name of logger. 
26 | """ 27 | name = name.ljust(10)[:10] # so logging is aligned 28 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 29 | return logger 30 | 31 | 32 | def wf_parser(name): 33 | """Make an argument parser for a workflow command.""" 34 | return argparse.ArgumentParser( 35 | name, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 37 | add_help=False) 38 | 39 | 40 | def _log_level(): 41 | """Parser to set logging level and acquire software version/commit.""" 42 | parser = argparse.ArgumentParser( 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 44 | 45 | modify_log_level = parser.add_mutually_exclusive_group() 46 | modify_log_level.add_argument( 47 | '--debug', action='store_const', 48 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 49 | help='Verbose logging of debug information.') 50 | modify_log_level.add_argument( 51 | '--quiet', action='store_const', 52 | dest='log_level', const=logging.WARNING, default=logging.INFO, 53 | help='Minimal logging; warnings only.') 54 | 55 | return parser 56 | -------------------------------------------------------------------------------- /bin/workflow_glue/variant_mex.py: -------------------------------------------------------------------------------- 1 | 2 | """Make sparse snv x cell matrix. 3 | 4 | TODO: We may want the expression matrix to use this code. 5 | Or modify the expression matrix code to deal with genotype matrices 6 | """ 7 | import gzip 8 | from pathlib import Path 9 | import sys 10 | 11 | import pandas as pd 12 | import pysam 13 | 14 | 15 | from .util import wf_parser # noqa: ABS101 16 | 17 | 18 | def argparser(): 19 | """Create argument parser.""" 20 | parser = wf_parser("MEX matrix") 21 | 22 | parser.add_argument( 23 | "vcf_in", help="VCF file") 24 | parser.add_argument( 25 | "mex_out_dir", help="MEX output directory") 26 | parser.add_argument( 27 | "--report_vars", 28 | help="List of variant ot report (chr_pos_ref_alt)", default=None) 29 | 30 | return parser 31 | 32 | 33 | def main(args): 34 | """Write a matrix to disk in mtx format. 35 | 36 | :param args.matrix: Path to encoded genotype matrix file with encoding: 37 | hom ref: 0 38 | het: 1 39 | hom alt: 2 40 | no data: -1 41 | """ 42 | # Full (non-sparse) matrix variants to write to a TSV file. 
43 | for_report = [] 44 | max_report_vars = 50 45 | report_vars_written = 0 46 | 47 | if args.report_vars: 48 | # Load the list of interesting variants 49 | with open(args.report_vars, 'r') as fh: 50 | report_vars = [line.strip() for line in fh] 51 | 52 | mex_folder = Path(args.mex_out_dir) 53 | mex_folder.mkdir(parents=True, exist_ok=True) 54 | 55 | vcf = pysam.VariantFile(args.vcf_in, threads=6) 56 | samples = list(vcf.header.samples) 57 | 58 | fhf = gzip.open(mex_folder / "features.tsv.gz", 'wt') 59 | 60 | n_rows = 0 61 | n_vars = 0 62 | 63 | with fhf as fh_feat: 64 | 65 | for i, record in enumerate(vcf.fetch()): 66 | n_rows += 1 67 | write_for_report = False 68 | if args.report_vars: 69 | if f"{record.chrom}_{record.pos}" in report_vars: 70 | if report_vars_written < max_report_vars: 71 | write_for_report = True 72 | report_vars_written += 1 73 | else: 74 | # Just get the first n variants to show in the report 75 | if report_vars_written < max_report_vars: 76 | write_for_report = True 77 | report_vars_written += 1 78 | variant_id = f"{record.chrom}_{record.pos}_{record.ref}_{record.alts[0]}" 79 | fh_feat.write(variant_id + '\n') 80 | for j, sample in enumerate(samples): 81 | genotype = record.samples[sample]['GT'] # Get genotype 82 | # numerically-encode diploid genotype 83 | try: 84 | gt_val = sum(allele for allele in genotype) 85 | except TypeError: 86 | gt_val = -1 # Missing genotype 87 | if write_for_report: 88 | for_report.append( 89 | (variant_id, sample, gt_val)) 90 | if gt_val == -1: 91 | continue # Skip missing genotypes from sparse matrix 92 | n_vars += 1 93 | # 1-based indexing for mtx format 94 | sys.stdout.write(f"{i + 1} {j + 1} {gt_val}\n") 95 | with gzip.open(mex_folder / "barcodes.tsv.gz", 'wt') as fh: 96 | for col in samples: 97 | fh.write(f"{col}\n") 98 | 99 | header = (( 100 | '%%MatrixMarket matrix coordinate integer general\n' 101 | '%metadata_json:' 102 | '{"software_version": "ont-single-cell","format_version": 2}\n' 103 | f'{n_rows} {len(samples)} {n_vars} \n') 104 | ) 105 | with open('header.txt', 'w') as f: 106 | f.write(header) 107 | 108 | # Write full matrix of interesting variants to a TSV file 109 | df_top = pd.DataFrame.from_records( 110 | for_report, columns=['variant', 'sample', 'gt_val']) 111 | df_top = df_top.pivot( 112 | index='variant', columns='sample', values='gt_val').fillna(-1) 113 | df_top.to_csv("top_snvs.tsv", sep='\t', index=True) 114 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of helper scripts common to workflows.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check (u)BAM files for `@SQ` lines whether they are the same in all headers.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop 
over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check if there is the SO line, to validate whether the file is (un)sorted. 26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
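# For example, the UTF-8 BOM is the three-byte prefix 0xEF 0xBB 0xBF; decoded with
# the plain "utf-8" codec it becomes "\ufeff" attached to the first header field,
# so a header beginning with "barcode" would be read as "\ufeffbarcode" and the
# column lookup below would fail.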
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check aliases are correct format 110 | # for now we have decided they may not start with "barcode" 111 | for alias in aliases: 112 | if alias.startswith("barcode"): 113 | sys.stdout.write("values in 'alias' column must not begin with 'barcode'") 114 | sys.exit() 115 | 116 | # check barcodes are all the same length 117 | first_length = len(barcodes[0]) 118 | for barcode in barcodes[1:]: 119 | if len(barcode) != first_length: 120 | sys.stdout.write("values in 'barcode' column are different lengths") 121 | sys.exit() 122 | 123 | # check barcode and alias values are unique 124 | if len(barcodes) > 
len(set(barcodes)): 125 | sys.stdout.write("values in 'barcode' column not unique") 126 | sys.exit() 127 | if len(aliases) > len(set(aliases)): 128 | sys.stdout.write("values in 'alias' column not unique") 129 | sys.exit() 130 | 131 | if sample_types: 132 | # check if "type" column has unexpected values 133 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 134 | 135 | if unexp_type_vals: 136 | sys.stdout.write( 137 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 138 | f"Allowed values are: {allowed_sample_types}" 139 | ) 140 | sys.exit() 141 | 142 | if args.required_sample_types: 143 | for required_type in args.required_sample_types: 144 | if required_type not in allowed_sample_types: 145 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 146 | sys.exit() 147 | if sample_types.count(required_type) < 1: 148 | sys.stdout.write( 149 | f"Sample sheet requires at least 1 of {required_type}") 150 | sys.exit() 151 | if analysis_groups: 152 | # if there was a "analysis_group" column, make sure it had values for all 153 | # samples 154 | if not all(analysis_groups): 155 | sys.stdout.write( 156 | "if an 'analysis_group' column exists, it needs values in each row" 157 | ) 158 | sys.exit() 159 | 160 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 161 | 162 | 163 | def argparser(): 164 | """Argument parser for entrypoint.""" 165 | parser = wf_parser("check_sample_sheet") 166 | parser.add_argument("sample_sheet", help="Sample sheet to check") 167 | parser.add_argument( 168 | "--required_sample_types", 169 | help="List of required sample types. Each sample type provided must " 170 | "appear at least once in the sample sheet", 171 | nargs="*" 172 | ) 173 | return parser 174 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 
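    # variables by the calling process; the value written is, for example,
    # `HAS_VALID_INDEX=1` when the index is present and valid, or `HAS_VALID_INDEX=0`.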
33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/wfg_helpers/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from ..util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. 
'2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /data/3M-3pgex-may-2023.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-3pgex-may-2023.txt.gz -------------------------------------------------------------------------------- /data/3M-5pgex-jan-2023.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-5pgex-jan-2023.txt.gz -------------------------------------------------------------------------------- /data/3M-february-2018.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/3M-february-2018.txt.gz -------------------------------------------------------------------------------- /data/737K-arc-v1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/737K-arc-v1.txt.gz -------------------------------------------------------------------------------- /data/737K-august-2016.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/737K-august-2016.txt.gz -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE")) 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /data/genes_of_interest.csv: -------------------------------------------------------------------------------- 1 | COX16 2 | AAGAB 3 | CD70 4 | NOGENE -------------------------------------------------------------------------------- /data/visium-v1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/data/visium-v1.txt.gz -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Cell barcode & UMI identification from single-cell sequencing data. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | This workflow extracts cell barcodes and UMIs from [10x](https://www.10xgenomics.com/)-generated single cell libraries. 2 | It was initially created as a Nextflow port of [Sockeye](https://github.com/nanoporetech/sockeye). 
3 | 4 | In brief, the workflow does the following: 5 | 6 | + Adapter identification, fused read splitting and stranding. 7 | + Mapping of reads to genomic reference. 8 | + Gene and transcript read assignment. 9 | + Cell barcode and UMI extraction and correction. 10 | + Generation of gene and transcript count matrices for unique UMIs. 11 | + Tagging BAM files with cell barcodes and UMIs. 12 | + Calculation of library saturation. 13 | 14 | This workflow supports the following 10x kits: 15 | + 3': v2/v3 and v4 (GEM-X) 16 | + 5': v1/v2 17 | + multiome (gene expression only): v1 18 | + visium spatial transcriptomics: v1 19 | 20 | 21 | The [BLAZE](https://github.com/shimlab/BLAZE) preprint provided useful benchmarking of the original sockeye implementation. 22 | This assisted in the selection of appropriate thresholds for cell cut-off and for defining the limits of the gene x cell matrix. 23 | 24 | The isoform selection procedure used in this workflow was adapted from that found in the [FLAMES](https://github.com/LuyiTian/FLAMES) 25 | package. 26 | -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 256GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 32 9 | + Memory = 32GB 10 | 11 | Approximate run time: Approximately 8h for 120M reads with the recommended requirements. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://docs.docker.com/get-started/) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository in to the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-single-cell --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-single-cell 38 | ``` 39 | 40 | A demo dataset is provided for testing of the workflow. 
41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/wf-single-cell-demo.tar.gz 44 | tar -xzvf wf-single-cell-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-single-cell \ 49 | --expected_cells 100 \ 50 | --fastq 'wf-single-cell-demo/chr17.fq.gz' \ 51 | --kit '3prime:v3' \ 52 | --ref_genome_dir 'wf-single-cell-demo' \ 53 | --genes_of_interest 'wf-single-cell-demo/umap_plot_genes.csv' \ 54 | -profile standard 55 | ``` 56 | 57 | For further information about running a workflow on 58 | the command line see https://labs.epi2me.io/wfquickstart/ 59 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 4 | 5 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). 6 | 7 | + [Library prep and sequencing protocol for the 10x 5' kit](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/ligation-sequencing-v14-single-cell-transcriptomics-with-5-cdna/v/sst_9204_v114_revd_06mar2024) 8 | + [Library prep and sequencing protocol for the 10x 3' kit](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/single-cell-transcriptomics-with-cdna-prepared-using-10x/v/sst_9198_v114_reve_06dec2023) -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | This workflow accepts either FASTQ or BAM files as input. 3 | 4 | The FASTQ or BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 5 | 6 | ``` 7 | (i) (ii) (iii) 8 | input_reads.fastq ─── input_directory ─── input_directory 9 | ├── reads0.fastq ├── barcode01 10 | └── reads1.fastq │ ├── reads0.fastq 11 | │ └── reads1.fastq 12 | ├── barcode02 13 | │ ├── reads0.fastq 14 | │ ├── reads1.fastq 15 | │ └── reads2.fastq 16 | └── barcode03 17 | └── reads0.fastq 18 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. 
In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | bam | string | BAM or unaligned BAM (uBAM) files to use in the analysis. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | epi2me_resource_bundle | string | Reference genome resource bundle to automatically download. | If selected, a prebuilt 10x reference genome bundle will be automatically downloaded from the EPI2ME AWS cloud. If `call_fusions` is selected, a matched ctat-LR-fusion resource directory will also be downloaded. This overrides `ref_genome_dir`, and `ctat_resourses`. The selected resources will be automatically downloaded, on the first run, into the directory defined by the `store_dir` parameter (default `wf-single-cell_resources`). Subsequent workflow runs will use the pre-downloaded resources. | | 8 | | ref_genome_dir | string | A local path to the 10x reference directory. | The workflow requires a 10x reference directory containing sequence and annotation data. The folder should contain these files: `genes/genes.gtf`, `fasta/genome.fa`, and `fasta/genome.fa.fai` as per the 10x reference folder format. 10x reference folders can be downloaded from https://www.10xgenomics.com/support/software/cell-ranger/downloads. Alternatively, the workflow can download a limited set of prebuilt 10x references using the `epi2me_resource_bundle` parameter | | 9 | | ctat_resources | string | For fusion transcript calling. A local path to ctat-LR-fusion resource directory. | The ctat-LR-fusion resource bundle must be built against the same reference genome data as is given with `ref_genome_dir`. Resource bundles can be downloaded from https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/, and instructions for building your own resources bundle can be found here: https://github.com/TrinityCTAT/ctat-genome-lib-builder. Alternatively see the `epi2me_resource_bundle` option | | 10 | | kit | string | The 10x kit and version separated by a colon (eg: 3prime:v3) | 10x kits can be released with different versions, each requiring a specific whitelist that is looked-up by the workflow. If `single_cell_sample_sheet` is not defined, the 10x kit is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied. | | 11 | | expected_cells | integer | Number of expected cells in the sample. | The number of expected cells. If `single_cell_sample_sheet` is not defined, `expected_cells` is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied. | | 12 | | estimate_cell_count | boolean | Estimate cell count from the data. | If set to true, the cell count will be estimated from the read count distribution. If set to false, the top `expected_cells` cells with highest read support will be selected. | True | 13 | | single_cell_sample_sheet | string | An optional CSV file used to assign library metadata per sample. 
If all samples have the same library metadata, this can be supplied instead by using the `--kit` and `--expected_cells` parameters. | Columns should be: [sample_id, kit, exp_cells]. This must not be confused with the MinKNOW sample_sheet. `sample_id` should correspond to `sample_name` which is defined either in the `sample_sheet`, given by the `sample` parameter (for single sample runs) or if no `sample_sheet` or `sample` is given, is derived from the folder name containing the FASTQ files. | | 14 | | full_length_only | boolean | Only process full length reads. | If set to true, only process reads or subreads that are classified as full length (read segments flanked by compatible adapters in the expected orientation). | True | 15 | | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. | | 16 | | call_fusions | boolean | Use ctat-LR-fusion to call fusion reads. | ctat-LR-fusion is a tool for calling fusions from long reads. | False | 17 | 18 | 19 | ### Sample Options 20 | 21 | | Nextflow parameter name | Type | Description | Help | Default | 22 | |--------------------------|------|-------------|------|---------| 23 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | 24 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 25 | 26 | 27 | ### Output Options 28 | 29 | | Nextflow parameter name | Type | Description | Help | Default | 30 | |--------------------------|------|-------------|------|---------| 31 | | out_dir | string | Directory for output of all workflow results. | | output | 32 | 33 | 34 | ### Advanced options 35 | 36 | | Nextflow parameter name | Type | Description | Help | Default | 37 | |--------------------------|------|-------------|------|---------| 38 | | call_variants | boolean | Call cell-level single nucleotide variants (SNV). | Call single cell variants using a longshot-based workflow. This subworkflow is computationally intensive, datasets with large numbers of cells may take a long time. | False | 39 | | report_variants | string | Display information about variants of interest in the report. | A VCF file containing variants of interest. | | 40 | | kit_config | string | A file defining the configurations associated with the various supported 10x kits. | A CSV file is expected with the following headers [kit, barcode_length, umi_length]. If not specified, a default `kit_configs.csv` (found in the project directory root) will be used. This parameter does not typically need be changed. | | 41 | | threads | integer | Number of CPU threads to use in resource intensive processes. | The total CPU resource used by the workflow is constrained by the executor configuration. | 8 | 42 | | fastq_chunk | integer | Sets the maximum number of reads per chunk for the initial processing of reads. | Controls batching of reads for processing. 
| 1000000 | 43 | barcode_adapter1_suff_length | integer | Suffix length of the read1 adapter to use in creating the probe sequence for identifying barcode/UMI bases. | For example, specifying 12 would mean that the last 12 bases of the specified read1 sequence will be included in the probe sequence. | 10 | 44 | barcode_min_quality | integer | Minimum allowed nucleotide-level quality score in the extracted/uncorrected barcode sequence. | Values equal to or higher than this will be considered 'high-quality' and used for generating the barcode whitelist. | 15 | 45 | barcode_max_ed | integer | Maximum allowable edit distance between uncorrected barcode and the best matching corrected barcode from the sample whitelist. | Barcodes are corrected by searching from a list of barcodes known to exist in the dataset. A maximum edit distance of 2 between query and whitelist barcode is recommended. | 2 | 46 | barcode_min_ed_diff | integer | Minimum allowable edit distance difference between whitelist barcode candidates. | If there is more than one candidate barcode found in the whitelist, the edit distance difference between the top hit and the second-best hit (in relation to the uncorrected barcode) must be at least this value to be able to assign a barcode. If the edit distance difference is less than this, it is assumed that the barcode identity is ambiguous, and the read is not tagged with a corrected barcode. | 2 | 47 | gene_assigns_minqv | integer | Minimum MAPQ score allowed for a read to be assigned to a gene. | | 30 | 48 | matrix_min_genes | integer | Filter cells from the gene expression matrix if they contain fewer than this many genes. | | 200 | 49 | matrix_min_cells | integer | Filter genes from the gene expression matrix that are observed in fewer than this many cells. | | 3 | 50 | matrix_max_mito | integer | Filter cells from the gene expression matrix if more than this percentage of UMI counts come from mitochondrial genes. | | 20 | 51 | matrix_norm_count | integer | Normalize the expression matrix to this many counts per cell. | | 10000 | 52 | genes_of_interest | string | File containing a list of gene symbols (one symbol per line) to annotate with expression values in the UMAP projections. If doing visium spatial analysis, these genes will be used to annotate the spatial plots. | | | 53 | mito_prefix | string | Gene name prefix used to identify mitochondrial genes. | Parts of the workflow analyse mitochondrial genes separately. These genes are identified by searching for a gene name prefix. Human mitochondrial genes can be identified with the prefix 'MT-' and mouse genes with the prefix 'mt-'. If the reference genome contains data from multiple organisms with different nomenclature, multiple prefixes can be supplied like so: 'MT-,mt-' | MT- | 54 | umap_n_repeats | integer | Number of UMAP projections to repeat for each dataset. | The UMAP algorithm contains elements of randomness that can mislead users into seeing associations between cells that are not meaningful. It is recommended to view multiple plots generated with the same parameters and check that any observed structure is consistent across runs. | 3 | 55 | stringtie_opts | string | StringTie options for transcriptome assembly. | StringTie option string can be supplied at the command line as in this example: `--stringtie_opts="-c 5 -m 100 "`. StringTie options can be found here: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual.
The default option (-c 2) ensures that only transcripts with a coverage of 2 or higher are included in the generated transcriptome | -c 2 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | wf-single-cell-report.html | Report for all samples | aggregated | 6 | | Results summaries | {{ alias }}/{{ alias }}.config_stats.json | Results summaries including adapter configuration numbers. | per-sample | 7 | | Gene expression counts | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/matrix.mtx.gz | Gene x cell expression sparse matrix values (MEX format). | per-sample | 8 | | Gene expression barcodes | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format). | per-sample | 9 | | Gene expression features | {{ alias }}/{{ alias }}.gene_raw_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format). | per-sample | 10 | | Transcript expression counts | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/matrix.mtx.gz | Transcript x cell expression sparse matrix values (MEX format). | per-sample | 11 | | Transcript expression MEX barcodes | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format). | per-sample | 12 | | Transcript expression MEX features | {{ alias }}/{{ alias }}.transcript_raw_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format). | per-sample | 13 | | Processed gene expression counts | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/matrix.mtx.gz | Filtered and normalized gene x cell expression sparse matrix values (MEX format). | per-sample | 14 | | Processed gene expression barcodes | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format) for processed matrix. | per-sample | 15 | | Processed gene expression features | {{ alias }}/{{ alias }}.gene_processed_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format) for processed matrix. | per-sample | 16 | | Processed transcript expression counts | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/matrix.mtx.gz | Filtered and normalized transcript x cell expression sparse matrix values (MEX format). | per-sample | 17 | | Processed transcript expression MEX barcodes | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/barcodes.tsv.gz | Barcode column names (MEX format) for processed matrix. | per-sample | 18 | | Processed transcript expression MEX features | {{ alias }}/{{ alias }}.transcript_processed_feature_bc_matrix/features.tsv.gz | Feature row names (MEX format) for processed matrix. | per-sample | 19 | | Mitochondrial expression levels | {{ alias }}/{{ alias }}.gene_expression_mito_per_cell.tsv | Per cell mitochondrial gene expression as percentage total of total gene expression. | per-sample | 20 | | Read summary | {{ alias }}/{{ alias }}.read_summary.tsv | Per read assigned barcodes UMIs genes and transcripts. 
| per-sample | 21 | | Whitelist | {{ alias }}/{{ alias }}.whitelist.tsv | The barcodes found in the library that remain after filtering. | per-sample | 22 | | Alignment output per sample | {{ alias }}/{{ alias }}.tagged.bam | Genomic alignment output file. | per-sample | 23 | | Alignment index per sample | {{ alias }}/{{ alias }}.tagged.bam.bai | Genomic alignment index file. | per-sample | 24 | | Transcriptome sequence | {{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample | 25 | | Transcriptome annotation | {{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample | 26 | | Gene expression umap | {{ alias }}/{{ alias }}.gene_expression_umap_*.tsv | UMAP matrix from gene expression. Varying number of files will be present based on number of umap repeats. | per-sample | 27 | | Transcript expression umap | {{ alias }}/{{ alias }}.transcript_expression_umap_*.tsv | UMAP matrix from transcript expression. Varying number of files will be present based on number of umap repeats. | per-sample | 28 | | Barcode assignment summary | {{ alias }}/{{ alias }}.bc_assignment_summary.tsv | TSV file with barcode assignment summary statistics. | per-sample | 29 | | Single cell SNVs | {{ alias }}/{{ alias }}.final_merged.vcf.gz | VCF file containing per-barcode single nucleotide variant calls. | per-sample | 30 | | Single cell SNVs index | {{ alias }}/{{ alias }}.final_merged.vcf.gz.tbi | VCF index file. | per-sample | 31 | | Genotype matrix | {{ alias }}/{{ alias }}.genotype_matrix/matrix.mtx.gz | Sparse MEX format matrix file. | per-sample | 32 | | Genotype matrix barcodes | {{ alias }}/{{ alias }}.genotype_matrix/barcodes.tsv.gz | Sparse MEX format barcode (columns) file. | per-sample | 33 | | Genotype matrix features | {{ alias }}/{{ alias }}.genotype_matrix/features.tsv.gz | Sparse MEX format SNV ID (rows) file. | per-sample | 34 | | Per-read fusion info | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.fusion_predictions_per-read.tsv | TSV file with per-read fusion information, including gene fusion pairs and cell/UMI barcodes. | per-sample | 35 | | Fusion summary | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv | Summary of each prediciton fusion gene. | per-sample | 36 | | ctat-LR-fusion output | {{ alias }}/fusions/{{ alias }}.ctat-LR-fusion.tar.gz | The complete output of ctat-LR-fusion. | per-sample | 37 | -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | 2 | + If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. 3 | + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). 4 | -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-single-cell/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). 
-------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Related blog posts 4 | 5 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /docs/images/3prime_read.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/docs/images/3prime_read.png -------------------------------------------------------------------------------- /docs/images/probe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/docs/images/probe.png -------------------------------------------------------------------------------- /kit_configs.csv: -------------------------------------------------------------------------------- 1 | kit,barcode_length,umi_length,bc_long_list 2 | 3prime:v2,16,10,737K-august-2016.txt.gz 3 | 3prime:v3,16,12,3M-february-2018.txt.gz 4 | 3prime:v4,16,12,3M-3pgex-may-2023.txt.gz 5 | 5prime:v1,16,10,737K-august-2016.txt.gz 6 | 5prime:v2,16,10,737K-august-2016.txt.gz 7 | 5prime:v3,16,12,3M-5pgex-jan-2023.txt.gz 8 | multiome:v1,16,12,737K-arc-v1.txt.gz 9 | visium:v1,16,12,visium-v1.txt.gz 10 | -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boileplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters with default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere. 
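 *
 * A minimal illustrative sketch (the function and parameter names here are
 * hypothetical, not taken from this workflow):
 *
 *   def my_function(Map arguments) {
 *       def parser = new ArgumentParser(
 *           args: ["input"], kwargs: ["threads": 4], name: "my_function")
 *       def opts = parser.parse_args(arguments)
 *       // opts.input is required; opts.threads falls back to the default of 4
 *   }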
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 
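 *
 * An illustrative call (the key and value here are hypothetical):
 *
 *   CWUtil.mutateParam(params, "out_dir", "results/run01")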
20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // OS 52 | // TODO check version on WSL 53 | def opsys = System.properties['os.name'].toLowerCase() 54 | def opver = System.properties['os.version'] 55 | if (opver.toLowerCase().contains("wsl")){ 56 | opsys = "wsl" 57 | } 58 | 59 | // placeholder for any future okta business 60 | // for now we'll use the guest_ sent to wf.epi2me_user 61 | def user = get_meta(params.wf, "epi2me_user") 62 | 63 | // drop cruft to save some precious bytes 64 | // affects the deep copy rather than original params 65 | clean_meta(params_data, [ 66 | "schema_ignore_params", 67 | ]) 68 | def ingress_ids = [] 69 | if (params_data.containsKey("wf")) { 70 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 71 | clean_meta(params_data.wf, [ 72 | "agent", // we send this later 73 | "epi2me_instance", // we send this later 74 | "epi2me_user", // we send this later 75 | "example_cmd", 76 | "ingress.run_ids", // we will send this elsewhere 77 | ]) 78 | } 79 | 80 | // try and get runtime information 81 | def cpus = null 82 | try { 83 | cpus = Runtime.getRuntime().availableProcessors() 84 | } 85 | catch(Exception e) {} 86 | 87 | def workflow_success = null 88 | def workflow_exitcode = null 89 | if (event != "start") { 90 | workflow_success = workflow.success 91 | workflow_exitcode = workflow.exitStatus 92 | } 93 | 94 | /// build message 95 | def body_json = new JsonBuilder() 96 | body_json \ 97 | "tracking_id": [ 98 | "msg_id": UUID.randomUUID().toString(), 99 | "version": "3.0.1" 100 | ], 101 | "source": "workflow", 102 | "event": event, 103 | "params": params_data, 104 | // data will be null on start events, as ingress has not run 105 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 106 | "workflow": [ 107 | "name": workflow.manifest.name, 108 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 109 | "run_name": workflow.runName, // required to disambiguate sessions 110 | "session": workflow.sessionId, 111 | "profile": workflow.profile, 112 | "resume": workflow.resume, 113 | "error": error_message, // null if no error 114 | "success": workflow_success, 115 | "exitcode": workflow_exitcode, 116 | ], 117 | "env": [ 118 | "user": user, // placeholder for any future okta 119 | "os": [ 120 | "name": opsys, 121 | "version": opver 122 | ], 123 | "resource": [ 124 | "cpus": cpus, 125 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 126 | ], 127 | "agent": get_meta(params.wf, "agent"), // access via original params 128 | "epi2me": [ 129 | "instance": get_meta(params.wf, "epi2me_instance"), 130 | "user": user, 131 | ], 132 | "nextflow": [ 133 | "version": nextflow.version.toString(), 134 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 135 | ] 136 | ] 137 | return body_json 138 | } 139 | 140 | // Send a JSON payload to a given endpoint 141 | private static String send_ping_post(endpoint, body_json) { 142 | // Attempt to send payload and absorb any possible Exception gracefully 143 | String postResult 144 | boolean raise_exception = false 145 | try { 146 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 147 | requestMethod = 'POST' 148 | doOutput = true 149 | setConnectTimeout(5000) 150 | setReadTimeout(10000) 151 | setRequestProperty('Content-Type', 'application/json') 152 | setRequestProperty('accept', 'application/json') 153 | outputStream.withPrintWriter({printWriter -> 154 | printWriter.write(body_json.toString()) 155 | }) 156 | 157 | // Rethrow exceptions that imply we're not using this endpoint properly 158 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 159 | raise_exception = true 160 | } 161 | // Accessing inputStream.text will raise an Exception for failed requests 162 | postResult = inputStream.text 163 | }) 164 | } 165 | catch(Exception e) { 166 | if(raise_exception) { throw e } 167 | } 168 | return (postResult) 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
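// In practice the `initialise` method below drives the standard entry points:
// for example `nextflow run epi2me-labs/wf-single-cell --help` prints the
// schema-derived help text and exits, and `--version` prints the manifest
// version and exits.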
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | publishDir "${params.out_dir}", mode: 'copy', pattern: "params.json" 6 | cache false 7 | cpus 1 8 | memory "2 GB" 9 | output: 10 | path "params.json" 11 | script: 12 | def paramsJSON = new JsonBuilder(params).toPrettyString().replaceAll("'", "'\\\\''") 13 | """ 14 | # Output nextflow params object to JSON 15 | echo '$paramsJSON' > params.json 16 | """ 17 | } 18 | 19 | process configure_igv { 20 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 21 | label "wf_common" 22 | cpus 1 23 | memory "2 GB" 24 | input: 25 | // the python script will work out what to do with all the files based on their 26 | // extensions 27 | path "file-names.txt" 28 | val locus_str 29 | val aln_extra_opts 30 | val var_extra_opts 31 | output: path "igv.json" 32 | script: 33 | // the locus argument just makes sure that the initial view in IGV shows something 34 | // interesting 35 | String locus_arg = locus_str ? "--locus $locus_str" : "" 36 | // extra options for alignment tracks 37 | def aln_opts_json_str = \ 38 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 39 | String aln_extra_opts_arg = \ 40 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 41 | // extra options for variant tracks 42 | def var_opts_json_str = \ 43 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 44 | String var_extra_opts_arg = \ 45 | var_extra_opts ? "--extra-vcf-opts extra-var-opts.json" : "" 46 | """ 47 | # write out JSON files with extra options for the alignment and variant tracks 48 | echo '$aln_opts_json_str' > extra-aln-opts.json 49 | echo '$var_opts_json_str' > extra-var-opts.json 50 | 51 | workflow-glue configure_igv \ 52 | --fofn file-names.txt \ 53 | $locus_arg \ 54 | $aln_extra_opts_arg \ 55 | $var_extra_opts_arg \ 56 | > igv.json 57 | """ 58 | } 59 | 60 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-single-cell/c82cf053458fadfb64393afbc2124704c99745a3/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /limitations_and_known_issues.md: -------------------------------------------------------------------------------- 1 | # wf-single-cell: Limitations and known issues 2 | 3 | 4 | ## No trimmed FASTQ output 5 | Users may want to obtained FASTQ files trimmed of adapter sequences, barcodes and UMIs for their 6 | downstream analysis, but there is not currently an option to output such files. 7 | 8 | ## No ability to filter non-full-length reads 9 | Subreads are classified as being full length if flanked by two compatible adapters. 
10 | However, at the moment this classification has no effect 11 | on whether these are further processed by the workflow. A user option to control this behaviour may be desirable. 12 | 13 | ## 10x gene expression and feature barcodes discrimination 14 | A 10x barcode whitelist containing all possible barcodes is used to cross-reference the discovered barcodes for 15 | barcode error correction. For the 3prime and multiome kits, the whitelist contains ~3M gene expression barcodes that are relevant to this workflow. 16 | However, it also contains a similar number of feature barcodes, see this [10x article](https://kb.10xgenomics.com/hc/en-us/articles/360031133451-Why-is-there-a-discrepancy-in-the-3M-february-2018-txt-barcode-whitelist-). 17 | It's not currently possible to differentiate between the two types of barcode in this whitelist. 18 | Therefore, it is possible that some error-containing gene expression barcodes may be incorrectly assigned 19 | to feature barcodes. To what extent this is happening is currently unknown. 20 | 21 | ## Gene and feature assignment discrepancy 22 | In the `assign_features` process, genes are only assigned if they have a MAPQ score greater than a user-defined threshold (default 30). 23 | However, transcripts are assigned based on alignment to a transcriptome that is built during the workflow. 24 | Transcripts are not filtered by MAPQ, but by applying some alternative heuristics based on alignment scores as well as transcript and query 25 | coverages. This can lead to cases where transcripts are called, but not genes. This will be fixed in a future version. 26 | -------------------------------------------------------------------------------- /modules/local/common.nf: -------------------------------------------------------------------------------- 1 | // Merge TSVs and sum the specified column 2 | // Currently supports only headerless inputs and summing of the second column 3 | process merge_and_publish_tsv { 4 | publishDir "${params.out_dir}/${meta.alias}", mode: 'copy' 5 | label "wf_common" 6 | cpus 1 7 | memory "2GB" 8 | input: 9 | tuple val(meta), 10 | path("inputs/input*.tsv") 11 | val(output_fname) 12 | output: 13 | tuple val(meta), 14 | path("${meta.alias}.${output_fname}") 15 | script: 16 | """ 17 | find inputs -name "*.tsv" \ 18 | -exec cat {} + \ 19 | | csvtk -t summary -H -f 2:sum -g 1 \ 20 | > "${meta.alias}.${output_fname}" 21 | """ 22 | } -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file.
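// As a sketch of such an edit (values below are illustrative only), a custom
// configuration file passed on the command line with `-c custom.config` could
// raise the resources granted to one labelled process:
//
//   process {
//       withLabel:singlecell {
//           cpus = 16
//           memory = '64 GB'
//       }
//   }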
11 | 12 | params { 13 | help = false 14 | version = false 15 | fastq = null 16 | bam = null 17 | out_dir = "output" 18 | sample_sheet = null 19 | sample = null 20 | single_cell_sample_sheet = null 21 | aws_image_prefix = null 22 | aws_queue = null 23 | disable_ping = false 24 | kit_config = null 25 | kit = null 26 | threads = 8 27 | full_length_only = true 28 | min_read_qual = null 29 | 30 | fastq_chunk = 1000000 31 | barcode_adapter1_suff_length = 10 32 | barcode_min_quality = 15 33 | barcode_max_ed = 2 34 | barcode_min_ed_diff = 2 35 | gene_assigns_minqv = 30 36 | matrix_min_genes = 200 37 | matrix_min_cells = 3 38 | matrix_max_mito = 20 39 | matrix_norm_count = 10000 40 | genes_of_interest = null 41 | umap_n_repeats = 3 42 | expected_cells = null 43 | estimate_cell_count = true 44 | mito_prefix = "MT-" 45 | stringtie_opts = "-c 2" 46 | call_variants = false 47 | report_variants = null 48 | call_fusions = false 49 | 50 | ref_genome_dir = null 51 | ctat_resources = null 52 | epi2me_resource_bundle = null 53 | 54 | monochrome_logs = false 55 | validate_params = true 56 | show_hidden_params = false 57 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf,resource_bundles' 58 | store_dir = "wf-single-cell_resources" 59 | 60 | resource_bundles = [ 61 | 'gex-GRCh38-2024-A': [ 62 | '10x': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/refdata-gex-GRCh38-2024-A.tar.gz', 63 | 'ctat-lr-fusion': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/ctat_genome_lib_10x_2024.tar.gz' 64 | ], 65 | 'gex-GRCh38-2024-A_chr_20-21': [ 66 | '10x': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/refdata-gex-GRCh38-2024-A_chr20_21.tar.gz', 67 | 'ctat-lr-fusion': 'https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/ctat_genome_lib_chr20_21_UyHq1cFI.tar.gz' 68 | ] 69 | ] 70 | 71 | wf { 72 | example_cmd = [ 73 | "--expected_cells 100", 74 | "--fastq 'wf-single-cell-demo/chr17.fq.gz'", 75 | "--kit '3prime:v3'", 76 | "--ref_genome_dir 'wf-single-cell-demo'", 77 | "--genes_of_interest 'wf-single-cell-demo/umap_plot_genes.csv'", 78 | ] 79 | merge_threads = 24 80 | fusion_threads = 12 81 | container_sha = "shab5b0dea0efc4685f74c8b4f91c979c587e23a020" 82 | common_sha = "sha1c69fd30053aad5d516e9567b3944384325a0fee" 83 | } 84 | } 85 | 86 | manifest { 87 | name = 'epi2me-labs/wf-single-cell' 88 | author = 'Oxford Nanopore Technologies' 89 | homePage = 'https://github.com/epi2me-labs/wf-single-cell' 90 | description = 'Identification of cell- and UMI barcodes from single-cell sequencing.' 91 | mainScript = 'main.nf' 92 | nextflowVersion = '>=23.04.2' 93 | version = '3.2.0' 94 | } 95 | 96 | epi2melabs { 97 | tags = 'wf-single-cell,transcriptomics,human,mouse' 98 | icon = 'faCircle' 99 | } 100 | 101 | // used by default for "standard" (docker) and singularity profiles, 102 | // other profiles may override. 
103 | process { 104 | withLabel:singlecell { 105 | container = "ontresearch/wf-single-cell:${params.wf.container_sha}" 106 | } 107 | withLabel:wf_common { 108 | container = "ontresearch/wf-common:${params.wf.common_sha}" 109 | } 110 | withLabel:ctat_lr_fusion { 111 | container = "trinityctat/ctat_lr_fusion:1.1.0" 112 | } 113 | shell = ['/bin/bash', '-euo', 'pipefail'] 114 | } 115 | 116 | 117 | profiles { 118 | // the "standard" profile is used implicitely by nextflow 119 | // if no other profile is given on the CLI 120 | standard { 121 | docker { 122 | enabled = true 123 | // this ensures container is run as host user and group, but 124 | // also adds host user to the within-container group 125 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 126 | } 127 | } 128 | 129 | // using singularity instead of docker 130 | singularity { 131 | singularity { 132 | enabled = true 133 | autoMounts = true 134 | } 135 | } 136 | 137 | conda { 138 | conda.enabled = true 139 | } 140 | 141 | // Using AWS batch. 142 | // May need to set aws.region and aws.batch.cliPath 143 | awsbatch { 144 | process { 145 | executor = 'awsbatch' 146 | queue = "${params.aws_queue}" 147 | memory = '8G' 148 | withLabel:singlecell { 149 | container = "${params.aws_image_prefix}-wf-single-cell:${params.wf.container_sha}" 150 | } 151 | withLabel:wf_common { 152 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 153 | } 154 | shell = ['/bin/bash', '-euo', 'pipefail'] 155 | } 156 | } 157 | 158 | // local profile for simplified development testing 159 | local { 160 | process.executor = 'local' 161 | } 162 | } 163 | 164 | 165 | timeline { 166 | enabled = true 167 | overwrite = true 168 | file = "${params.out_dir}/execution/timeline.html" 169 | } 170 | report { 171 | enabled = true 172 | overwrite = true 173 | file = "${params.out_dir}/execution/report.html" 174 | } 175 | trace { 176 | enabled = true 177 | overwrite = true 178 | file = "${params.out_dir}/execution/trace.txt" 179 | } 180 | 181 | env { 182 | PYTHONNOUSERSITE = 1 183 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 184 | NUMBA_CACHE_DIR = "./numba_cache_dir" 185 | } 186 | -------------------------------------------------------------------------------- /subworkflows/fusions.nf: -------------------------------------------------------------------------------- 1 | process get_ctat_data { 2 | label "wf_common" 3 | cpus 1 4 | memory "2 GB" 5 | storeDir {params.store_dir ? "${params.store_dir}/${name}" : null } 6 | input: 7 | val name 8 | val url 9 | output: 10 | path "${name}", emit: resource_dir 11 | script: 12 | """ 13 | wget -qO- $url \ 14 | | tar --no-same-owner -xzv --one-top-level=${name} --strip-component=1 15 | """ 16 | } 17 | 18 | process find_fusions { 19 | /* 20 | Run ctat-LR-fusion to find fusion reads. 
21 | */ 22 | label "ctat_lr_fusion" 23 | cpus params.wf.fusion_threads 24 | memory '16 GB' 25 | publishDir "${params.out_dir}/${meta.alias}/fusions", mode: 'copy', pattern: "${meta.alias}.ctat-LR-fusion.tar.gz" 26 | input: 27 | tuple val(meta), 28 | path("tagged.bam") 29 | path("ctat_reference_bundle") 30 | output: 31 | tuple val(meta), 32 | path("${meta.alias}.ctat-LR-fusion.tar.gz"), 33 | emit: gzipped_ctat_dir 34 | tuple val(meta), 35 | path(fusion_preds), 36 | emit: ctat_fusion_predictions 37 | stdout emit: stdout 38 | script: 39 | String ctat_outdir = "fusions" 40 | // Expected main output file 41 | // This will be present if no fusion candidates are verified (header only) 42 | // or absent if no fusion candidates are found 43 | fusion_preds = "${ctat_outdir}/ctat-LR-fusion.fusion_predictions.tsv" 44 | Integer threads = Math.max(2, task.cpus - 2) 45 | """ 46 | ctat-LR-fusion \ 47 | --LR_bam tagged.bam \ 48 | --genome_lib_dir ./ctat_reference_bundle \ 49 | --CPU ${threads} --vis --output ${ctat_outdir} 50 | 51 | if [ ! -f "${fusion_preds}" ]; then 52 | echo "No fusion candidates found for ${meta.alias}" 53 | # Create an empty file for expected output 54 | touch "${ctat_outdir}/ctat-LR-fusion.fusion_predictions.tsv" 55 | else 56 | n=\$(tail -n +2 "${fusion_preds}" | wc -l) 57 | if [ "\$n" -eq 0 ]; then 58 | echo "Fusion candidates found for ${meta.alias} but none passed filters" 59 | fi 60 | fi 61 | tar -czf "${meta.alias}.ctat-LR-fusion.tar.gz" ${ctat_outdir} 62 | """ 63 | } 64 | 65 | 66 | process format_ctat_output { 67 | label "singlecell" 68 | cpus 1 69 | memory '2 GB' 70 | publishDir "${params.out_dir}/${meta.alias}/fusions", mode: 'copy', pattern: "${meta.alias}.ctat-LR-fusion.fusion_predictions_per*" 71 | input: 72 | tuple val(meta), 73 | path("ctat-LR-fusion.fusion_predictions.tsv"), 74 | path("read_summary_tags.tsv") 75 | output: 76 | tuple val(meta), 77 | path("${meta.alias}.ctat-LR-fusion.fusion_predictions_per-read.tsv"), 78 | emit: read_summary 79 | tuple val(meta), 80 | path("${meta.alias}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv"), 81 | emit: fusion_summary 82 | tuple val(meta), 83 | path("${meta.alias}.ctat-LR-fusion.fusion_summary.tsv"), 84 | emit: cell_summary 85 | script: 86 | """ 87 | workflow-glue format_ctat_output \ 88 | ctat-LR-fusion.fusion_predictions.tsv \ 89 | read_summary_tags.tsv \ 90 | "${meta.alias}.ctat-LR-fusion.fusion_predictions_per-read.tsv" \ 91 | "${meta.alias}.ctat-LR-fusion.fusion_predictions_per-fusion.tsv" \ 92 | "${meta.alias}.ctat-LR-fusion.fusion_summary.tsv" \ 93 | ${meta.alias} 94 | """ 95 | } 96 | 97 | 98 | 99 | workflow ctat_lr_fusion { 100 | take: 101 | tagged_bam_and_summary 102 | ctat_reference_bundle 103 | main: 104 | find_fusions( 105 | tagged_bam_and_summary.map{meta, bam, _bai, _read_summary -> [meta, bam]}, 106 | ctat_reference_bundle) 107 | 108 | find_fusions.out.stdout.map {stdout -> 109 | if (stdout) { 110 | log.warn(stdout) 111 | } 112 | } 113 | 114 | format_ctat_output( 115 | find_fusions.out.ctat_fusion_predictions 116 | .join(tagged_bam_and_summary 117 | .map {meta, _bam, _bai, read_summary -> [meta, read_summary]}) 118 | ) 119 | 120 | emit: 121 | read_summary = format_ctat_output.out.read_summary 122 | fusion_summary = format_ctat_output.out.fusion_summary 123 | cell_summary = format_ctat_output.out.cell_summary 124 | 125 | } -------------------------------------------------------------------------------- /subworkflows/preprocess.nf: -------------------------------------------------------------------------------- 
 1 | process call_paftools {
 2 |     label "singlecell"
 3 |     memory "2 GB"
 4 |     cpus 1
 5 |     input:
 6 |         path "ref_genes.gtf"
 7 |     output:
 8 |         path "ref_genes.bed", emit: ref_genes_bed
 9 |     """
10 |     paftools.js gff2bed -j ref_genes.gtf > ref_genes.bed
11 |     """
12 | }
13 | 
14 | 
15 | process build_minimap_index {
16 |     /*
17 |     Build minimap index from reference genome
18 |     */
19 |     label "singlecell"
20 |     cpus params.threads
21 |     memory '16 GB'
22 |     input:
23 |         path "reference.fa"
24 |     output:
25 |         path "genome_index.mmi", emit: index
26 |     script:
27 |     """
28 |     minimap2 -t ${task.cpus} -I 16G -d "genome_index.mmi" "reference.fa"
29 |     """
30 | }
31 | 
32 | 
33 | process call_adapter_scan {
34 |     label "singlecell"
35 |     cpus params.threads
36 |     // memory here is taken by minimap2. Having merged the three steps into one,
37 |     // we have perhaps reduced parallelism in the workflow, because in some setups
38 |     // it might be the case that multiple tasks of the first two steps cannot now run
39 |     // in parallel. The resolution to that would be to parallelise the first two
40 |     // steps better. The advantage here is not having to write to disk, stage files
41 |     // and read from disk between the steps (creating a lot of big temporary files).
42 |     //
43 |     // peak RSS for aligning this data is robustly <12.4 GB with a human reference. Set
44 |     // a little more and allow a retry.
45 |     memory {15.GB * task.attempt}
46 |     maxRetries 1
47 |     errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
48 |     input:
49 |         tuple val(meta), path(chunk, stageAs: 'chunk.fq.gz')
50 |         path "bc_longlist_dir"
51 |         path "genome_index.mmi"
52 |         path "ref_genes.bed"
53 |     output:
54 |         tuple val(meta), path("adapters.json"), emit: adapter_summary
55 |         tuple val(meta), path("read_tags.tsv"), emit: read_tags
56 |         tuple val(meta), path("high_quality_bc_counts.tsv"), emit: barcode_counts
57 |         tuple val(meta), path("sorted.bam"), path("sorted.bam.bai"), emit: bam_sort
58 |         tuple val(meta), path("bamstats.tsv"), emit: bam_stats
59 |     script:
60 |     def fl = params.full_length_only ? "--keep_fl_only": ""
61 |     // alignment is the real bottleneck here, don't worry about threads
62 |     // for sorting. Just subtract 1 thread as loose bookkeeping. Note the
63 |     // hidden call to vsearch in the first program: the pipe doesn't get
64 |     // going until that's finished. vsearch appears to use all the threads
65 |     // it can get.
66 |     // We set the -K option for minimap2 as the default appears too large to
67 |     // stream data effectively (with the defaults it just blocks, waiting for
68 |     // more data). The effectiveness of this is not clear.
69 | def mm2_threads = task.cpus - 1 70 | """ 71 | export POLARS_MAX_THREADS=$task.cpus 72 | 73 | workflow-glue adapter_scan_vsearch \ 74 | chunk.fq.gz \ 75 | --kit ${meta['kit_name']} \ 76 | --summary "adapters.json" \ 77 | ${fl} \ 78 | | workflow-glue extract_barcode \ 79 | - \ 80 | bc_longlist_dir/${meta['bc_long_list']} \ 81 | --kit ${meta["kit_name"]} \ 82 | --adapter1_suff_length $params.barcode_adapter1_suff_length \ 83 | --min_barcode_qv $params.barcode_min_quality \ 84 | --barcode_length ${meta['barcode_length']} \ 85 | --umi_length ${meta['umi_length']} \ 86 | --output_read_tags "bc_extract.tsv" \ 87 | --output_barcode_counts "high_quality_bc_counts.tsv" \ 88 | | minimap2 -ax splice -uf --MD \ 89 | -t $mm2_threads -K 10M \ 90 | --junc-bed ref_genes.bed \ 91 | --cap-kalloc 100m \ 92 | genome_index.mmi - \ 93 | | samtools view -uh --no-PG - \ 94 | | tee >(seqkit bam -s 2> bamstats.tsv ) \ 95 | | tee >(samtools view - -d SA \ 96 | | awk 'BEGIN{OFS="\t"; print "read_id", "SA"} {print \$1,"True"}' > SA_tags.tsv ) \ 97 | | samtools view -uh -F 256 - \ 98 | | tee >(samtools sort --write-index -o "sorted.bam"##idx##"sorted.bam.bai" --no-PG -) \ 99 | | seqkit bam -F - 2> bam_info.tsv 100 | 101 | # TODO: improve this with pipes? 102 | csvtk cut -tlf Read,Pos,EndPos,Ref,MapQual bam_info.tsv > bam_info_cut.tsv 103 | # Left join of barcode 104 | csvtk join -tlf 1 bam_info_cut.tsv bc_extract.tsv --left-join \ 105 | | csvtk rename -tl -f Read,Pos,EndPos,Ref,MapQual -n read_id,start,end,chr,mapq -o read_tags_interim.tsv 106 | 107 | # Merge the SA column with the read tags on read_id 108 | if [ \$(wc -l < SA_tags.tsv) -eq 1 ]; then 109 | echo "No SA tags found" 110 | # Add an empty SA column 111 | csvtk mutate2 -t -n 'SA' -e " '' " read_tags_interim.tsv > read_tags.tsv 112 | else 113 | csvtk -t uniq SA_tags.tsv | csvtk join -t --left-join --fields read_id read_tags_interim.tsv - > read_tags.tsv 114 | fi 115 | rm bam_info.tsv bam_info_cut.tsv bc_extract.tsv read_tags_interim.tsv 116 | """ 117 | } 118 | 119 | 120 | process summarize_adapter_table { 121 | label "singlecell" 122 | publishDir "${params.out_dir}/${meta.alias}", mode: 'copy' 123 | cpus 1 124 | memory "1 GB" 125 | input: 126 | tuple val(meta), path("inputs/summary*.json") 127 | output: 128 | tuple val(meta), path("${meta.alias}.config_stats.json"), emit: config_stats 129 | """ 130 | workflow-glue summarise_adapters inputs "${meta.alias}.config_stats.json" 131 | """ 132 | } 133 | 134 | 135 | // workflow module 136 | workflow preprocess { 137 | take: 138 | read_chunks 139 | bc_longlist_dir 140 | ref_genome_fasta 141 | ref_genome_idx 142 | ref_genes_gtf 143 | main: 144 | // alignment pre-requisites 145 | call_paftools(ref_genes_gtf) 146 | build_minimap_index(ref_genome_fasta) 147 | 148 | // find adapters, trim barcodes, and align 149 | call_adapter_scan( 150 | read_chunks, 151 | bc_longlist_dir, 152 | build_minimap_index.out.index, 153 | call_paftools.out.ref_genes_bed) 154 | 155 | // TODO: we don't necessarily need to merge these, they 156 | // could just be given to the final reporting 157 | // without pre-aggregating 158 | summarize_adapter_table( 159 | call_adapter_scan.out.adapter_summary.groupTuple()) 160 | 161 | emit: 162 | bam_sort = call_adapter_scan.out.bam_sort 163 | bam_stats = call_adapter_scan.out.bam_stats 164 | read_tags = call_adapter_scan.out.read_tags 165 | high_qual_bc_counts = call_adapter_scan.out.barcode_counts 166 | adapter_summary = call_adapter_scan.out.adapter_summary 167 | } 168 | 
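A note on the tag-merging step in `call_adapter_scan` above: `csvtk join --left-join` is
used so that reads without supplementary alignments are still kept in `read_tags.tsv`,
just with an empty `SA` field. A minimal sketch of those join semantics, using made-up
read IDs and a reduced set of columns (not data from the workflow):

    printf 'read_id\tchr\tmapq\nread01\tchr1\t60\nread02\tchr2\t35\n' > read_tags_interim.tsv
    printf 'read_id\tSA\nread02\tTrue\n' > SA_tags.tsv
    # the left join keeps both reads; read01 simply gets an empty SA value
    csvtk join -t --left-join --fields read_id read_tags_interim.tsv SA_tags.tsv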
-------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Pytests argument definitions.""" 3 | 4 | 5 | def pytest_addoption(parser): 6 | """Define command line arguments for pytest.""" 7 | parser.addoption( 8 | "--wf_out_dir", 9 | action='store', 10 | default='/host/wf-single-cell' 11 | ) 12 | parser.addoption( 13 | "--sample_id", 14 | action="store", 15 | default="sample1" 16 | ) 17 | -------------------------------------------------------------------------------- /test/test_ingress.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import chain 3 | import json 4 | from pathlib import Path 5 | import sys 6 | 7 | import pandas as pd 8 | import pytest 9 | 10 | import util 11 | 12 | 13 | ROOT_DIR = Path(__file__).resolve().parent.parent 14 | 15 | 16 | def args(): 17 | """Parse and process input arguments. Use the workflow params for those missing.""" 18 | # get the path to the workflow output directory 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | "--input", 22 | help=( 23 | "Path to input file / directory with input files / directory with " 24 | "sub-directories with input files; will take input path from workflow " 25 | "output if not provided" 26 | ), 27 | ) 28 | parser.add_argument( 29 | "--type", 30 | choices=util.INPUT_TYPES_EXTENSIONS.keys(), 31 | help="Input file type", 32 | required=True, 33 | ) 34 | parser.add_argument( 35 | "--wf-output-dir", 36 | default=ROOT_DIR / "output", 37 | help=( 38 | "path to the output directory where the workflow results have been " 39 | "published; defaults to 'output' in the root directory of the workflow if " 40 | "not provided" 41 | ), 42 | ) 43 | parser.add_argument( 44 | "--sample_sheet", 45 | help=( 46 | "Path to sample sheet CSV file. If not provided, will take sample sheet " 47 | "path from workflow params (if available)." 48 | ), 49 | ) 50 | parser.add_argument( 51 | "--chunk", type=int, 52 | help=( 53 | "Chunk size for output fastq." 54 | ) 55 | ) 56 | args = parser.parse_args() 57 | 58 | input_type = args.type 59 | wf_output_dir = Path(args.wf_output_dir) 60 | ingress_results_dir = ( 61 | wf_output_dir / f"{'xam' if input_type == 'bam' else 'fastq'}_ingress_results" 62 | ) 63 | 64 | # make sure that there are ingress results (i.e. that the workflow has been 65 | # run successfully and that the correct wf output path was provided) 66 | if not ingress_results_dir.exists(): 67 | raise ValueError( 68 | f"{ingress_results_dir} does not exist. Has `wf-template` been run?" 
69 | ) 70 | 71 | # get the workflow params 72 | with open(wf_output_dir / "params.json", "r") as f: 73 | params = json.load(f) 74 | input_path = ( 75 | Path(args.input) if args.input is not None else ROOT_DIR / params[input_type] 76 | ) 77 | sample_sheet = args.sample_sheet 78 | if sample_sheet is None and params["sample_sheet"] is not None: 79 | sample_sheet = ROOT_DIR / params["sample_sheet"] 80 | 81 | # Define output type 82 | output_type = input_type 83 | if params["wf"]["return_fastq"]: 84 | output_type = "fastq" 85 | 86 | if not input_path.exists(): 87 | raise ValueError(f"Input path '{input_path}' does not exist.") 88 | 89 | return input_path, input_type, output_type, sample_sheet, ingress_results_dir, args.chunk, params 90 | 91 | 92 | # prepare data for the tests 93 | @pytest.fixture(scope="module") 94 | def prepare(): 95 | """Prepare data for tests.""" 96 | input_path, input_type, output_type, sample_sheet, ingress_results_dir, chunk_size, params = args() 97 | valid_inputs = util.get_valid_inputs(input_path, input_type, sample_sheet, chunk_size, params) 98 | return ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params 99 | 100 | 101 | # define tests 102 | def test_result_subdirs(prepare): 103 | """ 104 | Test if workflow results dir contains all expected samples. 105 | 106 | Tests if the published sub-directories in `ingress_results_dir` contain all 107 | the samples we expect. 108 | """ 109 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 110 | files = [x for x in ingress_results_dir.iterdir() if x.is_file()] 111 | subdirs = [x.name for x in ingress_results_dir.iterdir() if x.is_dir()] 112 | assert not files, "Files found in top-level dir of ingress results" 113 | assert set(subdirs) == set([meta["alias"] for meta, _ in valid_inputs]) 114 | 115 | 116 | def test_entry_names_and_run_ids(prepare): 117 | """Test sequence names and run IDs. 118 | 119 | Tests if the concatenated sequences indeed contain all the read IDs of the target 120 | files in the valid inputs. 121 | """ 122 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 123 | # unless when run with `--bam ... --wf.return_fastq` the output type (i.e. 
the type 124 | # of the files returned by ingress) is the same as the input type 125 | for meta, path in valid_inputs: 126 | if path is None: 127 | # this sample sheet entry had no input dir (or no reads) 128 | continue 129 | # get entries in the result file produced by the workflow 130 | if chunk_size is not None: 131 | res_seqs_fname = "" 132 | elif output_type == "fastq": 133 | res_seqs_fname = "seqs.fastq.gz" 134 | elif output_type == "bam": 135 | res_seqs_fname = "reads.bam" 136 | else: 137 | raise ValueError(f"Unknown output_type: {output_type}.") 138 | 139 | entries = util.create_preliminary_meta( 140 | ingress_results_dir / meta["alias"] / res_seqs_fname, 141 | output_type, chunk_size, 142 | params["wf"]["return_fastq"]) 143 | 144 | # now collect the entries from the individual input files 145 | exp_read_names = [] 146 | exp_run_ids = [] 147 | target_files = ( 148 | util.get_target_files(path, input_type=input_type) 149 | if path.is_dir() 150 | else [path] 151 | ) 152 | for file in target_files: 153 | if ( 154 | input_type == "bam" 155 | and not params["wf"]["keep_unaligned"] 156 | and util.is_unaligned(file) 157 | ): 158 | continue 159 | curr_entries = util.create_preliminary_meta( 160 | file, input_type, chunk_size, False) 161 | exp_read_names += curr_entries["names"] 162 | exp_run_ids += curr_entries["run_ids"] 163 | assert set(entries["names"]) == set(exp_read_names) 164 | assert set(entries["run_ids"]) == set(exp_run_ids) 165 | 166 | 167 | def test_stats_present(prepare): 168 | """Tests if the `fastcat` stats are present when they should be.""" 169 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 170 | for meta, path in valid_inputs: 171 | if path is None: 172 | # this sample sheet entry had no input dir (or no reads) 173 | continue 174 | if output_type == "fastq": 175 | expect_stats = params["wf"]["fastcat_stats"] 176 | stats_dir_name = "fastcat_stats" 177 | stats_file_names = [ 178 | "per-file-stats.tsv", 179 | "per-read-stats.tsv.gz", 180 | "run_ids", 181 | "length.hist", 182 | "quality.hist" 183 | ] 184 | else: 185 | # `bamstats` we only expect when they were requested 186 | expect_stats = params["wf"]["bamstats"] 187 | stats_dir_name = "bamstats_results" 188 | stats_file_names = [ 189 | "bamstats.readstats.tsv.gz", 190 | "bamstats.flagstat.tsv", 191 | "run_ids", 192 | "accuracy.hist", 193 | "coverage.hist", 194 | "length.hist", 195 | "quality.hist" 196 | ] 197 | stats_dir = ingress_results_dir / meta["alias"] / stats_dir_name 198 | # assert that stats are there when we expect them 199 | assert expect_stats == stats_dir.exists() 200 | # make sure that the per-file stats, per-read stats, and run ID files are there 201 | if expect_stats: 202 | for fname in stats_file_names: 203 | assert ( 204 | ingress_results_dir / meta["alias"] / stats_dir_name / fname 205 | ).is_file() 206 | 207 | 208 | def test_metamap(prepare): 209 | """Test if the metamap in the ingress results is as expected.""" 210 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 211 | for meta, _ in valid_inputs: 212 | # prepare() uses a function to parse both inputs and outputs, 213 | # add in some output specific things 214 | meta = util.add_output_n_fastq(meta, output_type, chunk_size) 215 | sample_results = ingress_results_dir / meta["alias"] 216 | 217 | # if there were no stats, we can't expect run IDs in the metamap 218 | if not list(sample_results.glob("*stats*/run_ids")): 219 | meta["run_ids"] = [] 220 | # if there are not 
stats, reset extra fields to defaults 221 | # could not be stats where barcodes are in sample sheet but not in data. 222 | if output_type == "fastq": 223 | meta["n_seqs"] = None 224 | elif output_type == "bam": 225 | meta["n_primary"] = None 226 | meta["n_unmapped"] = None 227 | 228 | # read what nextflow had 229 | with open(sample_results / "metamap.json", "r") as f: 230 | metamap = json.load(f) 231 | assert meta == metamap 232 | 233 | 234 | def test_reads_sorted(prepare): 235 | """If input type is BAM, test if the emitted files were sorted.""" 236 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 237 | if input_type == "fastq": 238 | return 239 | for meta, _ in valid_inputs: 240 | stats_file = ( 241 | ingress_results_dir 242 | / meta["alias"] 243 | / "bamstats_results" 244 | / "bamstats.readstats.tsv.gz" 245 | ) 246 | if stats_file.exists(): 247 | stats_df = pd.read_csv(stats_file, sep="\t", index_col=0) 248 | # check that the start coordinates of all aligned reads are sorted within 249 | # their respective reference 250 | assert ( 251 | stats_df.query('ref != "*"') 252 | .groupby("ref")["rstart"] 253 | .is_monotonic_increasing.all() 254 | ) 255 | 256 | 257 | def test_reads_index(prepare): 258 | """If input type is BAM, check that the BAI index exists.""" 259 | ingress_results_dir, input_type, output_type, valid_inputs, chunk_size, params = prepare 260 | if output_type == "fastq": 261 | return 262 | for meta, path in valid_inputs: 263 | if path is None: 264 | # this sample sheet entry had no input dir (or no reads) 265 | continue 266 | # Create BAI file path 267 | bai_file = ( 268 | ingress_results_dir 269 | / meta["alias"] 270 | / 'reads.bam.bai' 271 | ) 272 | if not bai_file.is_file(): 273 | raise ValueError(f"Missing index: {bai_file.as_posix()}.") 274 | 275 | 276 | if __name__ == "__main__": 277 | # trigger pytest 278 | ret_code = pytest.main([Path(__file__).resolve(), "-vv", "-s"]) 279 | sys.exit(ret_code) 280 | -------------------------------------------------------------------------------- /test/workflow_integration.py: -------------------------------------------------------------------------------- 1 | """Integration testing of the whole workflow using synthetic data.""" 2 | 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from pytest import fixture 7 | 8 | 9 | @fixture 10 | def wf_out_dir(request): 11 | """Set workflow directory.""" 12 | return request.config.getoption('--wf_out_dir') 13 | 14 | 15 | @fixture 16 | def sample_id(request): 17 | """Set sample ID.""" 18 | return request.config.getoption('--sample_id') 19 | 20 | 21 | def test_workflow(wf_out_dir, sample_id): 22 | """Test the whole Nextflow workflow.""" 23 | out_dir = Path(wf_out_dir) 24 | test_out_dir = out_dir / sample_id 25 | read_tags = test_out_dir / 'sample1.read_summary.tsv' 26 | 27 | assert read_tags.is_file() 28 | 29 | df = pd.read_csv(read_tags, sep='\t') 30 | 31 | # As all reads should be assigned a barcode and UMI, there should be the 32 | # same number of output rows as in reads in the integration test data (1850). 33 | assert len(df) == 1850 34 | 35 | # Extract the expected values from the read_id 36 | df[['true_gene', 'true_transcript', 'true_bc', 'true_umi', 'true_status', '_']] \ 37 | = df['read_id'].str.split('|', expand=True) 38 | 39 | # Check barcode and umis are correctly identified. 
Allow at most one mismatch in each.
40 |     df_barcode_mismatches = df[df.corrected_barcode != df.true_bc]
41 |     assert len(df_barcode_mismatches) < 2
42 | 
43 |     df_umi_mismatches = df[df.true_umi != df.corrected_umi]
44 |     assert len(df_umi_mismatches) < 2
45 | 
46 |     # Check gene assignment
47 |     df_gene_matches = df[df.gene == df.true_gene]
48 |     perc_correct = 100 / len(df) * len(df_gene_matches)
49 |     assert perc_correct == 100.0
50 | 
51 |     # Check transcript assignment
52 |     # We should be getting more than 85% of the transcripts correctly called,
53 |     # especially on this contrived synthetic dataset.
54 |     df_tr_matches = df[df.transcript == df.true_transcript]
55 |     perc_correct = 100 / len(df) * len(df_tr_matches)
56 |     assert perc_correct > 85.0
57 | 
--------------------------------------------------------------------------------
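A hedged usage sketch (paths and values are illustrative, not fixed): once the workflow
has been run and its outputs published, the integration test above can be pointed at the
results via the pytest options registered in `test/conftest.py`:

    # assumes results were published to ./output/sample1
    pytest test/workflow_integration.py -vv \
        --wf_out_dir output \
        --sample_id sample1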