├── .circleci └── config.yml ├── .dockerignore ├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docker_build.sh ├── docs ├── README.md └── source │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── readme.rst │ ├── sctools.metrics.rst │ ├── sctools.rst │ └── sctools.test.rst ├── fastqpreprocessing ├── .gitignore ├── Makefile ├── patches │ ├── BgzfFileType.cpp.patch │ ├── FastQFile.cpp.patch │ ├── Makefile.patch │ └── general.Makefile.patch ├── src │ ├── example-run.sh │ ├── fastq_common.cpp │ ├── fastq_common.h │ ├── fastq_metrics.cpp │ ├── fastq_metrics.h │ ├── fastq_slideseq.cpp │ ├── fastqprocess.cpp │ ├── htslib_tagsort.cpp │ ├── htslib_tagsort.h │ ├── input_options.cpp │ ├── input_options.h │ ├── metricgatherer.cpp │ ├── metricgatherer.h │ ├── samplefastq.cpp │ ├── tagsort.cpp │ ├── utilities.cpp │ └── utilities.h └── utils │ ├── big-run.sh │ ├── check_barcode_partition.py │ ├── create_fastq.sh │ ├── example-run.sh │ └── run.sh ├── pull_request_template.md ├── readthedocs.yml ├── requirements.txt ├── security.txt ├── setup.py └── src └── sctools ├── __init__.py ├── bam.py ├── barcode.py ├── consts.py ├── count.py ├── encodings.py ├── fastq.py ├── groups.py ├── gtf.py ├── metrics ├── README.md ├── __init__.py ├── aggregator.py ├── gatherer.py ├── merge.py └── writer.py ├── platform.py ├── reader.py ├── stats.py └── test ├── __init__.py ├── characterize-cell-testing-data.ipynb ├── characterize-gene-testing-data.ipynb ├── data ├── 1k-august-2016.txt ├── cell-gene-umi-queryname-sorted.bam ├── cell-sorted-missing-cb.bam ├── cell-sorted.bam ├── cell_metrics_missing_cb.csv.gz ├── chr1.30k_records.gtf.gz ├── group_metrics │ ├── expected_picard_group.csv │ ├── test_hisat2.csv │ ├── test_hisat2_paired_end_qc.log │ ├── test_hisat2_trans.csv │ ├── test_hisat2_transcriptome_rsem.log │ ├── test_picard_group.csv │ ├── test_qc.alignment_summary_metrics.txt │ ├── test_qc.duplicate_metrics.txt │ ├── 
test_qc.error_summary_metrics.txt │ ├── test_qc.gc_bias.summary_metrics.txt │ ├── test_qc.insert_size_metrics.txt │ ├── test_qc.rna_metrics.txt │ ├── test_rsem.cnt │ └── test_rsem.csv ├── group_metrics_unpaired_ss2 │ ├── SRR6258488_qc.alignment_summary_metrics.txt │ ├── SRR6258488_qc.duplicate_metrics.txt │ ├── SRR6258488_qc.gc_bias.summary_metrics.txt │ └── SRR6258488_qc.rna_metrics.txt ├── small-cell-sorted.bam ├── small-gene-sorted.bam ├── test.bam ├── test.gtf ├── test.gtf.bz2 ├── test.gtf.gz ├── test.sam ├── test_i7.fastq ├── test_i7.fastq.bz2 ├── test_i7.fastq.gz ├── test_r1.fastq ├── test_r1.fastq.bz2 ├── test_r1.fastq.gz ├── test_r2.bam ├── test_r2.fastq ├── test_r2.fastq.bz2 ├── test_r2.fastq.gz ├── test_r2_tagged.bam └── unsorted.bam ├── test_bam.py ├── test_barcode.py ├── test_count.py ├── test_encodings.py ├── test_entrypoints.py ├── test_fastq.py ├── test_groups.py ├── test_gtf.py ├── test_metrics.py ├── test_platform.py └── test_stats.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | - image: circleci/python:3.6.1 11 | 12 | # Specify service dependencies here if necessary 13 | # CircleCI maintains a library of pre-built images 14 | # documented at https://circleci.com/docs/2.0/circleci-images/ 15 | # - image: circleci/postgres:9.4 16 | 17 | working_directory: ~/repo 18 | 19 | steps: 20 | - checkout 21 | 22 | # Download and cache dependencies 23 | - restore_cache: 24 | keys: 25 | - v1-dependencies-{{ checksum "requirements.txt" }} 26 | # fallback to using the latest cache if no exact match is found 27 | - v1-dependencies- 28 | 29 | - run: 30 | name: install dependencies 31 | command: | 32 | python3 -m venv venv 33 | . 
venv/bin/activate 34 | pip install -r requirements.txt 35 | pip install codecov 36 | 37 | - save_cache: 38 | paths: 39 | - ./venv 40 | key: v1-dependencies-{{ checksum "requirements.txt" }} 41 | 42 | # run tests! 43 | # https://pytest.org 44 | # And upload reports to codecov.io 45 | - run: 46 | name: linting test 47 | command: | 48 | . venv/bin/activate 49 | # Check Black code style compliance 50 | black ./ --skip-string-normalization --check --exclude venv 51 | # Check PEP-8 compliance 52 | flake8 53 | 54 | - run: 55 | name: run tests 56 | command: | 57 | . venv/bin/activate 58 | mkdir test-reports 59 | pytest --junitxml=test-reports/junit.xml --cov=sctools 60 | codecov 61 | 62 | - store_artifacts: 63 | path: test-reports 64 | destination: test-reports 65 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | #files ignored when building docker image 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # We ignore the following PEP-8 styles: 2 | 3 | # E203 whitespace before ‘:’ 4 | # E266 too many leading ‘#’ for block comment 5 | # E501 line too long (82 > 79 characters) (^) 6 | # W503 line break occurred before a binary operator 7 | # F841 local variable is assigned to but never used 8 | # W605 invalid escape sequence (causes false alarms around regex) 9 | 10 | # Note: (^) These checks can be disabled at the 11 | # line level using the # noqa special comment. 12 | # This possibility should be reserved for special cases. 
13 | 14 | [flake8] 15 | ignore = E203, E266, E501, W503, F841, W605 16 | max-complexity = 18 17 | select = B,C,E,F,W,T4,B9 18 | exclude = 19 | # No need to traverse the virtualenv directory, which'll be created by Circle CI 20 | venv 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.idea 6 | .pytest_cache 7 | 8 | # C extensions 9 | *.so 10 | *.o 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | test/data/bam_with_tags_test.bam 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | docs/generated 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | *.DS_Store 104 | 105 | # do not check in the executable and bam file 106 | fastqpreprocessing/src/fastqprocess 107 | fastqpreprocessing/src/TagSort 108 | fastqpreprocessing/src/obj/ 109 | fastqpreprocessing/bin/ 110 | src/sctools/test/data/bam_with_tags_test.bam 111 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 19.3b0 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | # Using args here is not recommended by Black: 8 | # https://black.readthedocs.io/en/stable/version_control_integration.html 9 | # But since we only have one argument here, and 10 | # we don't force developers to use editor plugins, 11 | # putting the args here seems to be fine 12 | args: [./, --skip-string-normalization] 13 | 14 | - repo: 
https://gitlab.com/pycqa/flake8 15 | rev: 3.7.7 16 | hooks: 17 | - id: flake8 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7 2 | 3 | LABEL maintainer="Farzaneh Khajouei " \ 4 | software="sctools v.1.0.0" \ 5 | description="A collection of tools for single cell data. Splitting fastq files based on cellbarcodes and other tools to compute metrics on single cell data using barcodes and UMIs." 6 | 7 | 8 | RUN apt-get update && apt-get upgrade -y && apt-get install -y patch libhdf5-dev vim apt-utils 9 | RUN mkdir /sctools/ 10 | 11 | COPY . /sctools 12 | 13 | ARG htslib_version="1.13" 14 | 15 | RUN cd /sctools/fastqpreprocessing &&\ 16 | wget https://github.com/khajoue2/libStatGen/archive/refs/tags/v1.0.15.broad.tar.gz &&\ 17 | wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 &&\ 18 | tar -zxvf v1.0.15.broad.tar.gz &&\ 19 | tar -jxvf htslib-${htslib_version}.tar.bz2 &&\ 20 | mv libStatGen-1.0.15.broad libStatGen 21 | 22 | RUN cd /sctools/fastqpreprocessing &&\ 23 | wget http://www.cs.unc.edu/Research/compgeom/gzstream/gzstream.tgz &&\ 24 | tar -xvf gzstream.tgz 25 | 26 | RUN cd /sctools/fastqpreprocessing &&\ 27 | make -C libStatGen 28 | 29 | RUN cd /sctools/fastqpreprocessing && make -C htslib-${htslib_version}/ && make -C gzstream 30 | 31 | RUN cd /sctools/fastqpreprocessing && mkdir bin obj && make install 32 | 33 | RUN cp /sctools/fastqpreprocessing/bin/* /usr/local/bin/ 34 | 35 | WORKDIR usr/local/bin/sctools 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Human Cell Atlas Authors, https://humancellatlas.org 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name Broad Institute, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/sctools/test/data/* 2 | include README.rst 3 | include LICENSE -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Single Cell Tools 2 | 
################# 3 | 4 | .. image:: https://img.shields.io/circleci/project/github/HumanCellAtlas/sctools.svg?label=Unit%20Test%20on%20Circle%20CI%20&style=flat-square&logo=circleci 5 | :target: https://circleci.com/gh/HumanCellAtlas/sctools/tree/master 6 | :alt: Unit Test Status 7 | 8 | .. image:: https://img.shields.io/codecov/c/github/HumanCellAtlas/sctools/master.svg?label=Test%20Coverage&logo=codecov&style=flat-square 9 | :target: https://codecov.io/gh/HumanCellAtlas/sctools 10 | :alt: Test Coverage on Codecov 11 | 12 | .. image:: https://img.shields.io/readthedocs/sctools/latest.svg?label=ReadtheDocs%3A%20Latest&logo=Read%20the%20Docs&style=flat-square 13 | :target: http://sctools.readthedocs.io/en/latest/?badge=latest 14 | :alt: Documentation Status 15 | 16 | .. image:: https://img.shields.io/snyk/vulnerabilities/github/HumanCellAtlas/sctools/requirements.txt.svg?label=Snyk%20Vulnerabilities&logo=Snyk 17 | :target: https://snyk.io/test/github/HumanCellAtlas/sctools/?targetFile=requirements.txt 18 | :alt: Snyk Vulnerabilities for GitHub Repo (Specific Manifest) 19 | 20 | .. image:: https://img.shields.io/github/release/HumanCellAtlas/sctools.svg?label=Latest%20Release&style=flat-square&colorB=green 21 | :target: https://github.com/HumanCellAtlas/sctools/releases 22 | :alt: Latest Release 23 | 24 | .. image:: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square 25 | :target: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square 26 | :alt: License 27 | 28 | .. image:: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue 29 | :target: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue 30 | :alt: Language 31 | 32 | .. 
image:: https://img.shields.io/badge/Code%20Style-black-000000.svg?style=flat-square 33 | :target: https://github.com/ambv/black 34 | :alt: Code Style 35 | 36 | Single Cell Tools provides utilities for manipulating sequence data formats suitable for use in 37 | distributed systems analyzing large biological datasets. 38 | 39 | Download and Installation 40 | ========================= 41 | 42 | .. code bash 43 | git clone https://github.com/humancellatlas/sctools.git 44 | cd sctools 45 | pip3 install . 46 | pytest # verify installation; run tests 47 | 48 | sctools Package 49 | =============== 50 | 51 | The sctools package provides both command line utilities and classes designed for use in python 52 | programs. 53 | 54 | Command Line Utilities 55 | ====================== 56 | 57 | 1. Attach10XBarcodes: Attached barcodes stored in fastq files to reads in an unaligned bam file 58 | 2. SplitBam: Split a bam file into chunks, guaranteeing that cells are contained in 1 chunk 59 | 3. CalculateGeneMetrics: Calculate information about genes in an experiment or chunk 60 | 4. CalculateCellMetrics: Calculate information about cells in an experiment or chunk 61 | 5. MergeGeneMetrics: Merge gene metrics calculated from different chunks of an experiment 62 | 6. MergeCellMetrics Merge cell metrics calculated from different chunks of an experiment 63 | 64 | Main Package Classes 65 | ==================== 66 | 67 | 1. **Platform**: an abstract class that defines a common data structure for different 3' sequencing 68 | formats. All algorithms and methods in this package that are designed to work on 3' sequencing data 69 | speak to this common data structure. Currently 10X_v2 is defined. 70 | 71 | 2. **Reader**: a general iterator over arbitrarily zipped file(s) that is extended to work with common 72 | sequence formats like fastq (fastq.Reader) and gtf (gtf.Reader). We recommend using the pysam 73 | package for reading sam and bam files. 74 | 75 | 3. 
**TwoBit & ThreeBit** DNA encoders that store DNA in 2- and 3-bit form. 2-bit is smaller but 76 | randomizes "N" nucleotides. Both classes support fastq operations over common sequence tasks such 77 | as the calculation of GC content. 78 | 79 | 4. **ObservedBarcodeSet & PriorBarcodeSet**: classes for analysis and comparison of sets of barcodes 80 | such as the cell barcodes used by 10X genomics. Supports operations like summarizing hamming 81 | distances and comparing observed sequence diversity to expected (normally uniform) diversity. 82 | 83 | 5. **gtf.Reader & gtf.Record** GTF iterator and GTF record class that exposes the gtf 84 | fields as a lightweight, lazy-parsed python object. 85 | 86 | 6. **fastq.Reader & fastq.Record** fastq reader and fastq record class that exposes the fastq fields 87 | as a lightweight, lazy-parsed python object. 88 | 89 | 7. **Metrics** calculate information about the genes and cells of an experiment 90 | 91 | 8. **Bam** Split bam files into chunks and attach barcodes as tags 92 | 93 | 94 | Viewing Test Results and Coverage 95 | ================================= 96 | 97 | To calculate and view test coverage cd to the ``sctools`` directory and 98 | type the following two commands to generate the report and open it in your web browser: 99 | 100 | .. code:: bash 101 | 102 | pytest --cov-report html:cov_html --cov=sctools 103 | open cov_html/index.html 104 | 105 | Definitions 106 | =========== 107 | 108 | Several definitions are helpful to understand how sequence data is analyzed. 109 | 110 | 1. **Cell**: an individual cell, the target of single-cell RNA-seq experiments and the entity that we 111 | wish to characterize 112 | 113 | 2. **Capture Primer**: A DNA oligonucleotide containing amplification machinery, a fixed cell barcode, 114 | a random molecule barcode, and an oligo-dT tail to capture poly-adenylated RNA 115 | 116 | 3. 
**Molecule**: A molecule refers to a single mRNA molecule that is captured by an oligo-dT capture 117 | primer in a single-cell sequencing experiment 118 | 119 | 4. **Molecule Barcode**: A molecule barcode (alias: UMI, RMT) is a short, random DNA barcode attached 120 | to the capture primer that has adequate length to be probabilistically unique across the experiment. 121 | Therefore, when multiple molecules of the same gene are captured in the same cell, they can be 122 | differentiated through having different molecule barcodes. The proposed GA4GH standard tag for a 123 | molecule barcode is UB and molecule barcode qualities is UY 124 | 125 | 5. **Cell Barcode**: A short DNA barcode that is typically selected from a whitelist of barcodes that 126 | will be used in an experiment. All capture primers for a given cell will contain the same cell 127 | barcode. The proposed GA4GH standard tag for a cell barcode is CB and cell barcode qualities is CY 128 | 129 | 6. **Fragment**: During library construction, mRNA molecules captured on capture primers are amplified, 130 | and the resulting amplified oligonucleotides are fragmented. In 3' experiments, only the fragment 131 | that contains the 3' end is retained, but the break point will be random, which means fragments 132 | often have different lengths. Once sequenced, different fragments can be identified as unique 133 | combinations of cell barcode, molecule barcode, the chromosome the sequence aligns to, and the 134 | position it aligns to on that chromosome, after correcting for clipping that the aligner may add 135 | 136 | 7. **Bam/Sam file**: The GA4GH standard file type for the storage of aligned sequencing reads. 
137 | Unless specified, our Single Cell Tools will operate over bam files containing either aligned or 138 | unaligned reads 139 | 140 | Development 141 | =========== 142 | 143 | Code Style 144 | ---------- 145 | The sctools code base is complying with the PEP-8 and using `Black `_ to 146 | format our code, in order to avoid "nitpicky" comments during the code review process so we spend more time discussing about the logic, 147 | not code styles. 148 | 149 | In order to enable the auto-formatting in the development process, you have to spend a few seconds setting 150 | up the ``pre-commit`` the first time you clone the repo: 151 | 152 | 1. Install ``pre-commit`` by running: ``pip install pre-commit`` (or simply run ``pip install -r requirements.txt``). 153 | 2. Run `pre-commit install` to install the git hook. 154 | 155 | Once you successfully install the ``pre-commit`` hook to this repo, the Black linter/formatter will be automatically triggered and run on this repo. Please make sure you followed the above steps, otherwise your commits might fail at the linting test! 156 | 157 | If you really want to manually trigger the linters and formatters on your code, make sure ``Black`` and ``flake8`` are installed in your Python environment and run ``flake8 DIR1 DIR2`` and ``black DIR1 DIR2 --skip-string-normalization`` respectively. 
158 | -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Update version when changes to Dockerfile are made 5 | DOCKER_IMAGE_VERSION=1.0.0 6 | TIMESTAMP=$(date +"%s") 7 | DIR=$(cd $(dirname $0) && pwd) 8 | 9 | # Registries and tags 10 | GCR_URL="us.gcr.io/broad-gotc-prod/sctools" 11 | 12 | # sctools version 13 | SCTOOLS_VERSION="v0.3.15" 14 | 15 | # Necessary tools and help text 16 | TOOLS=(docker gcloud) 17 | HELP="$(basename "$0") [-h|--help] [-v|--version] [-t|tools] -- script to build the sctools image and push to GCR & Quay 18 | 19 | where: 20 | -h|--help Show help text 21 | -v|--version Version of Samtools to use (default: $SCTOOLS_VERSION) 22 | -t|--tools Show tools needed to run script 23 | " 24 | 25 | function main(){ 26 | for t in "${TOOLS[@]}"; do which "$t" >/dev/null || ok=no; done 27 | if [[ $ok == no ]]; then 28 | echo "Missing one of the following tools: " 29 | for t in "${TOOLS[@]}"; do echo "$t"; done 30 | exit 1 31 | fi 32 | 33 | while [[ $# -gt 0 ]] 34 | do 35 | key="$1" 36 | case $key in 37 | -v|--version) 38 | SCTOOLS_VERSION="$2" 39 | shift 40 | shift 41 | ;; 42 | -h|--help) 43 | echo "$HELP" 44 | exit 0 45 | ;; 46 | -t|--tools) 47 | for t in "${TOOLS[@]}"; do echo "$t"; done 48 | exit 0 49 | ;; 50 | *) 51 | shift 52 | ;; 53 | esac 54 | done 55 | 56 | IMAGE_TAG="$DOCKER_IMAGE_VERSION-$SCTOOLS_VERSION-$TIMESTAMP" 57 | 58 | echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG" 59 | docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \ 60 | --build-arg SCTOOLS_VERSION="$SCTOOLS_VERSION" "$DIR" 61 | docker push "$GCR_URL:$IMAGE_TAG" 62 | 63 | echo -e "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv" 64 | echo "done" 65 | } 66 | 67 | main "$@" 68 | -------------------------------------------------------------------------------- /docs/README.md: 
-------------------------------------------------------------------------------- 1 | # Build Docs 2 | 3 | 1. Make sure you have [Sphinx](http://www.sphinx-doc.org/en/stable/) installed. 4 | 2. Install the sctools package in advance following the instructions. 5 | 3. From the current directory (/docs/), type: 6 | 7 | ```bash 8 | make target 9 | ``` 10 | where `target` is one of {html, epub, latex, ...}. For more details about the sphinx builders, check [here](http://www.sphinx-doc.org/en/master/man/sphinx-build.html) 11 | 12 | Note that there are still some bugs to be worked out. 13 | - There are warnings about: 14 | ``` 15 | WARNING: [autosummary] failed to import 'sctools.metrics.CellMetrics': no module named sctools.metrics.CellMetrics 16 | WARNING: [autosummary] failed to import 'sctools.metrics.GeneMetrics': no module named sctools.metrics.GeneMetrics 17 | WARNING: [autosummary] failed to import 'sctools.metrics.MetricAggregatorBase': no module named sctools.metrics.MetricAggregatorBase 18 | ``` 19 | 20 | - There are a bunch of warnings: `WARNING: Unexpected section title.` 21 | - There are a bunch of warnings: `WARNING: toctree contains reference to nonexisting document` 22 | 23 | Most of the warnings can be solved by refactoring the docstrings and standardize the usages of `autosummary` later. 24 | -------------------------------------------------------------------------------- /docs/source/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SCTools 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('..')) 18 | from pkg_resources import get_distribution 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'SC Tools' 24 | copyright = '2018, Ambrose J. Carr' 25 | author = 'Ambrose J. Carr' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = get_distribution('sctools').version 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. 
They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.doctest', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.viewcode', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.autosummary', 49 | ] 50 | 51 | # Add any paths that contain templates here, relative to this directory. 52 | templates_path = ['_templates'] 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # 57 | # source_suffix = ['.rst', '.md'] 58 | source_suffix = ['.rst', '.md'] 59 | 60 | # The master toctree document. 61 | master_doc = 'index' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path . 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. See the documentation for 82 | # a list of builtin themes. 83 | # 84 | html_theme = 'sphinx_rtd_theme' 85 | 86 | # Theme options are theme-specific and customize the look and feel of a theme 87 | # further. For a list of options available for each theme, see the 88 | # documentation. 89 | # 90 | # html_theme_options = {} 91 | 92 | # Add any paths that contain custom static files (such as style sheets) here, 93 | # relative to this directory. 
They are copied after the builtin static files, 94 | # so a file named "default.css" will overwrite the builtin "default.css". 95 | html_static_path = ['_static'] 96 | 97 | # Custom sidebar templates, must be a dictionary that maps document names 98 | # to template names. 99 | # 100 | # The default sidebars (for documents that don't match any pattern) are 101 | # defined by theme itself. Builtin themes are using these templates by 102 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 103 | # 'searchbox.html']``. 104 | # 105 | # html_sidebars = {} 106 | 107 | 108 | # -- Options for HTMLHelp output --------------------------------------------- 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'SCToolsdoc' 112 | 113 | 114 | # -- Options for LaTeX output ------------------------------------------------ 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | # The font size ('10pt', '11pt' or '12pt'). 121 | # 122 | # 'pointsize': '10pt', 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'SCTools.tex', 'SC Tools Documentation', 'Ambrose J. Carr', 'manual') 136 | ] 137 | 138 | 139 | # -- Options for manual page output ------------------------------------------ 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 
143 | man_pages = [(master_doc, 'sctools', 'SC Tools Documentation', [author], 1)] 144 | 145 | 146 | # -- Options for Texinfo output ---------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | ( 153 | master_doc, 154 | 'SCTools', 155 | 'SC Tools Documentation', 156 | author, 157 | 'SCTools', 158 | 'One line description of project.', 159 | 'Miscellaneous', 160 | ) 161 | ] 162 | 163 | 164 | # -- Extension configuration ------------------------------------------------- 165 | numpydoc_show_class_members = False 166 | autosummary_generate = True 167 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 1 3 | :caption: Overview 4 | 5 | readme 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | :caption: API References 10 | 11 | sctools 12 | sctools.metrics 13 | sctools.test 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../README.rst 2 | -------------------------------------------------------------------------------- /docs/source/sctools.metrics.rst: -------------------------------------------------------------------------------- 1 | sctools.metrics package 2 | ======================= 3 | 4 | Submodules 5 | ~~~~~~~~~~ 6 | 7 | sctools.metrics.aggregator module 8 | --------------------------------- 9 | 10 | .. 
automodule:: sctools.metrics.aggregator 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | :inherited-members: 15 | 16 | sctools.metrics.gatherer module 17 | ------------------------------- 18 | 19 | .. automodule:: sctools.metrics.gatherer 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | :inherited-members: 24 | 25 | sctools.metrics.merge module 26 | ---------------------------- 27 | 28 | .. automodule:: sctools.metrics.merge 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | :inherited-members: 33 | 34 | sctools.metrics.writer module 35 | ----------------------------- 36 | 37 | .. automodule:: sctools.metrics.writer 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | :inherited-members: 42 | -------------------------------------------------------------------------------- /docs/source/sctools.rst: -------------------------------------------------------------------------------- 1 | sctools package 2 | =============== 3 | 4 | 5 | Submodules 6 | ~~~~~~~~~~ 7 | 8 | sctools.bam module 9 | ------------------ 10 | 11 | .. automodule:: sctools.bam 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | :inherited-members: 16 | 17 | sctools.barcode module 18 | ---------------------- 19 | 20 | .. automodule:: sctools.barcode 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | :inherited-members: 25 | 26 | sctools.encodings module 27 | ------------------------ 28 | 29 | .. automodule:: sctools.encodings 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | :inherited-members: 34 | 35 | sctools.fastq module 36 | -------------------- 37 | 38 | .. automodule:: sctools.fastq 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | :inherited-members: 43 | 44 | sctools.gtf module 45 | ------------------ 46 | 47 | .. 
automodule:: sctools.gtf 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | :inherited-members: 52 | 53 | sctools.platform module 54 | ----------------------- 55 | 56 | .. automodule:: sctools.platform 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | :inherited-members: 61 | 62 | sctools.reader module 63 | --------------------- 64 | 65 | .. automodule:: sctools.reader 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | :inherited-members: 70 | 71 | sctools.stats module 72 | -------------------- 73 | 74 | .. automodule:: sctools.stats 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | :inherited-members: 79 | -------------------------------------------------------------------------------- /docs/source/sctools.test.rst: -------------------------------------------------------------------------------- 1 | sctools.test package 2 | ==================== 3 | 4 | Submodules 5 | ~~~~~~~~~~ 6 | 7 | sctools.test.test\_bam module 8 | ----------------------------- 9 | 10 | .. automodule:: sctools.test.test_bam 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | sctools.test.test\_barcode module 16 | --------------------------------- 17 | 18 | .. automodule:: sctools.test.test_barcode 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | sctools.test.test\_encodings module 24 | ----------------------------------- 25 | 26 | .. automodule:: sctools.test.test_encodings 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | sctools.test.test\_entrypoints module 32 | ------------------------------------- 33 | 34 | .. automodule:: sctools.test.test_entrypoints 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | sctools.test.test\_fastq module 40 | ------------------------------- 41 | 42 | .. 
automodule:: sctools.test.test_fastq 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | sctools.test.test\_gtf module 48 | ----------------------------- 49 | 50 | .. automodule:: sctools.test.test_gtf 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | sctools.test.test\_metrics module 56 | --------------------------------- 57 | 58 | .. automodule:: sctools.test.test_metrics 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | sctools.test.test\_stats module 64 | ------------------------------- 65 | 66 | .. automodule:: sctools.test.test_stats 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /fastqpreprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.a 4 | *.bak 5 | dox/ 6 | dox_errors.txt 7 | *# 8 | *nohup.txt 9 | -------------------------------------------------------------------------------- /fastqpreprocessing/Makefile: -------------------------------------------------------------------------------- 1 | IDIR1 = libStatGen/include 2 | IDIR2 = htslib-1.13 3 | IDIR3 = gzstream 4 | 5 | CC = g++ -std=c++17 -fPIC -DHTSLIB -Wall -O4 -Wwrite-strings 6 | 7 | CFLAGS = -I$(IDIR1) -LlibStatGen -Lgzstream 8 | 9 | LIBS = -LlibStatGen -lStatGen -lz -lpthread -lstdc++fs -Lgzstream -lgzstream 10 | 11 | _DEPS = src/utilities.h src/input_options.h src/fastq_common.h 12 | 13 | TARGET1 = bin/fastqprocess 14 | TARGET1_OBJ = obj/fastqprocess.o 15 | 16 | TARGET2 = bin/TagSort 17 | TARGET2_OBJ = obj/tagsort.o obj/htslib_tagsort.o obj/metricgatherer.o 18 | 19 | TARGET3 = bin/fastq_slideseq 20 | TARGET3_OBJ = obj/fastq_slideseq.o 21 | 22 | TARGET4 = bin/fastq_metrics 23 | TARGET4_OBJ = obj/fastq_metrics.o 24 | 25 | TARGET5 = bin/samplefastq 26 | TARGET5_OBJ = obj/samplefastq.o 27 | 28 | install: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 29 | cp 
htslib-1.13/*.so.? bin/ 30 | 31 | all: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 32 | 33 | COMMON_OBJ = obj/utilities.o obj/input_options.o obj/fastq_common.o 34 | 35 | obj/%.o: src/%.cpp $(_DEPS) 36 | $(CC) -c -o $@ $< -I$(IDIR1) -I$(IDIR2) -I$(IDIR3) 37 | 38 | $(TARGET1): $(COMMON_OBJ) $(TARGET1_OBJ) 39 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 40 | 41 | $(TARGET2): $(COMMON_OBJ) $(TARGET2_OBJ) 42 | $(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. -o $@ $(COMMON_OBJ) $(TARGET2_OBJ) $(LIBS) -Lhtslib-1.13 -lhts 43 | 44 | $(TARGET3): $(COMMON_OBJ) $(TARGET3_OBJ) 45 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 46 | 47 | $(TARGET4): $(COMMON_OBJ) $(TARGET4_OBJ) 48 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 49 | 50 | $(TARGET5): $(COMMON_OBJ) $(TARGET5_OBJ) 51 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 52 | 53 | .PHONY: clean 54 | clean: 55 | rm -f obj/*.o *~ core $(INCDIR)/*~ *.o *.so *.a 56 | rm -rf $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 57 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/BgzfFileType.cpp.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen/general/BgzfFileType.cpp 2015-07-08 20:03:23.000000000 +0000 2 | +++ /tmp/BgzfFileType.cpp 2020-11-03 12:25:36.168474179 +0000 3 | @@ -23,7 +23,7 @@ 4 | #include "BgzfFileType.h" 5 | 6 | // Default to require the EOF block at the end of the file. 
7 | -bool BgzfFileType::ourRequireEofBlock = true; 8 | +bool BgzfFileType::ourRequireEofBlock = false; 9 | 10 | BgzfFileType::BgzfFileType(const char * filename, const char * mode) 11 | { 12 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/FastQFile.cpp.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/fastq/FastQFile.cpp 2015-07-08 20:03:23.000000000 +0000 2 | +++ ../libStatGen/FastQFile.cpp 2020-09-17 19:35:48.797593411 +0000 3 | @@ -489,6 +489,7 @@ 4 | // Check to see if the sequenceIdentifier is a repeat by adding 5 | // it to the set and seeing if it already existed. 6 | std::pair::iterator,bool> insertResult; 7 | + /* 8 | insertResult = 9 | myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 10 | myLineNum)); 11 | @@ -505,6 +506,7 @@ 12 | reportErrorOnLine(); 13 | return(false); 14 | } 15 | + */ 16 | } 17 | 18 | // Valid, return true. 19 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/Makefile.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/Makefile 2015-07-08 20:03:23.000000000 +0000 2 | +++ ../libStatGen/Makefile 2020-09-03 14:15:41.904210140 +0000 3 | @@ -2,7 +2,8 @@ 4 | 5 | .PHONY: package 6 | 7 | -SUBDIRS=general bam fastq glf samtools vcf 8 | +#SUBDIRS=general bam fastq glf samtools vcf 9 | +SUBDIRS=general fastq samtools bam 10 | 11 | include Makefiles/Makefile.base 12 | 13 | @@ -16,7 +17,8 @@ 14 | general: samtools 15 | 16 | # other subdirectories depend on general 17 | -bam fastq glf vcf: general 18 | +#bam fastq glf vcf: general 19 | +bam fastq : general 20 | 21 | RELEASE_FILE?=libStatGen.$(VERSION).tgz 22 | 23 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/general.Makefile.patch: 
-------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/general/Makefile 2020-09-17 20:29:00.320563968 +0000 2 | +++ ../libStatGen/Makefile.general 2020-09-17 20:57:47.982915972 +0000 3 | @@ -8,7 +8,7 @@ 4 | # an error, but allow unused results and variables for the 5 | # time being. 6 | # 7 | - USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) 8 | + USER_WARNINGS ?= $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) 9 | #-Wno-strict-overflow 10 | # -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi) 11 | endif 12 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/example-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./fastqprocess --verbose \ 3 | --bam-size 0.001 \ 4 | --barcode-length 16 \ 5 | --umi-length 10 \ 6 | --sample-id L8TX \ 7 | --white-list ../../../data/L8TX/737K-august-2016.txt \ 8 | --I1 ../../../data/L8TX/A_I1.fastq.gz \ 9 | --R1 ../../../data/L8TX/A_R1.fastq.gz \ 10 | --R2 ../../../data/L8TX/A_R2.fastq.gz \ 11 | --I1 ../../../data/L8TX/B_I1.fastq.gz \ 12 | --R1 ../../../data/L8TX/B_R1.fastq.gz \ 13 | --R2 ../../../data/L8TX/B_R2.fastq.gz \ 14 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_common.h: -------------------------------------------------------------------------------- 1 | #ifndef __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 2 | #define __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "FastQFile.h" 12 | #include "FastQStatus.h" 13 | #include "SamFile.h" 14 | #include "SamValidation.h" 15 | 16 | // A pointer to a valid SamRecord waiting to be written to disk, and the index 17 | // 
of the g_read_arenas that pointer should be released to after the write. 18 | using PendingWrite = std::pair; 19 | 20 | class WriteQueue 21 | { 22 | public: 23 | static constexpr int kShutdown = -1; 24 | PendingWrite dequeueWrite(); 25 | void enqueueWrite(PendingWrite write); 26 | void enqueueShutdownSignal(); 27 | private: 28 | std::mutex mutex_; 29 | std::condition_variable cv_; 30 | std::queue queue_; 31 | }; 32 | 33 | // This is a hack for the sake of samplefastq program. 34 | void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord); 35 | 36 | void fillSamRecordCommon(SamRecord* samRecord, FastQFile* fastQFileI1, 37 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 38 | bool has_I1_file_list, 39 | std::string const& barcode_seq, std::string const& barcode_quality, 40 | std::string const& umi_seq, std::string const& umi_quality); 41 | 42 | void mainCommon( 43 | std::string white_list_file, int num_writer_threads, std::string output_format, 44 | std::vector I1s, std::vector R1s, std::vector R2s, 45 | std::string sample_id, 46 | std::function sam_record_filler, 47 | std::function barcode_getter, 48 | std::function output_handler); 49 | 50 | #endif // __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 51 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_metrics.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file fastq_metrics.cpp 3 | * @brief functions for computing metrics 4 | * @author Farzaneh Khajouei and Fred Douglas 5 | * @date 2022-05-25 6 | ***********************************************/ 7 | #include "FastQFile.h" 8 | #include "FastQStatus.h" 9 | #include "fastq_metrics.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using std::string; 16 | 17 | std::vector> parseReadStructure(std::string read_structure) 18 | { 19 | std::vector> ret; 20 | int next_ind = 0; 21 | while (next_ind < read_structure.size()) 22 | 
{ 23 | int type_ind = read_structure.find_first_not_of("0123456789", next_ind); 24 | assert(type_ind != std::string::npos); 25 | char type = read_structure[type_ind]; 26 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 27 | ret.emplace_back(type, len); 28 | next_ind = type_ind + 1; 29 | } 30 | return ret; 31 | } 32 | 33 | int getLengthOfType(string read_structure,char type) 34 | { 35 | int total_length = 0; 36 | for (auto [curr_type, length] : parseReadStructure(read_structure)) 37 | if (curr_type == type) 38 | total_length += length; 39 | return total_length; 40 | } 41 | 42 | void PositionWeightMatrix::recordChunk(string s) 43 | { 44 | for (int index = 0; index < s.size(); index++) 45 | { 46 | switch (s[index]) 47 | { 48 | case 'A': 49 | case 'a': 50 | A[index]++; 51 | break; 52 | case 'C': 53 | case 'c': 54 | C[index]++; 55 | break; 56 | case 'G': 57 | case 'g': 58 | G[index]++; 59 | break; 60 | case 'T': 61 | case 't': 62 | T[index]++; 63 | break; 64 | case 'N': 65 | case 'n': 66 | N[index]++; 67 | break; 68 | default: 69 | std::cerr<<"Unknown character:"<processShard(filenameR1, read_structure, white_list_data); 116 | } 117 | void FastQMetricsShard::processShard(String filenameR1, std::string read_structure, 118 | const WhiteListData* white_list_data) 119 | { 120 | /// setting the shortest sequence allowed to be read 121 | FastQFile fastQFileR1(4, 4); 122 | // open the R1 file 123 | if (fastQFileR1.openFile(filenameR1, BaseAsciiMap::UNKNOWN) != FastQStatus::FASTQ_SUCCESS) 124 | crash("Failed to open R1 file"); 125 | 126 | // Keep reading the file until there are no more fastq sequences to process. 
127 | int n_lines_read = 0; 128 | while (fastQFileR1.keepReadingFile()) 129 | { 130 | if (fastQFileR1.readFastQSequence() != FastQStatus::FASTQ_SUCCESS) 131 | break; 132 | 133 | ingestBarcodeAndUMI(std::string_view(fastQFileR1.myRawSequence.c_str(),fastQFileR1.myRawSequence.Length())); 134 | 135 | n_lines_read++; 136 | if (n_lines_read % 10000000 == 0) 137 | { 138 | printf("%d\n", n_lines_read); 139 | std::string a = std::string(fastQFileR1.myRawSequence.c_str()); 140 | printf("%s\n", fastQFileR1.mySequenceIdLine.c_str()); 141 | } 142 | } 143 | // Finished processing all of the sequences in the file. 144 | // Close the input files. 145 | fastQFileR1.closeFile(); 146 | } 147 | 148 | PositionWeightMatrix& PositionWeightMatrix::operator+=(const PositionWeightMatrix& rhs) 149 | { 150 | for (int i=0; i < A.size(); i++) 151 | { 152 | A[i] += rhs.A[i]; 153 | C[i] += rhs.C[i]; 154 | G[i] += rhs.G[i]; 155 | T[i] += rhs.T[i]; 156 | N[i] += rhs.N[i]; 157 | } 158 | return *this; 159 | } 160 | 161 | FastQMetricsShard& FastQMetricsShard::operator+=(const FastQMetricsShard& rhs) 162 | { 163 | for (auto [key,value] : rhs.barcode_counts_) 164 | barcode_counts_[key] += value; 165 | for (auto [key,value] : rhs.umi_counts_) 166 | umi_counts_[key] += value; 167 | 168 | barcode_+=rhs.barcode_; 169 | umi_+=rhs.umi_; 170 | return *this; 171 | } 172 | 173 | /** @copydoc process_inputs */ 174 | void process_inputs(const INPUT_OPTIONS_FASTQ_READ_STRUCTURE& options, 175 | const WhiteListData* white_list_data) 176 | { 177 | // number of files based on the input size 178 | int num_files = options.R1s.size(); 179 | 180 | // compute UMI and cell_barcode lengths 181 | 182 | int umi_length = getLengthOfType(options.read_structure,'M'); 183 | int CB_length = getLengthOfType(options.read_structure,'C'); 184 | 185 | // create the data for the threads 186 | vector fastqMetrics; 187 | for (int i = 0; i < num_files; i++) 188 | fastqMetrics.emplace_back(options.read_structure); 189 | 190 | // execute the 
fastq readers threads 191 | vector readers; // one metrics-gathering thread per input R1 file 192 | for (unsigned int i = 0; i < options.R1s.size(); i++) 193 | { 194 | readers.emplace_back(processShard, 195 | &fastqMetrics[i], 196 | options.R1s[i].c_str(), 197 | options.read_structure.c_str(), 198 | white_list_data); 199 | 200 | } 201 | 202 | // every reader thread joins. 203 | for (unsigned int i = 0; i < options.R1s.size(); i++) 204 | readers[i].join(); // block until every shard is fully tallied before merging 205 | 206 | std::cout << "Done reading all shards. Will now aggregate and write to file; " 207 | << "this will take a few minutes." << std::endl; 208 | FastQMetricsShard::mergeMetricsShardsToFile(options.sample_id, fastqMetrics, umi_length, CB_length); 209 | } 210 | // Writes one "count<TAB>sequence" line per distinct string, most frequent first. 211 | void writeCountsFile(std::unordered_map counts, std::string filename) 212 | { 213 | std::ofstream out(filename, std::ofstream::out); 214 | std::vector> sorted_counts; 215 | for (auto [str, count] : counts) 216 | sorted_counts.emplace_back(str, count); 217 | std::sort(sorted_counts.begin(), sorted_counts.end(), //sort counts from most to fewest!
218 | [](std::pair const& a, std::pair const& b) 219 | { 220 | return a.second > b.second; 221 | }); 222 | for (auto [str, count] : sorted_counts) 223 | out << count << "\t" << str << "\n"; 224 | } 225 | void PositionWeightMatrix::writeToFile(std::string filename) 226 | { 227 | std::ofstream out(filename, std::ofstream::out); 228 | out << "position\tA\tC\tG\tT\tN\n"; 229 | for (int i = 0; i < A.size(); i++) 230 | out << (i + 1) << "\t" << A[i] << "\t" << C[i] << "\t" << G[i] << "\t" << T[i] << "\t" << N[i] << "\n"; 231 | } 232 | void FastQMetricsShard::mergeMetricsShardsToFile(std::string filename_prefix, vector shards, int umi_length, int CB_length) 233 | { 234 | FastQMetricsShard total(shards[0].read_structure_); 235 | for (FastQMetricsShard const& shard : shards) 236 | total += shard; 237 | 238 | writeCountsFile(total.umi_counts_, filename_prefix + ".numReads_perCell_XM.txt"); 239 | writeCountsFile(total.barcode_counts_, filename_prefix + ".numReads_perCell_XC.txt"); 240 | total.barcode_.writeToFile(filename_prefix + ".barcode_distribution_XC.txt"); 241 | total.umi_.writeToFile(filename_prefix + ".barcode_distribution_XM.txt"); 242 | } 243 | 244 | int main(int argc, char** argv) 245 | { 246 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqMetrics(argc, argv); 247 | std::cout << "reading whitelist file " << options.white_list_file << "..."; 248 | WhiteListData white_list_data = readWhiteList(options.white_list_file); 249 | std::cout << "done" << std::endl; 250 | 251 | process_inputs(options, &white_list_data); 252 | return 0; 253 | } 254 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_metrics.h: -------------------------------------------------------------------------------- 1 | #ifndef __FASTQ_METRICS_H__ 2 | #define __FASTQ_METRICS_H__ 3 | /** 4 | * @file fastq_metrics.h 5 | * @brief functions for computing metrics 6 | * @author Farzaneh Khajouei and Fred Douglas 7 | * @date 
2022-05-25 8 | ***********************************************/ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "BaseAsciiMap.h" 14 | #include "utilities.h" 15 | #include "input_options.h" 16 | #include "FastQFile.h" 17 | #include "FastQStatus.h" 18 | 19 | class PositionWeightMatrix 20 | { 21 | public: 22 | PositionWeightMatrix(int length): A(length), C(length), G(length), T(length), N(length) {} 23 | void recordChunk(std::string s); 24 | PositionWeightMatrix& operator+=(const PositionWeightMatrix& rhs); 25 | void writeToFile(std::string filename); 26 | 27 | std::vector A; 28 | std::vector C; 29 | std::vector G; 30 | std::vector T; 31 | std::vector N; 32 | }; 33 | 34 | class FastQMetricsShard 35 | { 36 | public: 37 | FastQMetricsShard(std::string read_structure); 38 | void ingestBarcodeAndUMI(std::string_view raw_seq); 39 | void processShard(String filenameR1, std::string read_structure, 40 | const WhiteListData* white_list_data); 41 | static void mergeMetricsShardsToFile(std::string filename_prefix, 42 | std::vector shards, 43 | int umi_length, int CB_length); 44 | FastQMetricsShard& operator+=(const FastQMetricsShard& rhs); 45 | 46 | 47 | private: 48 | std::string read_structure_; 49 | int barcode_length_; 50 | int umi_length_; 51 | std::vector> tagged_lengths_; 52 | std::unordered_map barcode_counts_; 53 | std::unordered_map umi_counts_; 54 | PositionWeightMatrix barcode_; 55 | PositionWeightMatrix umi_; 56 | }; 57 | 58 | #endif // __FASTQ_METRICS_H__ 59 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_slideseq.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq_common.h" 2 | #include "input_options.h" 3 | 4 | std::vector> parseReadStructure(std::string const& read_structure) 5 | { 6 | std::vector> ret; 7 | int next_ind = 0; 8 | while (next_ind < read_structure.size()) 9 | { 10 | int type_ind = 
read_structure.find_first_not_of("0123456789", next_ind); 11 | assert(type_ind != std::string::npos); 12 | char type = read_structure[type_ind]; 13 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 14 | ret.emplace_back(type, len); 15 | next_ind = type_ind + 1; 16 | } 17 | return ret; 18 | } 19 | 20 | std::vector> g_parsed_read_structure; 21 | 22 | void fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, 23 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 24 | bool has_I1_file_list) 25 | { 26 | // check the sequence names matching 27 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 28 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 29 | // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string 30 | 31 | std::string barcode_seq, barcode_quality, umi_seq, umi_quality; 32 | int cur_ind = 0; 33 | for (auto [tag, length] : g_parsed_read_structure) 34 | { 35 | switch (tag) 36 | { 37 | case 'C': 38 | barcode_seq += a.substr(cur_ind, length); 39 | barcode_quality += b.substr(cur_ind, length); 40 | break; 41 | case 'M': 42 | umi_seq += a.substr(cur_ind, length); 43 | umi_quality += b.substr(cur_ind, length); 44 | break; 45 | default: 46 | break; 47 | } 48 | cur_ind += length; 49 | } 50 | fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 51 | barcode_seq, barcode_quality, umi_seq, umi_quality); 52 | } 53 | 54 | std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, 55 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 56 | bool has_I1_file_list) 57 | { 58 | return std::string(sam->getString("CR").c_str()); 59 | } 60 | 61 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 62 | { 63 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 64 | } 65 | 66 | int main(int argc, char** argv) 67 | { 68 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = 
readOptionsFastqSlideseq(argc, argv); 69 | // number of output bam files, and one writer thread per bam file 70 | int num_writer_threads = get_num_blocks(options); 71 | 72 | g_parsed_read_structure = parseReadStructure(options.read_structure); 73 | 74 | mainCommon(options.white_list_file, num_writer_threads, options.output_format, 75 | options.I1s, options.R1s, options.R2s, options.sample_id, 76 | fillSamRecordWithReadStructure, slideseqBarcodeGetter, outputHandler); 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastqprocess.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file fastqprocess.cpp 3 | * @brief functions for file processing 4 | * @author Kishori Konwar 5 | * @date 2020-08-27 6 | ***********************************************/ 7 | 8 | #include "fastq_common.h" 9 | #include "input_options.h" 10 | 11 | unsigned int g_barcode_length; 12 | unsigned int g_umi_length; 13 | 14 | void fillSamRecord(SamRecord* samRecord, FastQFile* fastQFileI1, 15 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 16 | bool has_I1_file_list) 17 | { 18 | // check the sequence names matching 19 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 20 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 21 | 22 | // extract the raw barcode and UMI 23 | std::string barcode_seq = a.substr(0, g_barcode_length); 24 | std::string umi_seq = a.substr(g_barcode_length, g_umi_length); 25 | 26 | // extract raw barcode and UMI quality string 27 | std::string barcode_quality = b.substr(0, g_barcode_length); 28 | std::string umi_quality = b.substr(g_barcode_length, g_umi_length); 29 | 30 | fillSamRecordCommon(samRecord, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 31 | barcode_seq, barcode_quality, umi_seq, umi_quality); 32 | } 33 | 34 | std::string barcodeGetter(SamRecord* samRecord, FastQFile* fastQFileI1, 35 | 
FastQFile* fastQFileR1, FastQFile* fastQFileR2, 36 | bool has_I1_file_list) 37 | { 38 | return std::string(fastQFileR1->myRawSequence.c_str()).substr(0, g_barcode_length); 39 | } 40 | 41 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 42 | { 43 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 44 | } 45 | 46 | int main(int argc, char** argv) 47 | { 48 | InputOptionsFastqProcess options = readOptionsFastqProcess(argc, argv); 49 | // number of output bam files, and one writer thread per bam file 50 | int num_writer_threads = get_num_blocks(options); 51 | 52 | g_barcode_length = options.barcode_length; 53 | g_umi_length = options.umi_length; 54 | 55 | mainCommon(options.white_list_file, num_writer_threads, options.output_format, 56 | options.I1s, options.R1s, options.R2s, options.sample_id, 57 | fillSamRecord, barcodeGetter, outputHandler); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/htslib_tagsort.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTSLIB_TAG_SORT__ 2 | #define __HTSLIB_TAG_SORT__ 3 | 4 | /** 5 | * @file htslib_tagsort.h 6 | * @brief Utility functions for input options processing 7 | * @author Kishori Konwar 8 | * @date 2021-08-11 9 | ***********************************************/ 10 | 11 | #include 12 | #include "input_options.h" 13 | #include "utilities.h" 14 | 15 | 16 | /** 17 | * @brief From the input bam create a list of txt files with the records (lines) 18 | * sorted according to the * tags 19 | * 20 | * @details 21 | * The input bam file is read chunk by chunk, sorted by the tags and the written 22 | * out as a text file in the sorted manner. 
23 | * 24 | * @param options: INPUT_OPTIONS_TAGSORT the inputs to the program 25 | * @return a vector containing the file paths of the partial files 26 | */ 27 | std::vector create_sorted_file_splits_htslib(INPUT_OPTIONS_TAGSORT options); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/input_options.h: -------------------------------------------------------------------------------- 1 | #ifndef __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 2 | #define __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 3 | /** 4 | * @file input_options.h 5 | * @brief Utility functions for input options processing 6 | * @author Kishori Konwar 7 | * @date 2021-08-11 8 | ***********************************************/ 9 | 10 | #include "utilities.h" 11 | 12 | #include 13 | #include 14 | 15 | constexpr unsigned int kMaxTagsortThreads = 30; 16 | constexpr unsigned int kDefaultNumAlignsPerThread = 1000000; 17 | 18 | struct INPUT_OPTIONS_FASTQ_READ_STRUCTURE 19 | { 20 | // I1, R1 and R2 files name 21 | std::vector I1s, R1s, R2s; 22 | 23 | // Bead Barcode list 24 | std::string white_list_file; 25 | 26 | std::string output_format; 27 | 28 | // Bam file size to split by (in GB) 29 | double bam_size = 1.0; 30 | 31 | std::string read_structure; 32 | 33 | std::string sample_id; 34 | }; 35 | 36 | 37 | // Structure to hold input options for fastqprocess 38 | struct InputOptionsFastqProcess 39 | { 40 | // I1, R1 and R2 files name 41 | std::vector I1s, R1s, R2s; 42 | 43 | // Barcode white list file 44 | std::string white_list_file; 45 | 46 | std::string output_format; 47 | 48 | // chemistry dependent (V2/V3) barcode and UMI length 49 | int barcode_length = -1; 50 | int umi_length = -1; 51 | 52 | // Bam file size to split by (in GB) 53 | double bam_size = 1.0; 54 | 55 | std::string sample_id; 56 | }; 57 | 58 | 59 | // Structure to hold input options for tagsort 60 | struct INPUT_OPTIONS_TAGSORT 61 | { 62 | std::string 
metric_type; 63 | bool output_sorted_info = false; 64 | bool compute_metric = false; 65 | // name of the bam file 66 | std::string bam_input; 67 | // name of the gtf file 68 | std::string gtf_file; 69 | // temp folder for disk sorting 70 | std::string temp_folder = "/tmp/"; 71 | 72 | std::string metric_output_file; 73 | // sorted tsv output file 74 | std::string sorted_output_file; 75 | 76 | // Size (in number of alignments) of individual chunks to sort in a batch and 77 | // write to a partial file. Approximately 20 million alignments makes 1 GB bam file. 78 | unsigned int alignments_per_batch = kDefaultNumAlignsPerThread; 79 | unsigned int nthreads = 1; 80 | std::string barcode_tag; 81 | std::string umi_tag; 82 | std::string gene_tag; 83 | 84 | // order of the tags to sort by 85 | std::unordered_map tag_order; 86 | 87 | std::string mitochondrial_gene_names_filename; 88 | }; 89 | 90 | InputOptionsFastqProcess readOptionsFastqProcess(int argc, char** argv); 91 | 92 | INPUT_OPTIONS_TAGSORT readOptionsTagsort(int argc, char** argv); 93 | 94 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqSlideseq(int argc, char** argv); 95 | 96 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqMetrics(int argc, char** argv); 97 | 98 | int64_t get_num_blocks(InputOptionsFastqProcess const& options); 99 | int64_t get_num_blocks(INPUT_OPTIONS_FASTQ_READ_STRUCTURE const& options); 100 | 101 | #endif // __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 102 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/metricgatherer.h: -------------------------------------------------------------------------------- 1 | #ifndef __METRIC_GATHERER__ 2 | #define __METRIC_GATHERER__ 3 | /** 4 | * @file metricgatherer.h 5 | * @brief functions for file processing 6 | * @author Kishori Konwar 7 | * @date 2021-08-11 8 | ***********************************************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 
// Which kind of metrics row a parsed line contributes to.
enum class MetricType { Cell, Gene };

/**
 * Streaming (online) accumulator of the sufficient statistics (count, sum,
 * sum of squares) needed to report the mean and sample variance of a
 * sequence of values without storing the values themselves.
 *
 * Methods
 * -------
 * update(new_value)        incorporate new_value into the running statistics
 * getMean()                mean of the values seen so far
 * calculate_variance()     sample variance, or -1.0 if fewer than 2 values
 * clear()                  reset to the just-constructed state
 */
class OnlineGaussianSufficientStatistic
{
private:
  // FIX: removed the dead private members `_mean_squared_error` (never read
  // or updated) and `_mean` (a write-only cache set by getMean but never
  // read); external behavior is unchanged.
  double sum_EX2 = 0.0;  // running sum of squared values
  double _sum = 0.0;     // running sum of values
  double _count = 0.0;   // number of values seen

public:
  // Incorporate new_value into the running statistics.
  void update(double new_value)
  {
    _count += 1.0;
    _sum += new_value;
    sum_EX2 += (new_value * new_value);
  }

  // Return the mean of the values seen so far (NaN if none were seen).
  double getMean()
  {
    return _sum / _count;
  }

  // Return the sample (n-1) variance of the values seen so far, or the
  // sentinel -1.0 when fewer than two values make the variance undefined.
  double calculate_variance()
  {
    if (_count < 2)
      return -1.0;
    // Algebraically equal to (sum_EX2 - _sum^2 / _count) / (_count - 1),
    // the standard one-pass sample-variance formula.
    return sum_EX2 / (_count - 1) - (_sum / _count) * (_sum / (_count - 1));
  }

  // Reset to the just-constructed state.
  void clear()
  {
    _count = 0;
    _sum = 0;
    sum_EX2 = 0.0;
  }
};
101 | // in future we can implement this when we have a gene model 102 | // self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR 103 | // self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() 104 | 105 | // alignment uniqueness information 106 | int reads_mapped_uniquely = 0; 107 | int reads_mapped_multiple = 0; 108 | int duplicate_reads = 0; 109 | 110 | // alignment splicing information 111 | int spliced_reads = 0; 112 | int antisense_reads = 0; 113 | int plus_strand_reads = 0; // strand balance 114 | 115 | // higher-order methods, filled in by finalize() when all data is extracted 116 | float molecule_barcode_fraction_bases_above_30_mean = -1; 117 | float molecule_barcode_fraction_bases_above_30_variance = -1; 118 | float genomic_reads_fraction_bases_quality_above_30_mean = -1; 119 | float genomic_reads_fraction_bases_quality_above_30_variance = -1; 120 | float genomic_read_quality_mean = -1; 121 | float genomic_read_quality_variance = -1; 122 | float n_molecules = -1; 123 | float n_fragments = -1; 124 | float reads_per_molecule = -1; 125 | float reads_per_fragment = -1; 126 | float fragments_per_molecule = -1; 127 | int fragments_with_single_read_evidence = -1; 128 | int molecules_with_single_read_evidence = -1; 129 | 130 | // TODO separate these 2 out from the above, all of which gets clear()d 131 | std::string prev_tag; 132 | char* record[20]; 133 | 134 | protected: 135 | std::string common_headers[24] = 136 | { 137 | "n_reads", 138 | "noise_reads", 139 | "perfect_molecule_barcodes", 140 | "reads_mapped_exonic", 141 | "reads_mapped_intronic", 142 | "reads_mapped_utr", 143 | "reads_mapped_uniquely", 144 | "reads_mapped_multiple", 145 | "duplicate_reads", 146 | "spliced_reads", 147 | "antisense_reads", 148 | "molecule_barcode_fraction_bases_above_30_mean", 149 | "molecule_barcode_fraction_bases_above_30_variance", 150 | "genomic_reads_fraction_bases_quality_above_30_mean", 151 | 
"genomic_reads_fraction_bases_quality_above_30_variance", 152 | "genomic_read_quality_mean", 153 | "genomic_read_quality_variance", 154 | "n_molecules", 155 | "n_fragments", 156 | "reads_per_molecule", 157 | "reads_per_fragment", 158 | "fragments_per_molecule", 159 | "fragments_with_single_read_evidence", 160 | "molecules_with_single_read_evidence" 161 | }; 162 | 163 | 164 | public: 165 | virtual ~Metrics() {} 166 | // get the headers 167 | virtual std::string getHeader() = 0; 168 | 169 | void parse_line(std::string& str, std::ofstream& fmetric_out, 170 | std::unordered_set& mitochondrial_genes, 171 | MetricType metric_type); 172 | 173 | void output_metrics(std::ofstream& fmetric_out); 174 | virtual void output_metrics_extra(std::ofstream& fmetric_out) = 0; 175 | virtual void parse_extra_fields(const std::string& first_tag, 176 | const std::string& second_tag, 177 | const std::string& third_tag, 178 | char** record) = 0; 179 | virtual void finalize(std::unordered_set& mitochondrial_genes); 180 | virtual void clear(); 181 | }; 182 | 183 | class CellMetrics: public Metrics 184 | { 185 | private: 186 | int perfect_cell_barcodes; // The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) 187 | int reads_mapped_intergenic; // The number of reads mapped to an intergenic region for this cell 188 | 189 | // reads unmapped 190 | int reads_unmapped; 191 | // The number of reads that were mapped to too many loci across the genome and as a 192 | // consequence, are reported unmapped by the aligner 193 | int reads_mapped_too_many_loci; 194 | 195 | // The variance of the fraction of Illumina base calls for the cell barcode sequence that 196 | // are greater than 30, across molecules 197 | float cell_barcode_fraction_bases_above_30_variance; 198 | 199 | // The average fraction of Illumina base calls for the cell barcode sequence that 200 | // are greater than 30, across molecules 201 | float cell_barcode_fraction_bases_above_30_mean; 202 | 203 | int 
n_genes; //The number of genes detected by this cell 204 | 205 | int genes_detected_multiple_observations; // The number of genes that are observed by more than one read in this cell 206 | int n_mitochondrial_genes; // The number of mitochondrial genes detected by this cell 207 | int n_mitochondrial_molecules; // The number of molecules from mitochondrial genes detected for this cell 208 | int pct_mitochondrial_molecules; // The percentage of molecules from mitoc 209 | 210 | OnlineGaussianSufficientStatistic _cell_barcode_fraction_bases_above_30; 211 | std::unordered_map _genes_histogram; 212 | 213 | std::string cell_specific_headers[11] = 214 | { 215 | "perfect_cell_barcodes", 216 | "reads_mapped_intergenic", 217 | "reads_unmapped", 218 | "reads_mapped_too_many_loci", 219 | "cell_barcode_fraction_bases_above_30_variance", 220 | "cell_barcode_fraction_bases_above_30_mean", 221 | "n_genes", 222 | "genes_detected_multiple_observations", 223 | "n_mitochondrial_genes", 224 | "n_mitochondrial_molecules", 225 | "pct_mitochondrial_molecules" 226 | }; 227 | 228 | public: 229 | std::string getHeader() override; 230 | void output_metrics_extra(std::ofstream& fmetric_out) override; 231 | void parse_extra_fields(const std::string& first_tag, 232 | const std::string& second_tag, 233 | const std::string& third_tag, 234 | char** record) override; 235 | 236 | void finalize(std::unordered_set& mitochondrial_genes); 237 | 238 | void clear(); 239 | }; 240 | 241 | 242 | class GeneMetrics: public Metrics 243 | { 244 | private: 245 | int number_cells_detected_multiple; 246 | int number_cells_expressing; 247 | 248 | std::unordered_map _cells_histogram; 249 | std::string gene_specific_headers[2] = 250 | { 251 | "number_cells_detected_multiple", 252 | "number_cells_expressing" 253 | }; 254 | 255 | public: 256 | GeneMetrics() 257 | { 258 | number_cells_detected_multiple = 0; 259 | number_cells_expressing = 0; 260 | } 261 | 262 | public: 263 | std::string getHeader() override; 264 | void 
output_metrics_extra(std::ofstream& fmetric_out) override; 265 | void parse_extra_fields(std::string const& first_tag, 266 | std::string const& second_tag, 267 | std::string const& third_tag, 268 | char** record) override; 269 | 270 | void finalize(std::unordered_set& mitochondrial_genes); 271 | void clear(); 272 | }; 273 | 274 | #endif 275 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/samplefastq.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq_common.h" 2 | #include "input_options.h" 3 | #include 4 | 5 | std::vector> parseReadStructure(std::string const& read_structure) 6 | { 7 | std::vector> ret; 8 | int next_ind = 0; 9 | while (next_ind < read_structure.size()) 10 | { 11 | int type_ind = read_structure.find_first_not_of("0123456789", next_ind); 12 | assert(type_ind != std::string::npos); 13 | char type = read_structure[type_ind]; 14 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 15 | ret.emplace_back(type, len); 16 | next_ind = type_ind + 1; 17 | } 18 | return ret; 19 | } 20 | 21 | std::vector> g_parsed_read_structure; 22 | 23 | void fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, 24 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 25 | bool has_I1_file_list) 26 | { 27 | // check the sequence names matching 28 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 29 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 30 | // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string 31 | 32 | std::string barcode_seq, barcode_quality, umi_seq, umi_quality; 33 | int cur_ind = 0; 34 | for (auto [tag, length] : g_parsed_read_structure) 35 | { 36 | switch (tag) 37 | { 38 | case 'C': 39 | barcode_seq += a.substr(cur_ind, length); 40 | barcode_quality += b.substr(cur_ind, length); 41 | break; 42 | case 'M': 43 | umi_seq += a.substr(cur_ind, 
length); 44 | umi_quality += b.substr(cur_ind, length); 45 | break; 46 | default: 47 | break; 48 | } 49 | cur_ind += length; 50 | } 51 | fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 52 | barcode_seq, barcode_quality, umi_seq, umi_quality); 53 | } 54 | 55 | std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, 56 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 57 | bool has_I1_file_list) 58 | { 59 | return std::string(sam->getString("CR").c_str()); 60 | } 61 | 62 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 63 | { 64 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 65 | } 66 | 67 | 68 | int main(int argc, char** argv) 69 | { 70 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqSlideseq(argc, argv); 71 | // number of output bam files, and one writer thread per bam file 72 | int num_writer_threads = get_num_blocks(options); 73 | 74 | std::ofstream outfile_r1("sampled_down.R1"); 75 | if (!outfile_r1) 76 | crash("Failed to open output file sampled_down.R1"); 77 | std::ofstream outfile_r2("sampled_down.R2"); 78 | if (!outfile_r2) 79 | crash("Failed to open output file sampled_down.R2"); 80 | 81 | g_parsed_read_structure = parseReadStructure(options.read_structure); 82 | mainCommon(options.white_list_file, /*num_writer_threads=*/1, options.output_format, 83 | options.I1s, options.R1s, options.R2s, options.sample_id, 84 | fillSamRecordWithReadStructure, slideseqBarcodeGetter, 85 | [&outfile_r1, &outfile_r2](WriteQueue* ignored1, SamRecord* sam, int reader_thread_index) 86 | { 87 | if (sam->getStringTag("CB")) 88 | { 89 | // Assumed read structure of 8C18X6C9M1X with a fixed spacer sequence 90 | const char* barcode = sam->getString("CR").c_str(); 91 | const char* quality_score = sam->getString("CY").c_str(); 92 | outfile_r1 << "@" << sam->getReadName() << "\n" 93 | << std::string_view(barcode, 8) << "CTTCAGCGTTCCCGAGAG" << 
std::string_view(barcode+8, 6) << sam->getString("UR") <<"T\n" 94 | << "+\n" 95 | << std::string_view(quality_score, 8)<<"FFFFFFFFFFFFFFFFFF" << std::string_view(quality_score+8, 6) << sam->getString("UY") <<"F"<< "\n"; 96 | 97 | outfile_r2 << "@" << sam->getReadName() << "\n" 98 | << sam->getSequence() << "\n" 99 | << "+\n" 100 | << sam->getQuality() << "\n"; 101 | } 102 | releaseReaderThreadMemory(reader_thread_index,sam); 103 | }); 104 | return 0; 105 | } 106 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/utilities.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file utilities.cpp 3 | * @brief Utility functions for file processing 4 | * @author Kishori Konwar 5 | * @date 2021-08-11 6 | ***********************************************/ 7 | 8 | #include "utilities.h" 9 | 10 | #include 11 | #include 12 | 13 | /** @copydoc readWhiteList */ 14 | WhiteListData readWhiteList(std::string const& white_list_file) 15 | { 16 | const char ATCG[] = {'A', 'C', 'G', 'T', 'N'}; 17 | 18 | std::ifstream file(white_list_file); 19 | if (!file.is_open()) 20 | crash("Couldn't open whitelist file " + white_list_file); 21 | 22 | WhiteListData white_list_data; 23 | int k = 0; 24 | // read data from file object and put it into string. 25 | for (std::string tp; getline(file, tp); ) 26 | { 27 | white_list_data.barcodes.push_back(tp); 28 | 29 | for (unsigned int i=0; i < tp.size(); i++) 30 | { 31 | for (int j=0; j < 5; j++) 32 | { 33 | char c = tp[i]; 34 | tp[i] = ATCG[j]; 35 | // If the mutation we're writing is already present, we just overwrite 36 | // what was there with the current. 37 | // This is done to have the same values for corrected barcodes 38 | // as in the python implementation. 
/** @copydoc crashWithPerror */
void crashWithPerror(std::string msg)
{
  // perror() appends the strerror() text for the current errno to msg.
  perror(msg.c_str());
  exit(1);
}

// Report a fatal error on both stdout and stderr, then terminate.
void crash(std::string msg)
{
  std::cout << msg << std::endl;
  std::cerr << msg << std::endl;
  exit(1);
}

// Structure for correcting the barcodes.
struct WhiteListData
{
  // Maps each whitelisted barcode and every 1-mutation of it to the index
  // of the correct barcode; an exact whitelist hit is stored as -1, which
  // signals that no correction is needed.
  // NOTE(review): template arguments were stripped in this dump; the mapped
  // type is assumed to be an integer index — confirm against readWhiteList().
  std::unordered_map<std::string, int> mutations;
  // Vector of whitelist barcodes, in file order.
  std::vector<std::string> barcodes;
};
"""Check that the cell barcodes (CB tags) across a set of BAM files are disjoint.

Each BAM's set of observed CB values is collected; the files form a partition
iff no barcode occurs in more than one file.
"""
import argparse

import pysam

parser = argparse.ArgumentParser()
parser.add_argument("--bam", nargs="+", dest="bams", help="BAM files")


def check_disjoint_cbs():
    """Print whether the --bam files partition the observed CB barcodes.

    Prints "is a partition" (plus the total alignment count) when every CB
    value appears in at most one file; prints "not a partition" and stops at
    the first barcode found in two files otherwise.

    Fixes over the original: removed the redundant ``global parser``
    statement (module-level names are readable without it), stored barcodes
    in sets instead of dicts mapping to True, and replaced the per-barcode
    membership loop with set intersection. Printed output is unchanged.
    """
    opts = parser.parse_args()
    barcodes = {}  # bam path -> set of CB values seen in that file
    tot_alignments = 0

    for bam in opts.bams:
        print("reading " + bam)
        seen = set()
        with pysam.AlignmentFile(bam, "rb", check_sq=False) as input_alignments:
            for alignment in input_alignments:
                tot_alignments += 1
                if alignment.has_tag("CB"):
                    seen.add(alignment.get_tag("CB"))
        barcodes[bam] = seen

    for bam in opts.bams:
        print("checking " + bam)
        for other in set(opts.bams) - {bam}:
            if barcodes[bam] & barcodes[other]:
                print("not a partition")
                return

    print("total alignments : ", tot_alignments)
    print("is a partition")
    return


if __name__ == "__main__":
    check_disjoint_cbs()
-------------------------------------------------------------------------------- /pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Purpose 2 | 3 | 4 | - No issue is linked to this PR. 5 | 6 | ### Changes 7 | 8 | 9 | - No changes. 10 | 11 | ### Review Instructions 12 | 13 | 14 | - No instructions. 15 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | 3 | build: 4 | image: latest 5 | 6 | python: 7 | version: 3.6 8 | use_system_site_packages: false # Set to true will let the virtualenv use the pre-installed packages such as numpy, which is not what we want 9 | setup_py_install: false 10 | pip_install: true 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | crimson==0.5.2 2 | pandas==0.25.3 3 | pysam==0.16.0.1 4 | pytest-cov==2.10.1 5 | pytest==5.1.1 6 | scipy==1.5.2 7 | black==19.3b0 8 | flake8==3.7.7 9 | gffutils==0.9 10 | numpy==1.19.1 11 | requests==2.20.0 12 | setuptools==40.4.3 13 | setuptools_scm==3.1.0 14 | h5py==2.10.0 15 | tables==3.4.4 -------------------------------------------------------------------------------- /security.txt: -------------------------------------------------------------------------------- 1 | If you'd like to report a security issue please contact us. 
"""Packaging configuration for sctools."""
from setuptools import setup

# PyPI trove classifiers describing the supported environments.
CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Natural Language :: English",
    "License :: OSI Approved :: BSD License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.6",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

setup(
    name="sctools",
    use_scm_version=True,
    setup_requires=["setuptools_scm"],
    # BUG FIX: the original built this as "...single cell" + "data processing",
    # which rendered as "single celldata processing" (missing space).
    description="Utilities for large-scale distributed single cell data processing",
    url="https://github.com/humancellatlas/sctools.git",
    author="Ambrose J. Carr",
    author_email="mail@ambrosejcarr.com",
    package_dir={"": "src"},
    packages=["sctools", "sctools/test", "sctools/metrics"],
    install_requires=[
        "gffutils",
        "numpy",
        "pandas",
        "pysam",
        "pytest",
        "pytest-cov",
        "sphinx",
        "sphinxcontrib-websupport",
        "sphinx_rtd_theme",
        "setuptools_scm>=3.1.0",
        "setuptools>=40.4.3",
        "scipy>=1.0.0",
        "crimson>=0.3.0",
    ],
    # Console-script entry points; values are identical to the original's
    # string concatenations, collapsed into single literals.
    entry_points={
        "console_scripts": [
            "AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes",
            "Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes",
            "SplitBam = sctools.platform:GenericPlatform.split_bam",
            "CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics",
            "CalculateCellMetrics = sctools.platform:GenericPlatform.calculate_cell_metrics",
            "MergeGeneMetrics = sctools.platform:GenericPlatform.merge_gene_metrics",
            "MergeCellMetrics = sctools.platform:GenericPlatform.merge_cell_metrics",
            "CreateCountMatrix = sctools.platform:GenericPlatform.bam_to_count_matrix",
            "MergeCountMatrices = sctools.platform:GenericPlatform.merge_count_matrices",
            "TagSortBam = sctools.platform:GenericPlatform.tag_sort_bam",
            "VerifyBamSort = sctools.platform:GenericPlatform.verify_bam_sort",
            "GroupQCs = sctools.platform:GenericPlatform.group_qc_outputs",
        ]
    },
    classifiers=CLASSIFIERS,
    include_package_data=True,
)
9 | """ 10 | 11 | # BAM tag constants 12 | 13 | RAW_SAMPLE_BARCODE_TAG_KEY = "SR" 14 | QUALITY_SAMPLE_BARCODE_TAG_KEY = "SY" 15 | 16 | MOLECULE_BARCODE_TAG_KEY = "UB" 17 | RAW_MOLECULE_BARCODE_TAG_KEY = "UR" 18 | QUALITY_MOLECULE_BARCODE_TAG_KEY = "UY" 19 | 20 | CELL_BARCODE_TAG_KEY = "CB" 21 | RAW_CELL_BARCODE_TAG_KEY = "CR" 22 | QUALITY_CELL_BARCODE_TAG_KEY = "CY" 23 | 24 | GENE_NAME_TAG_KEY = "GE" 25 | NUMBER_OF_HITS_TAG_KEY = "NH" 26 | 27 | ALIGNMENT_LOCATION_TAG_KEY = "XF" 28 | INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTRONIC" 29 | CODING_ALIGNMENT_LOCATION_TAG_VALUE = "CODING" 30 | UTR_ALIGNMENT_LOCATION_TAG_VALUE = "UTR" 31 | INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTERGENIC" 32 | 33 | # bam.py constants 34 | 35 | MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 36 | MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 37 | 38 | 39 | # modes of the count matrix runs 40 | SINGLE_CELL_COUNT_MATRIX = 0 41 | SINGLE_NUCLEI_COUNT_MATRIX = 1 42 | -------------------------------------------------------------------------------- /src/sctools/encodings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compressed Barcode Encoding Methods 3 | =================================== 4 | 5 | .. currentmodule:: sctools 6 | 7 | This module defines several classes to encode DNA sequences in memory-efficient forms, using 2 bits 8 | to encode bases of a 4-letter DNA alphabet (ACGT) or 3 bits to encode a 5-letter DNA alphabet 9 | that includes the ambiguous call often included by Illumina base calling software (ACGTN). The 10 | classes also contain several methods useful for efficient querying and manipulation of the encoded 11 | sequence. 
12 | 13 | Classes 14 | ------- 15 | Encoding Encoder base class 16 | ThreeBit Three bit DNA encoder / decoder 17 | TwoBit Two bit DNA encoder / decoder 18 | 19 | """ 20 | 21 | import random 22 | from typing import Mapping, AnyStr, Set 23 | 24 | 25 | class Encoding: 26 | """ 27 | 28 | Attributes 29 | ---------- 30 | encoding_map : TwoBitEncodingMap 31 | Class that mimics a Mapping[bytes, str] where bytes must be a single byte encoded character 32 | (encoder) 33 | decoding_map : Mapping[int, bytes] 34 | Dictionary that maps integers to bytes human-readable representations (decoder) 35 | bits_per_base : int 36 | number of bits used to encode each base 37 | 38 | Methods 39 | ------- 40 | encode(bytes_encoded: bytes) 41 | encode a DNA string in a compressed representation 42 | decode(integer_encoded: int) 43 | decode a compressed DNA string into a human readable bytes format 44 | gc_content(integer_encoded: int) 45 | calculate the GC content of an encoded DNA string 46 | hamming_distance(a: int, b: int) 47 | calculate the hamming distance between two encoded DNA strings 48 | 49 | """ 50 | 51 | encoding_map: Mapping[AnyStr, int] = NotImplemented 52 | decoding_map: Mapping[int, AnyStr] = NotImplemented 53 | bits_per_base: int = NotImplemented 54 | 55 | @classmethod 56 | def encode(cls, bytes_encoded: bytes) -> int: 57 | """Encode a DNA bytes string. 58 | 59 | Parameters 60 | ---------- 61 | bytes_encoded : bytes 62 | bytes DNA string 63 | 64 | Returns 65 | ------- 66 | encoded : int 67 | Encoded DNA sequence 68 | 69 | """ 70 | raise NotImplementedError 71 | 72 | def decode(self, integer_encoded: int) -> bytes: 73 | """Decode a DNA bytes string. 
74 | 75 | Parameters 76 | ---------- 77 | integer_encoded : bytes 78 | Integer encoded DNA string 79 | 80 | Returns 81 | ------- 82 | decoded : bytes 83 | Bytes decoded DNA sequence 84 | 85 | """ 86 | raise NotImplementedError 87 | 88 | def gc_content(self, integer_encoded: int) -> int: 89 | """Return the number of G or C nucleotides in `integer_encoded` 90 | 91 | Parameters 92 | ---------- 93 | integer_encoded : int 94 | Integer encoded DNA string 95 | 96 | Returns 97 | ------- 98 | gc_content, int 99 | number of bases in `integer_encoded` input that are G or C. 100 | 101 | """ 102 | raise NotImplementedError 103 | 104 | @staticmethod 105 | def hamming_distance(a, b) -> int: 106 | """Calculate the hamming distance between two DNA sequences 107 | 108 | The hamming distance counts the number of bases that are not the same nucleotide 109 | 110 | Parameters 111 | ---------- 112 | a, b : int 113 | integer encoded 114 | 115 | 116 | Returns 117 | ------- 118 | d : int 119 | hamming distance between a and b 120 | """ 121 | raise NotImplementedError 122 | 123 | 124 | class TwoBit(Encoding): 125 | """Encode a DNA sequence using a 2-bit encoding. 126 | 127 | Two-bit encoding uses 0 for an encoded nucleotide. As such, it cannot distinguish between 128 | the end of sequence and trailing A nucleotides, and thus decoding these strings requires 129 | knowledge of their length. Therefore, it is only appropriate for encoding fixed sequence 130 | lengths 131 | 132 | In addition, in order to encode in 2-bit, N-nucleotides must be randomized to one of A, C, 133 | G, and T. 
134 | 135 | Parameters 136 | ---------- 137 | sequence_length : int 138 | number of nucleotides that are being encoded 139 | 140 | """ 141 | 142 | __doc__ += Encoding.__doc__ 143 | 144 | def __init__(self, sequence_length: int): 145 | self.sequence_length: int = sequence_length 146 | 147 | class TwoBitEncodingMap: 148 | """Dict-like class that maps bytes to 2-bit integer representations 149 | 150 | Generates random nucleotides for ambiguous nucleotides e.g. N 151 | 152 | """ 153 | 154 | map_ = { 155 | ord("A"): 0, 156 | ord("C"): 1, 157 | ord("T"): 2, 158 | ord("G"): 3, 159 | ord("a"): 0, 160 | ord("c"): 1, 161 | ord("t"): 2, 162 | ord("g"): 3, 163 | } 164 | 165 | iupac_ambiguous: Set[int] = {ord(c) for c in "MRWSYKVHDBNmrwsykvhdbn"} 166 | 167 | def __getitem__(self, byte: int) -> int: 168 | try: 169 | return self.map_[byte] 170 | except KeyError: 171 | if byte not in self.iupac_ambiguous: 172 | raise KeyError(f"{chr(byte)} is not a valid IUPAC nucleotide code") 173 | return random.randint(0, 3) 174 | 175 | encoding_map: TwoBitEncodingMap = TwoBitEncodingMap() 176 | decoding_map: Mapping[int, bytes] = {0: b"A", 1: b"C", 2: b"T", 3: b"G"} 177 | bits_per_base: int = 2 178 | 179 | @classmethod 180 | def encode(cls, bytes_encoded: bytes) -> int: 181 | encoded = 0 182 | for character in bytes_encoded: 183 | encoded <<= 2 184 | encoded += cls.encoding_map[character] 185 | return encoded 186 | 187 | def decode(self, integer_encoded: int) -> bytes: 188 | decoded = b"" 189 | for _ in range(self.sequence_length): 190 | decoded = self.decoding_map[integer_encoded & 3] + decoded 191 | integer_encoded >>= 2 192 | return decoded 193 | 194 | def gc_content(self, integer_encoded: int) -> int: 195 | i = 0 196 | for _ in range(self.sequence_length): 197 | i += integer_encoded & 1 198 | integer_encoded >>= 2 199 | return i 200 | 201 | @staticmethod 202 | def hamming_distance(a: int, b: int) -> int: 203 | difference = a ^ b 204 | d_hamming = 0 205 | while difference: 206 | if 
difference & 3: 207 | d_hamming += 1 208 | difference >>= 2 209 | return d_hamming 210 | 211 | 212 | class ThreeBit(Encoding): 213 | """Encode a DNA sequence using a 3-bit encoding. 214 | 215 | Since no bases are encoded as 0, an empty triplet is interpreted as the end of the encoded 216 | string; Three-bit encoding can be used to encode and decode strings without knowledge of their 217 | length. 218 | 219 | """ 220 | 221 | __doc__ += Encoding.__doc__ 222 | 223 | def __init__(self, *args, **kwargs): 224 | """ 225 | Notes 226 | ----- 227 | args and kwargs are not used, but allow ThreeBit to be initialized the same way as TwoBit, 228 | despite not requiring a sequence length parameter. 229 | 230 | """ 231 | pass 232 | 233 | class ThreeBitEncodingMap: 234 | """Dict-like class that maps bytes to 3-bit integer representations 235 | 236 | All IUPAC ambiguous codes are treated as "N" 237 | 238 | """ 239 | 240 | # C: 1, A: 2, G: 3, T: 4, N: 6; # note, not using 0 241 | map_ = { 242 | ord("C"): 1, 243 | ord("A"): 2, 244 | ord("G"): 3, 245 | ord("T"): 4, 246 | ord("N"): 6, 247 | ord("c"): 1, 248 | ord("a"): 2, 249 | ord("g"): 3, 250 | ord("t"): 4, 251 | ord("n"): 6, 252 | } 253 | 254 | def __getitem__(self, byte: int) -> int: 255 | try: 256 | return self.map_[byte] 257 | except KeyError: 258 | return 6 # any non-standard nucleotide gets "N" 259 | 260 | encoding_map: ThreeBitEncodingMap = ThreeBitEncodingMap() 261 | decoding_map: Mapping[int, bytes] = {1: b"C", 2: b"A", 3: b"G", 4: b"T", 6: b"N"} 262 | bits_per_base: int = 3 263 | 264 | @classmethod 265 | def encode(cls, bytes_encoded: bytes) -> int: 266 | encoded = 0 267 | for character in bytes_encoded: 268 | encoded <<= 3 269 | encoded += cls.encoding_map[character] 270 | return encoded 271 | 272 | @classmethod 273 | def decode(cls, integer_encoded: int) -> bytes: 274 | decoded = b"" 275 | while integer_encoded: 276 | decoded = cls.decoding_map[integer_encoded & 7] + decoded 277 | integer_encoded >>= 3 278 | return decoded 
"""
Group QC outputs

"""

from crimson import picard
import os
import pandas as pd


def write_aggregated_picard_metrics_by_row(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write Picard row metrics.

    Parameters
    ----------
    file_names:
        array of files. The basename of inputs should be formatted
        as 'samplename_qc', such as
        "samplename_qc.alignment_summary_metrics.txt" and "samplename_qc.insert_size_metrics.txt"
    output_name:
        prefix of output file name without extension; writes '<output_name>.csv'.
    """
    # metrics: sample id -> {metric name: value}
    # d accumulates one DataFrame per parsed input file
    metrics = {}
    d = pd.DataFrame()
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split("_qc")[0]
        # NOTE(review): this resets anything recorded for cell_id by an earlier
        # file in the same call before re-populating it below — confirm this is
        # intended when several metric files share one sample name
        metrics[cell_id] = {}
        parsed = picard.parse(file_name)
        # e.g. "picard.analysis.AlignmentSummaryMetrics" -> "AlignmentSummaryMetrics"
        class_name = parsed["metrics"]["class"].split(".")[2]
        # Alignment metrics return multiple lines,
        # but only output PAIRED-READS/third line
        contents = parsed["metrics"]["contents"]
        if class_name == "AlignmentSummaryMetrics":
            # parse out PE, R1 and R2. If the reads are unpaired, the contents
            # will be a single dict rather than a list of dicts.
            if isinstance(contents, dict):
                contents = [contents]
            rows = {}
            for m in contents:
                cat = m["CATEGORY"]
                # key every metric by its read category, e.g. "PF_READS.PAIR"
                rows.update(
                    {
                        k + "." + cat: v
                        for k, v in m.items()
                        if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"]
                    }
                )
        # sometimes(very rare), insertion metrics also return multiple lines
        # results to include TANDEM repeats. but we only output the first line.
        elif class_name == "InsertSizeMetrics":
            # if the element counts is less than 21,
            # it means insertion metrics returns multiple line results.
            # NOTE(review): len(contents) < 21 is used to tell a list of row
            # dicts (few elements) apart from a single dict of ~21+ metric keys
            # — confirm against crimson's output shape
            if len(contents) < 21:
                rows = contents[0]
            else:
                rows = contents
        else:
            # other metrics(so far) only return one line results.
            rows = contents
        # drop identifying columns; keep only numeric/metric fields
        metrics[cell_id].update(
            {
                k: rows[k]
                for k in rows
                if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"]
            }
        )
        df = pd.DataFrame.from_dict(metrics, orient="columns")
        df.insert(0, "Class", class_name)
        d = pd.concat([d, df])
    # transpose so that rows are samples and columns are metric names
    d_T = d.T
    d_T.to_csv(output_name + ".csv")
def write_aggregated_picard_metrics_by_table(file_names, output_name):
    """Command line entrypoint to parse and write Picard table metrics.

    Parameters
    ----------
    file_names:
        array of files. The basename of inputs should be formatted as 'samplename_qc'.
    output_name:
        prefix of output file name; the basename of each output also
        includes the Picard metrics class name.
    """
    for file_name in file_names:
        # "sample_qc.error_summary_metrics.txt" -> sample id and metric class
        cell_id = os.path.basename(file_name).split("_qc")[0]
        class_name = os.path.basename(file_name).split(".")[1]
        parsed = picard.parse(file_name)
        dat = pd.DataFrame.from_dict(parsed["metrics"]["contents"])
        dat.insert(0, "Sample", cell_id)
        dat.to_csv(output_name + "_" + class_name + ".csv", index=False)


def write_aggregated_qc_metrics(file_names, output_name):
    """Command line entrypoint to merge Picard metrics along with RSEM and HISAT2 logs.

    Parameters
    ----------
    file_names:
        array of csv files to merge column-wise, such as Picard row metrics
        and hisat2 metrics, each indexed by sample id.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    df = pd.DataFrame()
    for file_name in file_names:
        dat = pd.read_csv(file_name, index_col=0)
        # outer join keeps samples that appear in only some of the inputs
        # (debug print statements removed)
        df = pd.concat([df, dat], axis=1, join="outer")
    df.to_csv(output_name + ".csv", index=True)
def parse_hisat2_log(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write HISAT2 logs.

    Parameters
    ----------
    file_names:
        array of HISAT2 log files. The basename indicates the alignment
        reference: 'samplename_qc.log' is the genome reference and
        'samplename_rsem.log' is the transcriptome reference alignment.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    metrics = {}
    tag = "NONE"
    for file_name in file_names:
        if "_qc" in file_name:
            cell_id = os.path.basename(file_name).split("_qc")[0]
            tag = "HISAT2G"
        elif "_rsem" in file_name:
            cell_id = os.path.basename(file_name).split("_rsem")[0]
            tag = "HISAT2T"
        else:
            # previously an unrecognized file name reused the previous file's
            # cell_id (overwriting its metrics) or raised NameError on the
            # first file; skip such files instead
            continue
        with open(file_name) as f:
            dat = f.readlines()
            # each "key: value ..." line becomes one metric; keep only the
            # first whitespace-delimited token of the value
            d = [x.strip().split(":") for x in dat]
            # remove the first row of each section.
            d.pop(0)
            metrics[cell_id] = {x[0]: x[1].strip().split(" ")[0] for x in d}
    df = pd.DataFrame.from_dict(metrics, orient="columns")
    df.insert(0, "Class", tag)
    df_T = df.T
    df_T.to_csv(output_name + ".csv")


def parse_rsem_cnt(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write RSEM cnt files.

    Parameters
    ----------
    file_names:
        array of RSEM cnt files. The basename of inputs should be
        'samplename_rsem.cnt'.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    metrics = {}
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split("_rsem")[0]
        with open(file_name) as f:
            # only the first three space-delimited lines of a .cnt file are used
            N0, N1, N2, N_tot = f.readline().strip().split(" ")
            n_unique, n_multi, n_uncertain = f.readline().strip().split(" ")
            n_hits, read_type = f.readline().strip().split(" ")
            metrics[cell_id] = {
                "unalignable reads": N0,
                "alignable reads": N1,
                "filtered reads": N2,
                "total reads": N_tot,
                "unique aligned": n_unique,
                "multiple mapped": n_multi,
                "total alignments": n_hits,
                "strand": read_type,
                "uncertain reads": n_uncertain,
            }
    df = pd.DataFrame.from_dict(metrics, orient="columns")
    df.insert(0, "Class", "RSEM")
    df_T = df.T
    df_T.to_csv(output_name + ".csv")
Sort each chunk by cell, gene, and molecule tags to ensure that all the reads associated with
   a molecule are stored sequentially by cell (`CalculateCellMetrics`) or by gene
   (`CalculateGeneMetrics`)
3. For each cell or gene, parse the information by molecule, which typically loads fewer than
   10,000 records into memory at a time.
4. Merge data across chunks using `MergeCellMetrics` or `MergeGeneMetrics`.

This map-reduce approach is currently implemented by the
[HCA 3' pipeline](https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/Optimus.wdl),
but an abbreviated WDL could be made in the future which would contain:

```
1. SplitBamByCellBarcode
2. scatter[CalculateMetrics]
3. MergeMetrics
```

## Implementation Details:

This module implements 4 base classes that carry out metric processing. These are:

```
MetricAggregator:
- CellMetricAggregator
- GeneMetricAggregator

MetricGatherer:
- CellMetricGatherer
- GeneMetricGatherer

MetricCSVWriter

MergeMetrics:
- MergeCellMetrics
- MergeGeneMetrics
```
MetricGatherer defines generator functions to group records into molecules, the bam parsing pattern
necessary to process data iteratively.

MetricAggregator stores the information for a unit of the relevant data (cell, gene),
and processes all the records with the `.parse_records()` method.

When all records of a single unit (cell, gene) have been processed, `.finalize()` is called to
calculate any higher-order metrics (for example, the variance in quality scores across reads of the
cell or gene), and it is written to file by `MetricCSVWriter`.

MergeMetrics merges multiple metric outputs from the scattered chunks. This is a trivial
concatenation in the case of cell metrics, and a more complex merge in the case of gene metrics.
class MetricGatherer:
    """Gathers Metrics from an experiment

    Because molecules tend to have relatively small numbers of reads, the memory footprint of
    this method is typically small (tens of megabytes).

    Parameters
    ----------
    bam_file : str
        the bam file containing the reads that metrics should be calculated from. Can be a chunk
        of cells or an entire experiment
    output_stem : str
        the file stem for the gzipped csv output
    mitochondrial_gene_ids : Set[str], optional
        gene ids counted as mitochondrial during finalization (default = empty set)
    compress : bool, optional
        whether the csv output is gzip-compressed (default = True)

    Methods
    -------
    extract_metrics
        extracts metrics from ``bam_file`` and writes them to output_stem.csv.gz

    """

    def __init__(
        self,
        bam_file: str,
        output_stem: str,
        # frozenset() instead of set(): an immutable default cannot suffer the
        # shared-mutable-default-argument pitfall
        mitochondrial_gene_ids: Set[str] = frozenset(),
        compress: bool = True,
    ):
        self._bam_file = bam_file
        self._output_stem = output_stem
        self._compress = compress
        # copy into a plain set so a caller-supplied collection is not aliased
        self._mitochondrial_gene_ids = set(mitochondrial_gene_ids)

    @property
    def bam_file(self) -> str:
        """the bam file that metrics are generated from"""
        return self._bam_file

    def extract_metrics(self, mode="rb") -> None:
        """extract metrics from the provided bam file and write the results to csv.

        Parameters
        ----------
        mode : {'r', 'rb'}, default 'rb'
            the open mode for pysam.AlignmentFile. 'r' indicates the input is a sam file, and 'rb'
            indicates a bam file.

        """
        # subclasses implement the actual gathering strategy
        raise NotImplementedError
class GatherCellMetrics(MetricGatherer):

    # NOTE(review): this class defines no docstring of its own, so the
    # `__doc__ += extra_docs` below resolves __doc__ from the module globals;
    # the class docstring becomes module docstring + extra_docs — confirm intended.
    # NOTE(review): the doctest below builds the path by concatenating '../'
    # onto os.path.abspath(__file__) without dirname — likely not a usable path.
    extra_docs = """
    Notes
    -----
    ``bam_file`` must be sorted by gene (``GE``), molecule (``UB``), and cell (``CB``), where gene
    varies fastest.

    Examples
    --------
    >>> from sctools.metrics.gatherer import GatherCellMetrics
    >>> import os, tempfile

    >>> # example data
    >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
    >>> temp_dir = tempfile.mkdtemp()
    >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
    >>> g.extract_metrics()

    See Also
    --------
    GatherGeneMetrics

    """

    __doc__ += extra_docs

    def extract_metrics(self, mode: str = "rb") -> None:
        """Extract cell metrics from self.bam_file

        Parameters
        ----------
        mode : str, optional
            Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').

        """
        # open the files; closing() guarantees the csv writer is closed even if
        # parsing raises part-way through
        with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
            MetricCSVWriter(self._output_stem, self._compress)
        ) as cell_metrics_output:

            # write the header
            cell_metrics_output.write_header(vars(CellMetrics()))

            # break up the bam file into sub-iterators over cell barcodes;
            # one aggregator accumulates all molecules of one cell
            for cell_iterator, cell_tag in iter_cell_barcodes(
                bam_iterator=bam_iterator
            ):
                metric_aggregator = CellMetrics()

                # break up cell barcodes by molecule barcodes
                for molecule_iterator, molecule_tag in iter_molecule_barcodes(
                    bam_iterator=cell_iterator
                ):

                    # break up molecule barcodes by gene ids
                    for gene_iterator, gene_tag in iter_genes(
                        bam_iterator=molecule_iterator
                    ):

                        # process the data
                        metric_aggregator.parse_molecule(
                            tags=(cell_tag, molecule_tag, gene_tag),
                            records=gene_iterator,
                        )

                # write a record for each cell
                metric_aggregator.finalize(
                    mitochondrial_genes=self._mitochondrial_gene_ids
                )
                cell_metrics_output.write(cell_tag, vars(metric_aggregator))
class GatherGeneMetrics(MetricGatherer):

    # NOTE(review): the Examples block below was copy-pasted from
    # GatherCellMetrics (it imports and constructs GatherCellMetrics), and the
    # See Also section references this class itself instead of
    # GatherCellMetrics — both look like copy-paste slips; confirm and update.
    extra_docs = """
    Notes
    -----
    ``bam_file`` must be sorted by molecule (``UB``), cell (``CB``), and gene (``GE``), where
    molecule varies fastest.

    Examples
    --------
    >>> from sctools.metrics.gatherer import GatherCellMetrics
    >>> import os, tempfile

    >>> # example data
    >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
    >>> temp_dir = tempfile.mkdtemp()
    >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
    >>> g.extract_metrics()

    See Also
    --------
    GatherGeneMetrics

    """

    __doc__ += extra_docs

    def extract_metrics(self, mode: str = "rb") -> None:
        """Extract gene metrics from self.bam_file

        Parameters
        ----------
        mode : str, optional
            Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').

        """
        # open the files; closing() guarantees the csv writer is closed even if
        # parsing raises part-way through
        with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
            MetricCSVWriter(self._output_stem, self._compress)
        ) as gene_metrics_output:

            # write the header
            gene_metrics_output.write_header(vars(GeneMetrics()))

            # break up the bam file into sub-iterators over gene ids;
            # one aggregator accumulates all reads of one gene
            for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
                metric_aggregator = GeneMetrics()

                # in case of multi-genes ignore as in the counting stage
                # (a comma-separated gene tag means the read maps to several genes)
                if gene_tag and len(gene_tag.split(",")) > 1:
                    continue

                # break up gene ids by cell barcodes
                for cell_iterator, cell_tag in iter_cell_barcodes(
                    bam_iterator=gene_iterator
                ):

                    # break up cell barcodes by molecular barcodes
                    for molecule_iterator, molecule_tag in iter_molecule_barcodes(
                        bam_iterator=cell_iterator
                    ):

                        # process the data
                        metric_aggregator.parse_molecule(
                            tags=(gene_tag, cell_tag, molecule_tag),
                            records=molecule_iterator,
                        )

                # write a record for each gene id
                metric_aggregator.finalize()
                gene_metrics_output.write(gene_tag, vars(metric_aggregator))
class MergeMetrics:
    """Merge multiple metrics files into one gzip-compressed csv file.

    Parameters
    ----------
    metric_files : Sequence[str]
        metrics files to merge
    output_file : str
        file name for the merged output; a ``.csv.gz`` suffix is appended when
        it is not already present

    Methods
    -------
    execute
        merge metrics files
        # todo this should probably be wrapped into __init__ to make this more like a function

    """

    def __init__(self, metric_files: Sequence[str], output_file: str):
        self._metric_files = metric_files
        suffix = ".csv.gz"
        self._output_file = (
            output_file if output_file.endswith(suffix) else output_file + suffix
        )

    def execute(self) -> None:
        # subclasses implement the actual merge
        raise NotImplementedError
class MergeCellMetrics(MergeMetrics):
    def execute(self) -> None:
        """Concatenate the input cell metric files.

        The source bam chunks contain disjoint sets of cells, so the per-cell
        metric tables can simply be stacked on top of one another.

        """
        frames: List[pd.DataFrame] = [
            pd.read_csv(name, index_col=0) for name in self._metric_files
        ]
        stacked: pd.DataFrame = pd.concat(frames, axis=0)
        stacked.to_csv(self._output_file, compression="gzip")
class MergeGeneMetrics(MergeMetrics):
    def execute(self) -> None:
        """Merge input gene metric files

        The bam files that metrics are calculated from contain disjoint sets of cells, each
        of which can measure the same genes.
        As a result, the metric values must be summed (count based metrics) averaged over
        (fractional, averge, or variance metrics) or recalculated (metrics that depend on other
        metrics).

        """

        # count-valued metrics: merging two chunks of the same gene is a plain sum
        count_data_to_sum = [
            "n_reads",
            "noise_reads",
            "perfect_molecule_barcodes",
            "reads_mapped_exonic",
            "reads_mapped_intronic",
            "reads_mapped_utr",
            "reads_mapped_uniquely",
            "reads_mapped_multiple",
            "duplicate_reads",
            "spliced_reads",
            "antisense_reads",
            "n_molecules",
            "n_fragments",
            "fragments_with_single_read_evidence",
            "molecules_with_single_read_evidence",
            "number_cells_detected_multiple",
            "number_cells_expressing",
        ]

        sum_operations = {c: "sum" for c in count_data_to_sum}

        def weighted_average(data_frame: pd.DataFrame) -> pd.Series:
            """Calculate the average of each metric, weighted by number of reads per chunk

            Parameters
            ----------
            data_frame : pd.DataFrame
                chunks x metrics data frame

            Returns
            -------
            weighted_average_metrics : pd.Series
                The average of each metric across chunks, weighted by the number of reads per chunk

            """
            # NOTE(review): weighting variances by read count is an
            # approximation, not an exact pooled variance — confirm acceptable.
            weights = data_frame["n_reads"].values

            columns_to_average_by_read = [
                "molecule_barcode_fraction_bases_above_30_mean",
                "molecule_barcode_fraction_bases_above_30_variance",
                "genomic_reads_fraction_bases_quality_above_30_mean",
                "genomic_reads_fraction_bases_quality_above_30_variance",
                "genomic_read_quality_mean",
                "genomic_read_quality_variance",
            ]

            return pd.Series(
                {
                    c: np.average(data_frame[c], weights=weights)
                    for c in columns_to_average_by_read
                }
            )

        def recalculate_operation(data_frame) -> pd.DataFrame:
            """Recalculate metrics that are dependent on other metric values

            Other metrics should be merged before this function is executed

            Parameters
            ----------
            data_frame : pd.DataFrame
                chunks x metrics data frame

            Returns
            -------
            recalculated_metrics : pd.DataFrame
                data frame containing recalculated metrics

            """
            return pd.DataFrame(
                data={
                    "reads_per_molecule": data_frame["n_reads"]
                    / data_frame["n_molecules"],
                    "fragments_per_molecule": data_frame["n_fragments"]
                    / data_frame["n_molecules"],
                    "reads_per_fragment": data_frame["n_reads"]
                    / data_frame["n_fragments"],
                }
            )

        # pick one file as a nucleus and merge each subsequent dataframe into it;
        # because n_reads is itself summed on each pass, the next pass's weighted
        # average re-weights by the accumulated read counts
        nucleus = pd.read_csv(self._metric_files[0], index_col=0)
        for filename in self._metric_files[1:]:
            leaf = pd.read_csv(filename, index_col=0)

            # concatenate this leaf with the nucleus metrics file
            concatenated = pd.concat([nucleus, leaf], axis=0)

            # group all duplicate gene names together
            # NOTE(review): the explicit axis= argument to groupby is deprecated
            # in pandas 2.x — confirm the pandas version this targets
            grouped = concatenated.groupby(level=0, axis=0)

            # execute the merging operations
            summed_columns = grouped.agg(sum_operations)
            averaged_columns = grouped.apply(weighted_average)

            # stitch the columns back together, add the metrics that need to be recalculated
            merged = pd.concat([summed_columns, averaged_columns], axis=1)
            recalculated_columns = recalculate_operation(merged)
            merged = pd.concat([merged, recalculated_columns], axis=1)

            # set as nucleus and continue
            nucleus = merged

        # write the data
        nucleus.to_csv(self._output_file, compression="gzip")
class MetricCSVWriter:
    """Writes metric information iteratively to (optionally compressed) csv.

    Parameters
    ----------
    output_stem : str
        File stem for the output file; the matching extension (``.csv`` or
        ``.csv.gz``) is appended when missing.
    compress : bool, optional
        Whether or not to compress the output file (default = True).

    Methods
    -------
    write_header
        Write the metric header to file.
    write
        Write an array of cell or gene metrics to file.
    close
        Close the metric file.

    """

    def __init__(self, output_stem: str, compress: bool = True):

        # check and fix extension:
        if compress:
            if not output_stem.endswith(".csv.gz"):
                output_stem += ".csv.gz"
        else:
            if not output_stem.endswith(".csv"):
                output_stem += ".csv"
        self._filename: str = output_stem

        # open the file
        if compress:
            self._open_fid: TextIO = gzip.open(self._filename, "wt")
        else:
            self._open_fid: TextIO = open(self._filename, "w")
        # populated by write_header(); None until then (was annotated as
        # List[str], which was wrong for the initial None value)
        self._header = None

    @property
    def filename(self) -> str:
        """filename with correct suffix added"""
        return self._filename

    def write_header(self, record: Mapping[str, Any]) -> None:
        """Write the metric keys to file, producing the header line of the csv file.

        Parameters
        ----------
        record : Mapping[str, Any]
            Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance,
            producing a dictionary of keys to metric values.

        """
        # keys with a leading underscore are private and are not written
        self._header = [key for key in record.keys() if not key.startswith("_")]
        self._open_fid.write("," + ",".join(self._header) + "\n")

    def write(self, index: str, record: Mapping[str, Number]) -> None:
        """Write the array of metric values for a cell or gene to file.

        Parameters
        ----------
        index : str
            The name of the cell or gene that these metrics summarize
        record : Mapping[str, Number]
            Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance,
            producing a dictionary of keys to metric values.

        Raises
        ------
        RuntimeError
            if called before ``write_header`` (previously an opaque TypeError
            from iterating None).

        """
        if self._header is None:
            raise RuntimeError("write_header must be called before write")
        ordered_fields = [str(record[k]) for k in self._header]

        # genes and cells can be None, call repr to convert to string when this induces a TypeError
        try:
            self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n")
        except TypeError:
            index = repr(index)
            self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n")

    def close(self) -> None:
        """Close the metrics file."""
        self._open_fid.close()
"""
Sequence File Iterators
=======================

.. currentmodule:: sctools

This module defines a general iterator and some helper functions for iterating over files
that contain sequencing data

Methods
-------
infer_open(file_: str, mode: str)
    helper function that determines the compression type of a file without relying on its extension
zip_readers(*readers, indices=None)
    helper function that iterates over one or more readers, optionally extracting only the records
    that correspond to indices

Classes
-------
Reader                        Basic reader that loops over one or more input files.

See Also
--------
sctools.gtf.Reader
sctools.fastq.Reader

"""

import os
import gzip
import bz2
from copy import copy
from functools import partial
from typing import Callable, Iterable, Generator, Set, List


def infer_open(file_: str, mode: str) -> Callable:
    """Helper function to infer the correct compression type of an input file

    Identifies files that are .gz or .bz2 compressed without requiring file extensions

    Parameters
    ----------
    file_ : str
        the file to open
    mode : {'r', 'rb'}
        the mode to open the file in. 'r' returns strings, 'rb' returns bytes

    Returns
    -------
    open_function : Callable
        the correct open function for the file's compression with mode pre-set through functools
        partial

    """
    # sniff the first three bytes: enough for both magic numbers below
    with open(file_, "rb") as f:
        data: bytes = f.read(3)

    # gz and bzip treat 'r' = bytes, 'rt' = string
    if data[:2] == b"\x1f\x8b":  # gzip magic number
        inferred_openhook: Callable = gzip.open
        inferred_mode: str = "rt" if mode == "r" else mode

    elif data == b"BZh":  # bz2 magic number
        inferred_openhook: Callable = bz2.open
        inferred_mode: str = "rt" if mode == "r" else mode

    else:
        inferred_openhook: Callable = open
        inferred_mode: str = mode

    return partial(inferred_openhook, mode=inferred_mode)


class Reader:
    """Basic reader object that seamlessly loops over multiple input files.

    Is subclassed to create readers for specific file types (e.g. fastq, gtf, etc.)

    Parameters
    ----------
    files : Union[str, List], optional
        The file(s) to read. If '-', read sys.stdin (default = '-')
    mode : {'r', 'rb'}, optional
        The open mode for files. If 'r', yield string data, if 'rb', yield bytes data
        (default = 'r').
    header_comment_char : str, optional
        If not None, skip lines beginning with this character (default = None).

    """

    def __init__(self, files="-", mode="r", header_comment_char=None):
        if isinstance(files, str):
            self._files = [files]
        elif isinstance(files, Iterable):  # test items of iterable
            files = list(files)
            if all(isinstance(f, str) for f in files):
                self._files = files
            else:
                raise TypeError("All passed files must be type str")
        else:
            raise TypeError("Files must be a string filename or a list of such names.")

        # set open mode:
        if mode not in {"r", "rb"}:
            raise ValueError("Mode must be one of 'r', 'rb'")
        self._mode = mode

        # in binary mode the comment prefix must be bytes to match the records
        if isinstance(header_comment_char, str) and mode == "rb":
            self._header_comment_char = header_comment_char.encode()
        else:
            self._header_comment_char = header_comment_char

    @property
    def filenames(self) -> List[str]:
        return self._files

    def __len__(self):
        """Return the length of the Reader object.

        Notes
        -----
        This function requires reading the complete file, and should typically not be
        used with sys.stdin, as it will consume the input.

        """
        return sum(1 for _ in self)

    def __iter__(self):
        for file_ in self._files:

            f = infer_open(file_, self._mode)(file_)

            # iterate over the file, dropping header lines if requested
            try:
                file_iterator = iter(f)
                if self._header_comment_char is not None:
                    # the None default stops cleanly on a file that contains
                    # nothing but header lines; a bare next() would raise
                    # StopIteration, which PEP 479 converts to RuntimeError
                    # inside this generator
                    first_record = next(file_iterator, None)
                    while first_record is not None and first_record.startswith(
                        self._header_comment_char
                    ):
                        first_record = next(file_iterator, None)
                    if first_record is None:
                        continue  # entire file was header lines

                    yield first_record  # avoid loss of first non-comment line

                yield from file_iterator  # now, run to exhaustion
            finally:  # clean up
                f.close()

    @property
    def size(self) -> int:
        """return the collective size of all files being read in bytes"""
        return sum(os.stat(f).st_size for f in self._files)

    def select_record_indices(self, indices: Set) -> Generator:
        """Iterate over provided indices only, skipping other records.

        Parameters
        ----------
        indices : Set[int]
            indices to include in the output

        Yields
        ------
        record, str
            records from file corresponding to indices

        """
        indices = copy(
            indices
        )  # passed indices is a reference, need own copy to modify
        for idx, record in enumerate(self):
            if idx in indices:
                yield record
                indices.remove(idx)

                # stopping condition
                if not indices:
                    break
def zip_readers(*readers, indices=None) -> Generator:
    """Zip together multiple reader objects, yielding records simultaneously.

    If indices is passed, only return lines in file that correspond to indices

    Parameters
    ----------
    *readers : List[Reader]
        Reader objects to simultaneously iterate over
    indices : Set[int], optional
        indices to include in the output

    Yields
    ------
    records : Tuple[str]
        one record per reader passed

    """
    # when indices are requested, restrict each reader to those records;
    # otherwise iterate the readers directly
    if indices:
        sources = (reader.select_record_indices(indices) for reader in readers)
    else:
        sources = readers
    yield from zip(*sources)
def base4_entropy(x, axis=1):
    """Calculate entropy in base four of a data matrix x

    Useful for measuring DNA entropy (with 4 nucleotides) as the output is restricted to [0, 1]

    Parameters
    ----------
    x : np.ndarray
        array of dimension one or more containing numeric types
    axis : int, optional
        axis to calculate entropy across. Values in this axis are treated as observation
        frequencies (default = 1)

    Returns
    -------
    entropy : np.ndarray
        array of input dimension - 1 containing entropy values bounded in [0, 1]

    """

    # convert to probabilities; keepdims makes the division broadcast correctly
    # for any axis and array rank (the previous code special-cased axis == 1 and
    # relied on implicit broadcasting otherwise, which fails for rank >= 3 with
    # a non-trailing axis). Identical results for the 2-D axis 0/1 cases.
    x = np.divide(x, np.sum(x, axis=axis, keepdims=True))

    with np.errstate(divide="ignore"):
        r = np.log(x) / np.log(4)

    # convention: 0 * log(0) = 0, != -INF.
    r[np.isinf(r)] = 0

    return np.abs(-1 * np.sum(x * r, axis=axis))


class OnlineGaussianSufficientStatistic:
    """
    Implementation of Welford's online mean and variance algorithm

    Methods
    -------
    update(new_value: float)
        incorporate new_value into the online estimate of mean and variance
    mean()
        return the mean value
    calculate_variance()
        calculate and return the variance
    mean_and_variance()
        return both mean and variance

    """

    __slots__ = ["_count", "_mean", "_mean_squared_error"]

    def __init__(self):
        # running sum of squared deviations from the current mean (M2 in
        # Welford's formulation)
        self._mean_squared_error: float = 0.0
        self._mean: float = 0.0
        self._count: int = 0

    def update(self, new_value: float) -> None:
        """Incorporate new_value into the running mean and squared error."""
        self._count += 1
        delta = new_value - self._mean
        self._mean += delta / self._count
        # delta2 uses the *updated* mean; delta * delta2 is Welford's
        # numerically stable increment for the sum of squared deviations
        delta2 = new_value - self._mean
        self._mean_squared_error += delta * delta2

    @property
    def mean(self) -> float:
        """return the mean value"""
        return self._mean

    def calculate_variance(self) -> float:
        """calculate and return the sample variance (NaN with < 2 observations)"""
        if self._count < 2:
            return float("nan")
        else:
            return self._mean_squared_error / (self._count - 1)

    def mean_and_variance(self) -> Tuple[float, float]:
        """calculate and return the mean and variance"""
        return self.mean, self.calculate_variance()
-------------------------------------------------------------------------------- /src/sctools/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/__init__.py -------------------------------------------------------------------------------- /src/sctools/test/data/cell-gene-umi-queryname-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-gene-umi-queryname-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell-sorted-missing-cb.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-sorted-missing-cb.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell_metrics_missing_cb.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell_metrics_missing_cb.csv.gz -------------------------------------------------------------------------------- /src/sctools/test/data/chr1.30k_records.gtf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/chr1.30k_records.gtf.gz -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/expected_picard_group.csv: -------------------------------------------------------------------------------- 1 | ,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_
READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.SECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 2 | 
Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMet
rics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics 3 | test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2.csv: -------------------------------------------------------------------------------- 1 | ,Aligned 0 time,Aligned 1 time,Aligned >1 
times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads 2 | Class,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G 3 | test,478,240,106,4414,652,412,1,95.64%,5479,824 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log: -------------------------------------------------------------------------------- 1 | HISAT2 summary stats: 2 | Total pairs: 5479 3 | Aligned concordantly or discordantly 0 time: 412 (7.52%) 4 | Aligned concordantly 1 time: 4414 (80.56%) 5 | Aligned concordantly >1 times: 652 (11.90%) 6 | Aligned discordantly 1 time: 1 (0.02%) 7 | Total unpaired reads: 824 8 | Aligned 0 time: 478 (58.01%) 9 | Aligned 1 time: 240 (29.13%) 10 | Aligned >1 times: 106 (12.86%) 11 | Overall alignment rate: 95.64% 12 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_trans.csv: -------------------------------------------------------------------------------- 1 | ,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads 2 | Class,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T 3 | test,7270,0,0,360,1484,3635,0,33.66%,5479,7270 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log: -------------------------------------------------------------------------------- 1 | HISAT2 summary stats: 2 | Total pairs: 5479 3 | Aligned concordantly or discordantly 0 time: 3635 (66.34%) 4 | Aligned concordantly 1 time: 360 (6.57%) 5 | Aligned 
concordantly >1 times: 1484 (27.09%) 6 | Aligned discordantly 1 time: 0 (0.00%) 7 | Total unpaired reads: 7270 8 | Aligned 0 time: 7270 (100.00%) 9 | Aligned 1 time: 0 (0.00%) 10 | Aligned >1 times: 0 (0.00%) 11 | Overall alignment rate: 33.66% 12 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_picard_group.csv: -------------------------------------------------------------------------------- 1 | ,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_
OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.SECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 2 | 
Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMet
rics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics 3 | test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## 
htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.AlignmentSummaryMetrics 7 | CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP 8 | FIRST_OF_PAIR 5479 5479 1 0 5256 0.959299 131124 4650 116063 115095 0 0.0009 0.000922 0.000069 25 5080 0.966514 190 0.036149 0 0.494292 0.006141 0 9 | SECOND_OF_PAIR 5479 5479 1 0 5224 0.953459 130281 4629 115487 114015 0 0.000852 0.000849 0.000038 25 5080 0.972435 158 0.030245 0 0.508806 0.006165 
0 10 | PAIR 10958 10958 1 0 10480 0.956379 261405 9279 231550 229110 0 0.000876 0.000885 0.000054 25 10160 0.969466 348 0.033206 0 0.501527 0.006153 0 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # MarkDuplicates INPUT=[/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam] OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.MarkDuplicated.bam METRICS_FILE=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:51:46 UTC 2018 5 | 6 | ## METRICS CLASS picard.sam.DuplicationMetrics 7 | LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE 8 | d20fb2dd-3d98-4516-a648-dee5e1917bd7 320 5080 
4393 478 33 21 0 0.007156 612743 9 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics 7 | REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE 8 | A C A>C 231512 16 0.000069 9 | A G A>G 231512 156 0.000673 10 | A T A>T 231512 16 0.000069 11 | C A C>A 173880 16 0.000092 12 | C G C>G 173880 14 0.000081 13 | C T C>T 173880 82 0.000471 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt: 
-------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.GcBiasSummaryMetrics 7 | ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP 8 | All Reads ALL 100 7701 14873 10.733266 1.82225 0.112713 0.817807 1.086361 2.181453 0.143318 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics 
INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.InsertSizeMetrics 7 | MEDIAN_INSERT_SIZE MEDIAN_ABSOLUTE_DEVIATION MIN_INSERT_SIZE MAX_INSERT_SIZE MEAN_INSERT_SIZE STANDARD_DEVIATION READ_PAIRS PAIR_ORIENTATION WIDTH_OF_10_PERCENT WIDTH_OF_20_PERCENT WIDTH_OF_30_PERCENT WIDTH_OF_40_PERCENT WIDTH_OF_50_PERCENT WIDTH_OF_60_PERCENT WIDTH_OF_70_PERCENT WIDTH_OF_80_PERCENT WIDTH_OF_90_PERCENT WIDTH_OF_99_PERCENT SAMPLE LIBRARY READ_GROUP 8 | 191 63 33 2725787 207.219528 106.256303 5067 FR 25 49 73 99 127 157 195 267 641 87835 9 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectRnaSeqMetrics 
REF_FLAT=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:51:55 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.RnaSeqMetrics 7 | PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP 8 | 273950 261405 0 56934 37664 101238 65569 0 0 0 719 795 60 0.474901 0.525099 0 0.2178 0.144083 0.387284 0.250833 0.361883 0.345311 0 0.939679 0.680576 0.705663 0.496023 9 | -------------------------------------------------------------------------------- 
/src/sctools/test/data/group_metrics/test_rsem.cnt: -------------------------------------------------------------------------------- 1 | 3635 1844 0 5479 2 | 1652 192 1484 3 | 6599 3 4 | 0 3635 5 | 1 360 6 | 2 327 7 | 3 416 8 | 4 243 9 | 5 185 10 | 6 85 11 | 7 76 12 | 8 53 13 | 9 16 14 | 10 83 15 | Inf 0 16 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_rsem.csv: -------------------------------------------------------------------------------- 1 | ,alignable reads,filtered reads,multiple mapped,strand,total alignments,total reads,unalignable reads,uncertain reads,unique aligned 2 | Class,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM 3 | test,1844,0,192,3,6599,5479,3635,1484,1652 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false 
USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.AlignmentSummaryMetrics 7 | CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP 8 | UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json 
USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:17 UTC 2019 5 | 6 | ## METRICS CLASS picard.sam.DuplicationMetrics 7 | LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE 8 | SRR6258488 770963 0 473100 315689 345396 0 0 0.448006 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.GcBiasSummaryMetrics 7 | ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 
GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP 8 | All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectRnaSeqMetrics REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.RnaSeqMetrics 7 | PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES 
PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP 8 | 54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0 9 | 10 | ## HISTOGRAM java.lang.Integer 11 | normalized_position All_Reads.normalized_coverage 12 | 0 1.252653 13 | 1 1.146108 14 | 2 1.065068 15 | 3 1.122433 16 | 4 1.234516 17 | 5 1.247113 18 | 6 1.2191 19 | 7 1.08917 20 | 8 1.101883 21 | 9 1.130302 22 | 10 1.082888 23 | 11 1.146879 24 | 12 1.173149 25 | 13 1.084206 26 | 14 1.035169 27 | 15 1.169359 28 | 16 1.278125 29 | 17 1.298059 30 | 18 1.418038 31 | 19 1.468055 32 | 20 1.306559 33 | 21 1.210198 34 | 22 0.953958 35 | 23 0.806139 36 | 24 0.815513 37 | 25 0.887045 38 | 26 0.763414 39 | 27 0.737914 40 | 28 0.702678 41 | 29 0.689913 42 | 30 0.633512 43 | 31 0.665368 44 | 32 0.682949 45 | 33 0.848599 46 | 34 0.941722 47 | 35 1.082228 48 | 36 1.113449 49 | 37 1.049003 50 | 38 0.97788 51 | 39 0.989931 52 | 40 0.92986 53 | 41 0.874432 54 | 42 0.87788 55 | 43 0.868871 56 | 44 0.92942 57 | 45 1.015775 58 | 46 1.070114 59 | 47 1.023889 60 | 48 1.023103 61 | 49 0.988576 62 | 50 0.931694 63 | 51 0.794716 64 | 52 0.765784 65 | 53 0.721218 66 | 54 0.723223 67 | 55 0.711507 68 | 56 0.704034 69 | 57 0.694139 70 | 58 0.741844 71 | 59 0.831505 72 | 60 0.806244 73 | 61 0.869419 74 | 62 0.987354 75 | 63 0.954176 76 | 64 0.925553 77 | 65 0.951851 78 | 66 0.906269 79 | 67 0.85666 80 | 68 0.985052 81 | 69 0.947861 82 | 70 0.98528 83 | 71 0.873541 84 | 72 0.87925 85 | 73 0.956294 86 | 74 1.137028 87 | 75 1.206313 88 | 76 1.148145 89 | 77 1.159051 90 | 78 1.207689 91 | 79 1.170334 92 | 80 1.199969 93 | 81 1.391121 94 | 82 1.243649 95 | 83 1.235795 96 | 84 1.227105 97 | 85 1.278662 98 | 86 1.298065 99 | 87 1.201038 100 | 88 1.2361 101 | 89 1.098932 102 | 90 1.042881 103 | 91 1.037875 104 | 92 0.95545 105 | 93 0.969215 
106 | 94 1.059149 107 | 95 0.857316 108 | 96 0.792585 109 | 97 0.817511 110 | 98 0.880909 111 | 99 0.786114 112 | 100 0.548663 113 | 114 | -------------------------------------------------------------------------------- /src/sctools/test/data/small-cell-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/small-cell-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/small-gene-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/small-gene-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.bam -------------------------------------------------------------------------------- /src/sctools/test/data/test.gtf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.gtf.bz2 -------------------------------------------------------------------------------- /src/sctools/test/data/test.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.gtf.gz -------------------------------------------------------------------------------- /src/sctools/test/data/test_i7.fastq: 
-------------------------------------------------------------------------------- 1 | @ST-K00126:308:HFLYFBBXX:1:1101:5172:1173 1:N:0:NCACAATG 2 | NCACAATG 3 | + 4 | #AAFFJFF 5 | @ST-K00126:308:HFLYFBBXX:1:1101:31314:1173 1:N:0:NCACAATG 6 | NCACAATG 7 | + 8 | #AAFFJA- 9 | @ST-K00126:308:HFLYFBBXX:1:1101:31984:1173 1:N:0:NCACAATG 10 | NCACAATG 11 | + 12 | #AAFFJAA 13 | @ST-K00126:308:HFLYFBBXX:1:1101:4574:1191 1:N:0:NCACAATG 14 | NCACAATG 15 | + 16 | #AAAFJFJ 17 | @ST-K00126:308:HFLYFBBXX:1:1101:8186:1191 1:N:0:NCACAATG 18 | NCACAATG 19 | + 20 | #AAFFJFJ 21 | @ST-K00126:308:HFLYFBBXX:1:1101:13159:1191 1:N:0:NCACAATG 22 | NCACAATG 23 | + 24 | #AAFFJJJ 25 | @ST-K00126:308:HFLYFBBXX:1:1101:17969:1191 1:N:0:NCACAATG 26 | NCACAATG 27 | + 28 | #AAFFJFJ 29 | @ST-K00126:308:HFLYFBBXX:1:1101:22942:1191 1:N:0:NCACAATG 30 | NCACAATG 31 | + 32 | #AAFFJAF 33 | @ST-K00126:308:HFLYFBBXX:1:1101:13190:1209 1:N:0:NCACAATG 34 | NCACAATG 35 | + 36 | #AAFFJJJ 37 | @ST-K00126:308:HFLYFBBXX:1:1101:15300:1209 1:N:0:NCACAATG 38 | NCACAATG 39 | + 40 | #AAFFJJF 41 | @ST-K00126:308:HFLYFBBXX:1:1101:19603:1209 1:N:0:NCACAATG 42 | NCACAATG 43 | + 44 | #AAAFJAA 45 | @ST-K00126:308:HFLYFBBXX:1:1101:20111:1209 1:N:0:NCACAATG 46 | NCACAATG 47 | + 48 | #AAFFJJJ 49 | @ST-K00126:308:HFLYFBBXX:1:1101:23926:1209 1:N:0:NCACAATG 50 | NCACAATG 51 | + 52 | #AAFFJJJ 53 | @ST-K00126:308:HFLYFBBXX:1:1101:2828:1226 1:N:0:NCACAATG 54 | NCACAATG 55 | + 56 | #AA-FFFF 57 | @ST-K00126:308:HFLYFBBXX:1:1101:8004:1226 1:N:0:NCACAATG 58 | NCACAATG 59 | + 60 | #AAFFJJJ 61 | @ST-K00126:308:HFLYFBBXX:1:1101:12814:1226 1:N:0:NCACAATG 62 | NCACAATG 63 | + 64 | #AAAFJJF 65 | @ST-K00126:308:HFLYFBBXX:1:1101:13240:1226 1:N:0:NCACAATG 66 | NCACAATG 67 | + 68 | #AAFFJJJ 69 | @ST-K00126:308:HFLYFBBXX:1:1101:15818:1226 1:N:0:NCACAATG 70 | NCACAATG 71 | + 72 | #AAFFJJJ 73 | @ST-K00126:308:HFLYFBBXX:1:1101:23652:1226 1:N:0:NCACAATG 74 | NCACAATG 75 | + 76 | #AAFFJJF 77 | @ST-K00126:308:HFLYFBBXX:1:1101:27793:1226 1:N:0:NCACAATG 78 | 
NCACAATG 79 | + 80 | #AAFFJJJ 81 | @ST-K00126:308:HFLYFBBXX:1:1101:29904:1226 1:N:0:NCACAATG 82 | NCACAATG 83 | + 84 | #AAAFJ-A 85 | @ST-K00126:308:HFLYFBBXX:1:1101:2920:1244 1:N:0:NCACAATG 86 | NCACAATG 87 | + 88 | #AAFFJAA 89 | @ST-K00126:308:HFLYFBBXX:1:1101:5010:1244 1:N:0:NCACAATG 90 | NCACAATG 91 | + 92 | #AAFFJJ< 93 | @ST-K00126:308:HFLYFBBXX:1:1101:11667:1244 1:N:0:NCACAATG 94 | NCACAATG 95 | + 96 | #AAF= 0) 59 | assert np.all(bd <= 1) 60 | 61 | 62 | def test_summarize_hamming_distances_gives_reasonable_results( 63 | short_barcode_set_from_iterable, 64 | ): 65 | 66 | hamming_summary = short_barcode_set_from_iterable.summarize_hamming_distances() 67 | 68 | # we know 10x barcodes have at least this much distance 69 | assert hamming_summary["minimum"] >= 2 70 | # no barcode can have more hamming distance than length 71 | assert all(v <= 16 for v in hamming_summary.values()) 72 | 73 | 74 | # TEST HashErrorsToCorrectBarcodes 75 | 76 | 77 | @pytest.fixture(scope="module") 78 | def trivial_whitelist(): 79 | barcode_iterable = ["A" * 8] 80 | error_mapping = barcode.ErrorsToCorrectBarcodesMap._prepare_single_base_error_hash_table( 81 | barcode_iterable 82 | ) 83 | return barcode.ErrorsToCorrectBarcodesMap(error_mapping) 84 | 85 | 86 | @pytest.fixture(scope="module") 87 | def truncated_whitelist_from_10x(): 88 | # note that this whitelist contains 1 non-10x barcode to ensure the presence of a matching 89 | # target in the test data. 
90 | error_mapping = barcode.ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( 91 | data_dir + "1k-august-2016.txt" 92 | ) 93 | return error_mapping 94 | 95 | 96 | def test_incorrect_input_raises_errors(trivial_whitelist): 97 | with pytest.raises(TypeError): 98 | barcode.ErrorsToCorrectBarcodesMap("not_a_mapping") 99 | with pytest.raises(TypeError): 100 | barcode.ErrorsToCorrectBarcodesMap({"not_a_mapping"}) 101 | with pytest.raises(TypeError): 102 | barcode.ErrorsToCorrectBarcodesMap(["not_a_mapping", "sldkf"]) 103 | assert isinstance(trivial_whitelist, barcode.ErrorsToCorrectBarcodesMap) 104 | 105 | 106 | def test_correct_barcode_finds_and_corrects_1_base_errors(trivial_whitelist): 107 | assert trivial_whitelist.get_corrected_barcode("TAAAAAAA") == "AAAAAAAA" 108 | assert trivial_whitelist.get_corrected_barcode("AAAACAAA") == "AAAAAAAA" 109 | assert trivial_whitelist.get_corrected_barcode("AAAGAAAA") == "AAAAAAAA" 110 | assert trivial_whitelist.get_corrected_barcode("AAAAAAAA") == "AAAAAAAA" 111 | 112 | 113 | def test_correct_barcode_raises_keyerror_when_barcode_not_correct_length( 114 | trivial_whitelist, 115 | ): 116 | with pytest.raises(KeyError): 117 | trivial_whitelist.get_corrected_barcode("AAA") 118 | with pytest.raises(KeyError): 119 | trivial_whitelist.get_corrected_barcode("AAAAAAAAA") 120 | with pytest.raises(KeyError): 121 | trivial_whitelist.get_corrected_barcode("AAAAAAAAAA") 122 | 123 | 124 | def test_correct_barcode_raises_keyerror_when_barcode_has_more_than_one_error( 125 | trivial_whitelist, 126 | ): 127 | with pytest.raises(KeyError): 128 | trivial_whitelist.get_corrected_barcode("AAAAAATT") 129 | with pytest.raises(KeyError): 130 | trivial_whitelist.get_corrected_barcode("TTAAAAAA") 131 | 132 | 133 | @pytest.fixture(scope="module") 134 | def tagged_bamfile(): 135 | outbam = data_dir + "bam_with_tags_test.bam" 136 | args = [ 137 | "--r1", 138 | data_dir + "test_r1.fastq", 139 | "--i1", 140 | data_dir + "test_i7.fastq", 141 | 
"--u2", 142 | data_dir + "test.bam", 143 | "--output-bamfile", 144 | outbam, 145 | ] 146 | platform.TenXV2.attach_barcodes(args) 147 | return outbam 148 | 149 | 150 | def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x): 151 | outbam = data_dir + "bam_with_cb_tags.bam" 152 | truncated_whitelist_from_10x.correct_bam(tagged_bamfile, outbam) 153 | success = False 154 | with pysam.AlignmentFile(outbam, "rb") as f: 155 | for record in f: 156 | try: 157 | success = record.get_tag(consts.CELL_BARCODE_TAG_KEY) 158 | except KeyError: 159 | continue 160 | assert success 161 | os.remove(outbam) 162 | -------------------------------------------------------------------------------- /src/sctools/test/test_encodings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .. import encodings 3 | from itertools import combinations 4 | 5 | 6 | @pytest.fixture(scope="module") 7 | def sequence(): 8 | return b"ACGTTTGAGATGAGATATAGANNNN" 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def encoder_2bit(sequence): 13 | length = len(sequence) 14 | return encodings.TwoBit(length) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def encoder_3bit(): 19 | return encodings.ThreeBit() 20 | 21 | 22 | @pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit]) 23 | def encoder(request): 24 | return request.param 25 | 26 | 27 | def test_two_bit_encode_decode_produces_same_string_except_for_N( 28 | sequence, encoder_2bit 29 | ): 30 | encoded = encoder_2bit.encode(sequence) 31 | decoded = encoder_2bit.decode(encoded) 32 | assert sequence[:4] == decoded[:4] # last 4 are N, which get randomized 33 | 34 | 35 | def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit): 36 | encoded = encoder_3bit.encode(sequence) 37 | decoded = encoder_3bit.decode(encoded) 38 | assert sequence == decoded 39 | 40 | 41 | def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit): 42 | 
sequence_no_n = b"AGCGCGAT" 43 | gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G") 44 | encoded = encoder_2bit.encode(sequence_no_n) 45 | assert encoder_2bit.gc_content(encoded) == gc_content 46 | 47 | 48 | def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit): 49 | encoded = encoder_3bit.encode(sequence) 50 | assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count( 51 | b"G" 52 | ) 53 | 54 | 55 | def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit): 56 | with pytest.raises(KeyError): 57 | encoder_2bit.encode(b"ACGTP") # P is not a valid code 58 | 59 | 60 | def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit): 61 | encoded = encoder_3bit.encode(b"ACGTP") # P is not a valid code 62 | decoded = encoder_3bit.decode(encoded) 63 | assert decoded == b"ACGTN" 64 | 65 | 66 | @pytest.fixture 67 | def simple_barcodes(): 68 | """simple barcode set with min_hamming = 1, max_hamming = 2""" 69 | return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"] 70 | 71 | 72 | @pytest.fixture 73 | def simple_hamming_distances(simple_barcodes): 74 | simple_hamming_distances = [] 75 | for a, b in combinations(simple_barcodes, 2): 76 | d_hamming = 0 77 | for i, j in zip(a, b): 78 | if i != j: 79 | d_hamming += 1 80 | simple_hamming_distances.append(d_hamming) 81 | return simple_hamming_distances 82 | 83 | 84 | def test_encoded_hamming_distance_is_accurate( 85 | simple_hamming_distances, simple_barcodes, encoder 86 | ): 87 | # encode simple barcodes 88 | tbe = encoder(4) 89 | encoded = [tbe.encode(b) for b in simple_barcodes] 90 | encoded_hamming_distances = [] 91 | 92 | # use hamming distance function 93 | for a, b in combinations(encoded, 2): 94 | encoded_hamming_distances.append(tbe.hamming_distance(a, b)) 95 | 96 | # verify they are the same as the simple function used in this file 97 | assert simple_hamming_distances == encoded_hamming_distances 98 | 
-------------------------------------------------------------------------------- /src/sctools/test/test_entrypoints.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import tempfile 4 | 5 | import numpy as np 6 | import pysam 7 | import pytest 8 | import scipy.sparse as sp 9 | 10 | from sctools import bam, platform, count, consts 11 | 12 | data_dir = os.path.split(__file__)[0] + "/data/" 13 | 14 | 15 | def test_Attach10XBarcodes_entrypoint(): 16 | args = [ 17 | "--r1", 18 | data_dir + "test_r1.fastq", 19 | "--i1", 20 | data_dir + "test_i7.fastq", 21 | "--u2", 22 | data_dir + "test.bam", 23 | "--output-bamfile", 24 | "test_tagged_bam.bam", 25 | ] 26 | 27 | rc = platform.TenXV2.attach_barcodes(args) 28 | assert rc == 0 29 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 30 | for alignment in f: 31 | # each alignment should now have a tag, and that tag should be a string 32 | assert isinstance( 33 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 34 | ) 35 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 36 | assert isinstance( 37 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 38 | ) 39 | assert isinstance( 40 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 41 | ) 42 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 43 | assert isinstance( 44 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 45 | ) 46 | os.remove("test_tagged_bam.bam") # clean up 47 | 48 | 49 | def test_Attach10XBarcodes_entrypoint_with_whitelist(): 50 | args = [ 51 | "--r1", 52 | data_dir + "test_r1.fastq", 53 | "--i1", 54 | data_dir + "test_i7.fastq", 55 | "--u2", 56 | data_dir + "test.bam", 57 | "--output-bamfile", 58 | "test_tagged_bam.bam", 59 | "--whitelist", 60 | data_dir + "1k-august-2016.txt", 61 | ] 62 | 63 | return_call = platform.TenXV2.attach_barcodes(args) 64 | assert 
return_call == 0 65 | success = False 66 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 67 | for alignment in f: 68 | if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): 69 | success = True 70 | # each alignment should now have a tag, and that tag should be a string 71 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 72 | assert isinstance( 73 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 74 | ) 75 | assert isinstance( 76 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 77 | ) 78 | assert isinstance( 79 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 80 | ) 81 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 82 | assert isinstance( 83 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 84 | ) 85 | assert success 86 | os.remove("test_tagged_bam.bam") # clean up 87 | 88 | 89 | def test_AttachBarcodes_entrypoint_with_whitelist(): 90 | # test of the BarcodePlatform.attach_barcodes entry point with 91 | # sample, cell, and molecule barcodes all specified 92 | args = [ 93 | "--r1", 94 | data_dir + "test_r1.fastq", 95 | "--i1", 96 | data_dir + "test_i7.fastq", 97 | "--u2", 98 | data_dir + "test.bam", 99 | "--output-bamfile", 100 | "test_tagged_bam.bam", 101 | "--whitelist", 102 | data_dir + "1k-august-2016.txt", 103 | "--sample-barcode-start-position", 104 | "0", 105 | "--sample-barcode-length", 106 | "8", 107 | "--cell-barcode-start-position", 108 | "0", 109 | "--cell-barcode-length", 110 | "16", 111 | "--molecule-barcode-start-position", 112 | "16", 113 | "--molecule-barcode-length", 114 | "7", # changed 10>7 intentionally for test 115 | ] 116 | 117 | return_call = platform.BarcodePlatform.attach_barcodes(args) 118 | assert return_call == 0 119 | success = False 120 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 121 | for alignment in f: 122 | if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): 
123 | success = True 124 | # each alignment should now have a tag, and that tag should be a string 125 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 126 | assert isinstance( 127 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 128 | ) 129 | assert isinstance( 130 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 131 | ) 132 | assert len(alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY)) == 7 133 | assert isinstance( 134 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 135 | ) 136 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 137 | assert isinstance( 138 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 139 | ) 140 | assert success 141 | os.remove("test_tagged_bam.bam") # clean up 142 | 143 | 144 | def test_split_bam(): 145 | tag_args = [ 146 | "--r1", 147 | data_dir + "test_r1.fastq", 148 | "--i1", 149 | data_dir + "test_i7.fastq", 150 | "--u2", 151 | data_dir + "test.bam", 152 | "--output-bamfile", 153 | "test_tagged_bam.bam", 154 | "--whitelist", 155 | data_dir + "1k-august-2016.txt", 156 | ] 157 | 158 | platform.TenXV2.attach_barcodes(tag_args) 159 | 160 | split_args = [ 161 | "--bamfile", 162 | "test_tagged_bam.bam", 163 | "--output-prefix", 164 | "test_tagged", 165 | "--subfile-size", 166 | "0.005", 167 | "--tags", 168 | consts.CELL_BARCODE_TAG_KEY, 169 | consts.RAW_CELL_BARCODE_TAG_KEY, 170 | ] 171 | 172 | return_call = platform.GenericPlatform.split_bam(split_args) 173 | assert return_call == 0 174 | 175 | for f in glob.glob("test_tagged*"): 176 | os.remove(f) 177 | 178 | 179 | def test_tag_sort_bam(): 180 | args = [ 181 | "-i", 182 | data_dir + "unsorted.bam", 183 | "-o", 184 | "test_sorted.bam", 185 | "-t", 186 | consts.CELL_BARCODE_TAG_KEY, 187 | consts.GENE_NAME_TAG_KEY, 188 | consts.MOLECULE_BARCODE_TAG_KEY, 189 | ] 190 | 191 | return_call = platform.GenericPlatform.tag_sort_bam(args) 192 | assert return_call == 0 193 | 194 | 
tag_keys = [ 195 | consts.CELL_BARCODE_TAG_KEY, 196 | consts.GENE_NAME_TAG_KEY, 197 | consts.MOLECULE_BARCODE_TAG_KEY, 198 | ] 199 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 200 | segments = f.fetch(until_eof=True) 201 | tag_sortable_records = ( 202 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 203 | ) 204 | bam.verify_sort(tag_sortable_records, tag_keys) 205 | 206 | for f in glob.glob("test_sorted*"): 207 | os.remove(f) 208 | 209 | 210 | def test_tag_sort_bam_dash_t_specified_multiple_times(): 211 | args = [ 212 | "-i", 213 | data_dir + "unsorted.bam", 214 | "-o", 215 | "test_sorted.bam", 216 | "-t", 217 | consts.CELL_BARCODE_TAG_KEY, 218 | "-t", 219 | consts.GENE_NAME_TAG_KEY, 220 | "-t", 221 | consts.MOLECULE_BARCODE_TAG_KEY, 222 | ] 223 | 224 | return_call = platform.GenericPlatform.tag_sort_bam(args) 225 | assert return_call == 0 226 | 227 | tag_keys = [ 228 | consts.CELL_BARCODE_TAG_KEY, 229 | consts.GENE_NAME_TAG_KEY, 230 | consts.MOLECULE_BARCODE_TAG_KEY, 231 | ] 232 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 233 | segments = f.fetch(until_eof=True) 234 | tag_sortable_record_generator = ( 235 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 236 | ) 237 | bam.verify_sort(tag_sortable_record_generator, tag_keys) 238 | 239 | for f in glob.glob("test_sorted*"): 240 | os.remove(f) 241 | 242 | 243 | def test_tag_sort_bam_no_tags(): 244 | args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"] 245 | 246 | return_call = platform.GenericPlatform.tag_sort_bam(args) 247 | assert return_call == 0 248 | 249 | tag_keys = [] 250 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 251 | segments = f.fetch(until_eof=True) 252 | tag_sortable_records = ( 253 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 254 | ) 255 | bam.verify_sort(tag_sortable_records, tag_keys) 256 | 257 | for f in glob.glob("test_sorted*"): 258 | os.remove(f) 259 | 260 | 261 
| def test_verify_bam_sort(): 262 | args = [ 263 | "-i", 264 | data_dir + "cell-gene-umi-queryname-sorted.bam", 265 | "-t", 266 | consts.CELL_BARCODE_TAG_KEY, 267 | consts.GENE_NAME_TAG_KEY, 268 | consts.MOLECULE_BARCODE_TAG_KEY, 269 | ] 270 | 271 | return_call = platform.GenericPlatform.verify_bam_sort(args) 272 | assert return_call == 0 273 | 274 | 275 | def test_verify_bam_sort_raises_error_on_unsorted(): 276 | args = [ 277 | "-i", 278 | data_dir + "unsorted.bam", 279 | "-t", 280 | consts.CELL_BARCODE_TAG_KEY, 281 | consts.GENE_NAME_TAG_KEY, 282 | consts.MOLECULE_BARCODE_TAG_KEY, 283 | ] 284 | 285 | with pytest.raises(bam.SortError): 286 | platform.GenericPlatform.verify_bam_sort(args) 287 | 288 | 289 | def test_count_merge(): 290 | tmp = tempfile.mkdtemp() 291 | 292 | data, ind, col = [np.arange(10)] * 3 293 | matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr() 294 | # be lazy and reuse the inds as the col and row index 295 | counts = count.CountMatrix(matrix, ind, col) 296 | counts.save(tmp + "/test_input_1") 297 | counts.save(tmp + "/test_input_2") 298 | 299 | merge_args = [ 300 | "-o", 301 | tmp + "/test_merged_counts", 302 | "-i", 303 | tmp + "/test_input_2", 304 | tmp + "/test_input_1", 305 | ] 306 | return_call = platform.GenericPlatform.merge_count_matrices(merge_args) 307 | assert return_call == 0 308 | -------------------------------------------------------------------------------- /src/sctools/test/test_fastq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | from functools import partial 4 | from itertools import product 5 | 6 | import pytest 7 | 8 | from .. 
import os
import string
from functools import partial
from itertools import product

import pytest

from .. import fastq, consts
from ..reader import zip_readers

# useful module-level globals for the tests below
data_dir = os.path.split(__file__)[0] + "/data/"


def _data_paths(*names):
    """Prefix each test-data basename with the data directory."""
    return [data_dir + name for name in names]


_i7_files = _data_paths("test_i7.fastq", "test_i7.fastq.gz", "test_i7.fastq.bz2")
_files = _data_paths("test_i7.fastq", "test_r1.fastq", "test_r2.fastq")
_gz_files = _data_paths("test_i7.fastq.gz", "test_r1.fastq.gz", "test_r2.fastq.gz")
_bz2_files = _data_paths("test_i7.fastq.bz2", "test_r1.fastq.bz2", "test_r2.fastq.bz2")

_modes = ("r", "rb")
_files_and_modes = list(product(_i7_files, _modes))
_multifiles_and_modes = list(product((_files, _gz_files, _bz2_files), _modes))
_map_encoder = {"r": str, "rb": partial(bytes, encoding="utf-8")}


# TEST READER


@pytest.fixture(scope="module", params=_files_and_modes)
def i7_files_compressions_and_modes(request):
    """Yield (filename, mode) pairs across compression types and read modes."""
    filename, mode = request.param
    return filename, mode


@pytest.fixture(scope="module", params=_multifiles_and_modes)
def reader_all_compressions(request):
    """Yield an open fastq Reader for each compression type and read mode."""
    file_group, mode = request.param
    return fastq.Reader(file_group, mode)


@pytest.fixture(scope="module")
def bytes_fastq_record():
    return [b"@name\n", b"ACTACAAT\n", b"+\n", b"%%%%AAAA\n"]


@pytest.fixture(scope="module")
def string_fastq_record():
    return ["@name\n", "ACTACAAT\n", "+\n", "%%%%AAAA\n"]


def test_reader_stores_filenames():
    expected_names = ["notreal", "fake"]
    reader = fastq.Reader(files=expected_names)
    assert reader.filenames == expected_names


def test_reader_reads_first_record(reader_all_compressions):
    """The first record of every reader matches the known first sequence."""
    for record in reader_all_compressions:
        assert isinstance(record, fastq.Record)
        if isinstance(record.sequence, str):
            expected_sequence = "NCACAATG\n"
        else:
            expected_sequence = b"NCACAATG\n"
        assert record.sequence == expected_sequence
        break  # only the first record matters here
| "NCACAATG\n" if isinstance(record.sequence, str) else b"NCACAATG\n" 67 | ) 68 | assert record.sequence == expected_result 69 | break # just first record 70 | 71 | 72 | def test_reader_skips_header_character_raises_value_error( 73 | i7_files_compressions_and_modes, 74 | ): 75 | """ 76 | test should skip the first name line, shifting each record up 1. As a result, the 77 | first sequence should be found in the name field 78 | """ 79 | filename, mode = i7_files_compressions_and_modes 80 | rd = fastq.Reader(filename, mode=mode, header_comment_char="@") 81 | with pytest.raises(ValueError): 82 | next(iter(rd)) 83 | 84 | 85 | def test_reader_reads_correct_number_of_records_across_multiple_files( 86 | reader_all_compressions, 87 | ): 88 | assert len(reader_all_compressions) == 300 # 3 files 89 | 90 | 91 | def test_mixed_filetype_read_gets_correct_record_number(): 92 | rd = fastq.Reader([_gz_files[0], _bz2_files[0]], mode="r", header_comment_char="#") 93 | 94 | assert len(rd) == 200 95 | 96 | 97 | def test_non_string_filename_raises_typeerror(): 98 | with pytest.raises(TypeError): 99 | _ = fastq.Reader(10, "r") 100 | 101 | 102 | def test_non_string_filename_in_iterable_raises_typeerror(): 103 | with pytest.raises(TypeError): 104 | _ = fastq.Reader(("works", 10), "r") 105 | 106 | 107 | def test_invalid_open_mode_raises_valueerror(): 108 | with pytest.raises(ValueError): 109 | _ = fastq.Reader("works", "not_acceptable_open_mode") 110 | 111 | 112 | def test_fastq_returns_correct_filesize_for_single_and_multiple_files(): 113 | rd = fastq.Reader( 114 | _i7_files[0], mode="r", header_comment_char="#" # mode irrelevant 115 | ) 116 | assert rd.size == 7774 117 | 118 | rd = fastq.Reader(_i7_files, mode="r", header_comment_char="#") # mode irrelevant 119 | assert rd.size == 7774 + 853 + 802 # three file sizes 120 | 121 | 122 | def test_reader_properly_subsets_based_on_indices(): 123 | rd = fastq.Reader(_i7_files[0], mode="r") 124 | indices = {0, 5, 10, 12} 125 | n_records = sum(1 
def test_zipping_readers_generates_expected_output():
    """Two readers over the same file yield identical records in lockstep."""
    first_reader = fastq.Reader(_files[0], "r")
    second_reader = fastq.Reader(_files[0], "r")
    for r1, r2 in zip_readers(first_reader, second_reader):
        assert isinstance(r1, fastq.Record)
        assert isinstance(r2, fastq.Record)
        assert r1.sequence == r2.sequence == "NCACAATG\n"
        break  # only the first record matters here


def test_zipping_readers_with_indices_generates_expected_output():
    """zip_readers honors an index subset while keeping readers in lockstep."""
    first_reader = fastq.Reader(_files[0], "r")
    second_reader = fastq.Reader(_files[0], "r")
    for r1, r2 in zip_readers(first_reader, second_reader, indices={0, 1, 2, 3}):
        assert isinstance(r1, fastq.Record)
        assert isinstance(r2, fastq.Record)
        assert r1.sequence == r2.sequence == "NCACAATG\n"
        break  # only the first record matters here


def test_printing_bytes_record_generates_valid_fastq_record(bytes_fastq_record):
    record = fastq.Record(bytes_fastq_record)
    joined = b"".join(bytes_fastq_record)
    assert str(record) == joined.decode()
    assert bytes(record) == joined


def test_bytes_fastq_record_quality_score_parsing(bytes_fastq_record):
    record = fastq.Record(bytes_fastq_record)
    assert record.average_quality() == 18


def test_printing_string_record_generates_valid_fastq_record(string_fastq_record):
    record = fastq.StrRecord(string_fastq_record)
    joined = "".join(string_fastq_record)
    assert str(record) == joined
    assert bytes(record) == joined.encode()


def test_string_fastq_record_quality_score_parsing(string_fastq_record):
    record = fastq.StrRecord(string_fastq_record)
    assert record.average_quality() == 18


# TEST RECORD


def test_fields_populate_properly(reader_all_compressions):
    """Every record's four fields conform to the fastq format, in str or bytes."""
    encoder = _map_encoder[reader_all_compressions._mode]
    name_prefix = encoder("@")
    alphabet = set(encoder("ACGTN"))
    name2_string = encoder("+\n")
    ascii_chars = set(encoder(string.printable))
    for record in reader_all_compressions:
        assert record.name.startswith(name_prefix)
        assert all(base in alphabet for base in record.sequence.strip())
        assert record.name2 == name2_string
        assert all(char in ascii_chars for char in record.quality.strip())
# TEST BarcodeGeneratorWithCorrectedCellbarcodes


@pytest.fixture(scope="function")
def embedded_barcode_generator():
    """Generator yielding cell (0:16) and molecule (16:26) barcodes from test_r1."""
    cell_barcode = fastq.EmbeddedBarcode(
        start=0,
        end=16,
        quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
    )
    molecule_barcode = fastq.EmbeddedBarcode(
        start=16,
        end=26,
        quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
    )
    return fastq.EmbeddedBarcodeGenerator(
        data_dir + "test_r1.fastq.gz", [cell_barcode, molecule_barcode]
    )


@pytest.fixture(scope="function")
def barcode_generator_with_corrected_cell_barcodes():
    """Same as embedded_barcode_generator, but correcting cell barcodes
    against the 1k-august-2016 whitelist."""
    cell_barcode = fastq.EmbeddedBarcode(
        start=0,
        end=16,
        quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
    )
    molecule_barcode = fastq.EmbeddedBarcode(
        start=16,
        end=26,
        quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
    )
    return fastq.BarcodeGeneratorWithCorrectedCellBarcodes(
        data_dir + "test_r1.fastq.gz",
        cell_barcode,
        data_dir + "1k-august-2016.txt",
        [molecule_barcode],
    )


def test_embedded_barcode_generator_produces_outputs_of_expected_size(
    embedded_barcode_generator,
):
    """Each emitted tag tuple carries the configured barcode length and type."""
    for cell_seq, cell_qual, umi_seq, umi_qual in embedded_barcode_generator:

        # correct values
        correct_cell_barcode_length = 16
        correct_umi_length = 10

        # note that all barcodes are strings and therefore should get 'Z' values

        # test cell tags
        assert cell_seq[0] == consts.RAW_CELL_BARCODE_TAG_KEY
        assert len(cell_seq[1]) == correct_cell_barcode_length
        assert all(v in "ACGTN" for v in cell_seq[1])
        assert cell_seq[2] == "Z"
        assert cell_qual[0] == consts.QUALITY_CELL_BARCODE_TAG_KEY
        assert len(cell_qual[1]) == correct_cell_barcode_length
        assert all(v in string.printable for v in cell_qual[1])
        # fixed copy-paste bug: this previously re-checked cell_seq[2],
        # leaving the quality tag's value type unverified
        assert cell_qual[2] == "Z"

        # test umi tags
        assert umi_seq[0] == consts.RAW_MOLECULE_BARCODE_TAG_KEY
        assert len(umi_seq[1]) == correct_umi_length
        assert all(v in "ACGTN" for v in umi_seq[1])
        assert umi_seq[2] == "Z"
        assert umi_qual[0] == consts.QUALITY_MOLECULE_BARCODE_TAG_KEY
        assert len(umi_qual[1]) == correct_umi_length
        assert all(v in string.printable for v in umi_qual[1])
        # fixed copy-paste bug: previously re-checked umi_seq[2]
        assert umi_qual[2] == "Z"

        break  # just the first tag is fine


def test_corrects_barcodes(barcode_generator_with_corrected_cell_barcodes):
    """At least one emitted tag set carries the corrected cell barcode key."""
    success = False
    for barcode_sets in barcode_generator_with_corrected_cell_barcodes:
        for barcode_set in barcode_sets:
            if barcode_set[0] == consts.CELL_BARCODE_TAG_KEY:
                success = True
                break
    assert success
import os
from itertools import chain

import pytest

from .. import gtf

_data_dir = os.path.split(__file__)[0] + "/data"
_files = [
    f"{_data_dir}/{name}" for name in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")
]


@pytest.fixture(scope="module", params=_files)
def files(request):
    """Return a single gtf filename (plain, gzip, or bz2 compressed)."""
    return request.param


def test_opens_file_reads_first_line(files):
    reader = gtf.Reader(files, "r", header_comment_char="#")
    first_record = next(iter(reader))
    assert isinstance(first_record, gtf.GTFRecord)


def test_opens_file_populates_fields_properly(files):
    """The first record's fixed fields and attributes match the known test data."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))

    assert record.seqname == "chr19"
    assert record.chromosome == "chr19"
    assert record.source == "HAVANA"
    assert record.feature == "gene"
    assert record.start == 60951
    assert record.end == 71626
    assert record.score == "."
    assert record.strand == "-"
    assert record.frame == "."

    expected_features = {
        "gene_id": "ENSG00000282458.1",
        "gene_type": "transcribed_processed_pseudogene",
        "gene_status": "KNOWN",
        "gene_name": "WASH5P",
        "level": "2",
        "havana_gene": "OTTHUMG00000180466.8",
    }
    assert record._attributes == expected_features

    # every attribute key and value must appear in the serialized record
    record_string = str(record)
    assert all(
        item in record_string
        for item in chain(expected_features.keys(), expected_features.values())
    )
def test_set_attribute_verify_included_in_output_string(files):
    """A newly set attribute round-trips through get_attribute and str()."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))

    record.set_attribute("test_attr", "foo")
    assert record.get_attribute("test_attr") == "foo"
    # the new value must also appear in the serialized record
    assert "foo" in str(record)


def test_opens_file_parses_size(files):
    """size is end - start; a record with swapped coordinates raises ValueError."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))
    assert record.size == 71626 - 60951

    # mangle the record by swapping start and end, then confirm size raises
    record._fields[3:5] = [record.end, record.start]
    with pytest.raises(ValueError):
        getattr(record, "size")
data_dir = os.path.split(__file__)[0] + "/data/"


def test_attach_barcodes():
    """High-level test of the AttachBarcodes command.

    Runs the CLI entry point and checks that each read in the output BAM
    carries sample/cell/molecule barcode tags of the configured lengths.
    """
    temp_dir_name = tempfile.mkdtemp()

    # Construct cli arguments to pass to the command.
    # os.path.join keeps the output INSIDE the temp directory; the previous
    # string concatenation (mkdtemp returns no trailing separator) dropped the
    # file next to the directory instead.
    temp_output_bam = os.path.join(temp_dir_name, "output.bam")

    args = [
        "--r1",
        data_dir + "test_r1.fastq",
        "--u2",
        data_dir + "test_r2.bam",
        "--i1",
        # NOTE(review): the checked-in test data contains test_i7.fastq, not
        # test_i1.fastq -- confirm this fixture file actually exists
        data_dir + "test_i1.fastq",
        "--o",
        temp_output_bam,
        "--sample-barcode-start-pos",
        "0",
        "--sample-barcode-length",
        "8",
        "--cell-barcode-start-pos",
        "0",
        "--cell-barcode-length",
        "16",
        "--molecule-barcode-start-pos",
        "16",
        "--molecule-barcode-length",
        "4",
    ]

    platform.BarcodePlatform.attach_barcodes(args)

    with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile:
        for read in samfile:
            # each attached tag must have the configured barcode length
            assert len(read.get_tag("CR")) == 16  # raw cell barcode
            assert len(read.get_tag("CY")) == 16  # cell barcode quality
            assert len(read.get_tag("UR")) == 4  # raw molecule barcode
            assert len(read.get_tag("UY")) == 4  # molecule barcode quality
            assert len(read.get_tag("SR")) == 8  # raw sample barcode
            assert len(read.get_tag("SY")) == 8  # sample barcode quality
def test_concentrated_data_produces_entropy_0():
    """All probability mass on a single base yields zero entropy."""
    assert stats.base4_entropy([1, 0, 0, 0], axis=0) == 0


def test_concentrated_unnormalized_data_produces_entropy_0():
    """Scaling a concentrated distribution must not change its (zero) entropy."""
    assert stats.base4_entropy([1000, 0, 0, 0], axis=0) == 0


def test_balanced_data_produces_entropy_1():
    """A uniform distribution over four bases yields maximal base-4 entropy."""
    assert stats.base4_entropy([0.25, 0.25, 0.25, 0.25], axis=0) == 1


def test_balanced_unnormalized_data_produces_entropy_1():
    """Scaling a uniform distribution must not change its (maximal) entropy."""
    assert stats.base4_entropy([20, 20, 20, 20], axis=0) == 1