├── .circleci └── config.yml ├── .dockerignore ├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docker_build.sh ├── docs ├── README.md └── source │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── readme.rst │ ├── sctools.metrics.rst │ ├── sctools.rst │ └── sctools.test.rst ├── fastqpreprocessing ├── .gitignore ├── Makefile ├── patches │ ├── BgzfFileType.cpp.patch │ ├── FastQFile.cpp.patch │ ├── Makefile.patch │ └── general.Makefile.patch ├── src │ ├── example-run.sh │ ├── fastq_common.cpp │ ├── fastq_common.h │ ├── fastq_metrics.cpp │ ├── fastq_metrics.h │ ├── fastq_slideseq.cpp │ ├── fastqprocess.cpp │ ├── htslib_tagsort.cpp │ ├── htslib_tagsort.h │ ├── input_options.cpp │ ├── input_options.h │ ├── metricgatherer.cpp │ ├── metricgatherer.h │ ├── samplefastq.cpp │ ├── tagsort.cpp │ ├── utilities.cpp │ └── utilities.h └── utils │ ├── big-run.sh │ ├── check_barcode_partition.py │ ├── create_fastq.sh │ ├── example-run.sh │ └── run.sh ├── pull_request_template.md ├── readthedocs.yml ├── requirements.txt ├── security.txt ├── setup.py └── src └── sctools ├── __init__.py ├── bam.py ├── barcode.py ├── consts.py ├── count.py ├── encodings.py ├── fastq.py ├── groups.py ├── gtf.py ├── metrics ├── README.md ├── __init__.py ├── aggregator.py ├── gatherer.py ├── merge.py └── writer.py ├── platform.py ├── reader.py ├── stats.py └── test ├── __init__.py ├── characterize-cell-testing-data.ipynb ├── characterize-gene-testing-data.ipynb ├── data ├── 1k-august-2016.txt ├── cell-gene-umi-queryname-sorted.bam ├── cell-sorted-missing-cb.bam ├── cell-sorted.bam ├── cell_metrics_missing_cb.csv.gz ├── chr1.30k_records.gtf.gz ├── group_metrics │ ├── expected_picard_group.csv │ ├── test_hisat2.csv │ ├── test_hisat2_paired_end_qc.log │ ├── test_hisat2_trans.csv │ ├── test_hisat2_transcriptome_rsem.log │ ├── test_picard_group.csv │ ├── test_qc.alignment_summary_metrics.txt │ ├── test_qc.duplicate_metrics.txt │ ├── 
test_qc.error_summary_metrics.txt │ ├── test_qc.gc_bias.summary_metrics.txt │ ├── test_qc.insert_size_metrics.txt │ ├── test_qc.rna_metrics.txt │ ├── test_rsem.cnt │ └── test_rsem.csv ├── group_metrics_unpaired_ss2 │ ├── SRR6258488_qc.alignment_summary_metrics.txt │ ├── SRR6258488_qc.duplicate_metrics.txt │ ├── SRR6258488_qc.gc_bias.summary_metrics.txt │ └── SRR6258488_qc.rna_metrics.txt ├── small-cell-sorted.bam ├── small-gene-sorted.bam ├── test.bam ├── test.gtf ├── test.gtf.bz2 ├── test.gtf.gz ├── test.sam ├── test_i7.fastq ├── test_i7.fastq.bz2 ├── test_i7.fastq.gz ├── test_r1.fastq ├── test_r1.fastq.bz2 ├── test_r1.fastq.gz ├── test_r2.bam ├── test_r2.fastq ├── test_r2.fastq.bz2 ├── test_r2.fastq.gz ├── test_r2_tagged.bam └── unsorted.bam ├── test_bam.py ├── test_barcode.py ├── test_count.py ├── test_encodings.py ├── test_entrypoints.py ├── test_fastq.py ├── test_groups.py ├── test_gtf.py ├── test_metrics.py ├── test_platform.py └── test_stats.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | - image: circleci/python:3.6.1 11 | 12 | # Specify service dependencies here if necessary 13 | # CircleCI maintains a library of pre-built images 14 | # documented at https://circleci.com/docs/2.0/circleci-images/ 15 | # - image: circleci/postgres:9.4 16 | 17 | working_directory: ~/repo 18 | 19 | steps: 20 | - checkout 21 | 22 | # Download and cache dependencies 23 | - restore_cache: 24 | keys: 25 | - v1-dependencies-{{ checksum "requirements.txt" }} 26 | # fallback to using the latest cache if no exact match is found 27 | - v1-dependencies- 28 | 29 | - run: 30 | name: install dependencies 31 | command: | 32 | python3 -m venv venv 33 | . 
venv/bin/activate 34 | pip install -r requirements.txt 35 | pip install codecov 36 | 37 | - save_cache: 38 | paths: 39 | - ./venv 40 | key: v1-dependencies-{{ checksum "requirements.txt" }} 41 | 42 | # run tests! 43 | # https://pytest.org 44 | # And upload reports to codecov.io 45 | - run: 46 | name: linting test 47 | command: | 48 | . venv/bin/activate 49 | # Check Black code style compliance 50 | black ./ --skip-string-normalization --check --exclude venv 51 | # Check PEP-8 compliance 52 | flake8 53 | 54 | - run: 55 | name: run tests 56 | command: | 57 | . venv/bin/activate 58 | mkdir test-reports 59 | pytest --junitxml=test-reports/junit.xml --cov=sctools 60 | codecov 61 | 62 | - store_artifacts: 63 | path: test-reports 64 | destination: test-reports 65 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | #files ignored when building docker image 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # We ignore the following PEP-8 styles: 2 | 3 | # E203 whitespace before ‘:’ 4 | # E266 too many leading ‘#’ for block comment 5 | # E501 line too long (82 > 79 characters) (^) 6 | # W503 line break occurred before a binary operator 7 | # F841 local variable is assigned to but never used 8 | # W605 invalid escape sequence (causes false alarms around regex) 9 | 10 | # Note: (^) These checks can be disabled at the 11 | # line level using the # noqa special comment. 12 | # This possibility should be reserved for special cases. 
13 | 14 | [flake8] 15 | ignore = E203, E266, E501, W503, F841, W605 16 | max-complexity = 18 17 | select = B,C,E,F,W,T4,B9 18 | exclude = 19 | # No need to traverse the virtualenv directory, which'll be created by Circle CI 20 | venv 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.idea 6 | .pytest_cache 7 | 8 | # C extensions 9 | *.so 10 | *.o 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | test/data/bam_with_tags_test.bam 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | docs/generated 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | *.DS_Store 104 | 105 | # do not check in the executable and bam file 106 | fastqpreprocessing/src/fastqprocess 107 | fastqpreprocessing/src/TagSort 108 | fastqpreprocessing/src/obj/ 109 | fastqpreprocessing/bin/ 110 | src/sctools/test/data/bam_with_tags_test.bam 111 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 19.3b0 4 | hooks: 5 | - id: black 6 | language_version: python3.6 7 | # Using args here is not recommended by Black: 8 | # https://black.readthedocs.io/en/stable/version_control_integration.html 9 | # But since we only have one argument here, and 10 | # we don't force developers to use editor plugins, 11 | # putting the args here seems to be fine 12 | args: [./, --skip-string-normalization] 13 | 14 | - repo: 
https://gitlab.com/pycqa/flake8 15 | rev: 3.7.7 16 | hooks: 17 | - id: flake8 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.7 2 | 3 | LABEL maintainer="Farzaneh Khajouei " \ 4 | software="sctools v.1.0.0" \ 5 | description="A collection of tools for single cell data. Splitting fastq files based on cellbarcodes and other tools to compute metrics on single cell data using barcodes and UMIs." 6 | 7 | 8 | RUN apt-get update && apt-get upgrade -y && apt-get install -y patch libhdf5-dev vim apt-utils 9 | RUN mkdir /sctools/ 10 | 11 | COPY . /sctools 12 | 13 | ARG htslib_version="1.13" 14 | 15 | RUN cd /sctools/fastqpreprocessing &&\ 16 | wget https://github.com/khajoue2/libStatGen/archive/refs/tags/v1.0.15.broad.tar.gz &&\ 17 | wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 &&\ 18 | tar -zxvf v1.0.15.broad.tar.gz &&\ 19 | tar -jxvf htslib-${htslib_version}.tar.bz2 &&\ 20 | mv libStatGen-1.0.15.broad libStatGen 21 | 22 | RUN cd /sctools/fastqpreprocessing &&\ 23 | wget http://www.cs.unc.edu/Research/compgeom/gzstream/gzstream.tgz &&\ 24 | tar -xvf gzstream.tgz 25 | 26 | RUN cd /sctools/fastqpreprocessing &&\ 27 | make -C libStatGen 28 | 29 | RUN cd /sctools/fastqpreprocessing && make -C htslib-${htslib_version}/ && make -C gzstream 30 | 31 | RUN cd /sctools/fastqpreprocessing && mkdir bin obj && make install 32 | 33 | RUN cp /sctools/fastqpreprocessing/bin/* /usr/local/bin/ 34 | 35 | WORKDIR usr/local/bin/sctools 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Human Cell Atlas Authors, https://humancellatlas.org 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name Broad Institute, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/sctools/test/data/* 2 | include README.rst 3 | include LICENSE -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Single Cell Tools 2 | 
################# 3 | 4 | .. image:: https://img.shields.io/circleci/project/github/HumanCellAtlas/sctools.svg?label=Unit%20Test%20on%20Circle%20CI%20&style=flat-square&logo=circleci 5 | :target: https://circleci.com/gh/HumanCellAtlas/sctools/tree/master 6 | :alt: Unit Test Status 7 | 8 | .. image:: https://img.shields.io/codecov/c/github/HumanCellAtlas/sctools/master.svg?label=Test%20Coverage&logo=codecov&style=flat-square 9 | :target: https://codecov.io/gh/HumanCellAtlas/sctools 10 | :alt: Test Coverage on Codecov 11 | 12 | .. image:: https://img.shields.io/readthedocs/sctools/latest.svg?label=ReadtheDocs%3A%20Latest&logo=Read%20the%20Docs&style=flat-square 13 | :target: http://sctools.readthedocs.io/en/latest/?badge=latest 14 | :alt: Documentation Status 15 | 16 | .. image:: https://img.shields.io/snyk/vulnerabilities/github/HumanCellAtlas/sctools/requirements.txt.svg?label=Snyk%20Vulnerabilities&logo=Snyk 17 | :target: https://snyk.io/test/github/HumanCellAtlas/sctools/?targetFile=requirements.txt 18 | :alt: Snyk Vulnerabilities for GitHub Repo (Specific Manifest) 19 | 20 | .. image:: https://img.shields.io/github/release/HumanCellAtlas/sctools.svg?label=Latest%20Release&style=flat-square&colorB=green 21 | :target: https://github.com/HumanCellAtlas/sctools/releases 22 | :alt: Latest Release 23 | 24 | .. image:: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square 25 | :target: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square 26 | :alt: License 27 | 28 | .. image:: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue 29 | :target: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue 30 | :alt: Language 31 | 32 | .. 
image:: https://img.shields.io/badge/Code%20Style-black-000000.svg?style=flat-square 33 | :target: https://github.com/ambv/black 34 | :alt: Code Style 35 | 36 | Single Cell Tools provides utilities for manipulating sequence data formats suitable for use in 37 | distributed systems analyzing large biological datasets. 38 | 39 | Download and Installation 40 | ========================= 41 | 42 | .. code bash 43 | git clone https://github.com/humancellatlas/sctools.git 44 | cd sctools 45 | pip3 install . 46 | pytest # verify installation; run tests 47 | 48 | sctools Package 49 | =============== 50 | 51 | The sctools package provides both command line utilities and classes designed for use in python 52 | programs. 53 | 54 | Command Line Utilities 55 | ====================== 56 | 57 | 1. Attach10XBarcodes: Attached barcodes stored in fastq files to reads in an unaligned bam file 58 | 2. SplitBam: Split a bam file into chunks, guaranteeing that cells are contained in 1 chunk 59 | 3. CalculateGeneMetrics: Calculate information about genes in an experiment or chunk 60 | 4. CalculateCellMetrics: Calculate information about cells in an experiment or chunk 61 | 5. MergeGeneMetrics: Merge gene metrics calculated from different chunks of an experiment 62 | 6. MergeCellMetrics Merge cell metrics calculated from different chunks of an experiment 63 | 64 | Main Package Classes 65 | ==================== 66 | 67 | 1. **Platform**: an abstract class that defines a common data structure for different 3' sequencing 68 | formats. All algorithms and methods in this package that are designed to work on 3' sequencing data 69 | speak to this common data structure. Currently 10X_v2 is defined. 70 | 71 | 2. **Reader**: a general iterator over arbitrarily zipped file(s) that is extended to work with common 72 | sequence formats like fastq (fastq.Reader) and gtf (gtf.Reader). We recommend using the pysam 73 | package for reading sam and bam files. 74 | 75 | 3. 
**TwoBit & ThreeBit** DNA encoders that store DNA in 2- and 3-bit form. 2-bit is smaller but 76 | randomizes "N" nucleotides. Both classes support fastq operations over common sequence tasks such 77 | as the calculation of GC content. 78 | 79 | 4. **ObservedBarcodeSet & PriorBarcodeSet**: classes for analysis and comparison of sets of barcodes 80 | such as the cell barcodes used by 10X genomics. Supports operations like summarizing hamming 81 | distances and comparing observed sequence diversity to expected (normally uniform) diversity. 82 | 83 | 5. **gtf.Reader & gtf.Record** GTF iterator and GTF record class that exposes the gtf 84 | fields as a lightweight, lazy-parsed python object. 85 | 86 | 6. **fastq.Reader & fastq.Record** fastq reader and fastq record class that exposes the fastq fields 87 | as a lightweight, lazy-parsed python object. 88 | 89 | 7. **Metrics** calculate information about the genes and cells of an experiment 90 | 91 | 8. **Bam** Split bam files into chunks and attach barcodes as tags 92 | 93 | 94 | Viewing Test Results and Coverage 95 | ================================= 96 | 97 | To calculate and view test coverage cd to the ``sctools`` directory and 98 | type the following two commands to generate the report and open it in your web browser: 99 | 100 | .. code:: bash 101 | 102 | pytest --cov-report html:cov_html --cov=sctools 103 | open cov_html/index.html 104 | 105 | Definitions 106 | =========== 107 | 108 | Several definitions are helpful to understand how sequence data is analyzed. 109 | 110 | 1. **Cell**: an individual cell, the target of single-cell RNA-seq experiments and the entity that we 111 | wish to characterize 112 | 113 | 2. **Capture Primer**: A DNA oligonucleotide containing amplification machinery, a fixed cell barcode, 114 | a random molecule barcode, and an oligo-dT tail to capture poly-adenylated RNA 115 | 116 | 3. 
**Molecule**: A molecule refers to a single mRNA molecule that is captured by an oligo-dT capture 117 | primer in a single-cell sequencing experiment 118 | 119 | 4. **Molecule Barcode**: A molecule barcode (alias: UMI, RMT) is a short, random DNA barcode attached 120 | to the capture primer that has adequate length to be probabilistically unique across the experiment. 121 | Therefore, when multiple molecules of the same gene are captured in the same cell, they can be 122 | differentiated through having different molecule barcodes. The proposed GA4GH standard tag for a 123 | molecule barcode is UB and molecule barcode qualities is UY 124 | 125 | 5. **Cell Barcode**: A short DNA barcode that is typically selected from a whitelist of barcodes that 126 | will be used in an experiment. All capture primers for a given cell will contain the same cell 127 | barcode. The proposed GA4GH standard tag for a cell barcode is CB and cell barcode qualities is CY 128 | 129 | 6. **Fragment**: During library construction, mRNA molecules captured on capture primers are amplified, 130 | and the resulting amplified oligonucleotides are fragmented. In 3' experiments, only the fragment 131 | that contains the 3' end is retained, but the break point will be random, which means fragments 132 | often have different lengths. Once sequenced, different fragments can be identified as unique 133 | combinations of cell barcode, molecule barcode, the chromosome the sequence aligns to, and the 134 | position it aligns to on that chromosome, after correcting for clipping that the aligner may add 135 | 136 | 7. **Bam/Sam file**: The GA4GH standard file type for the storage of aligned sequencing reads. 
137 | Unless specified, our Single Cell Tools will operate over bam files containing either aligned or 138 | unaligned reads 139 | 140 | Development 141 | =========== 142 | 143 | Code Style 144 | ---------- 145 | The sctools code base is complying with the PEP-8 and using `Black `_ to 146 | format our code, in order to avoid "nitpicky" comments during the code review process so we spend more time discussing about the logic, 147 | not code styles. 148 | 149 | In order to enable the auto-formatting in the development process, you have to spend a few seconds setting 150 | up the ``pre-commit`` the first time you clone the repo: 151 | 152 | 1. Install ``pre-commit`` by running: ``pip install pre-commit`` (or simply run ``pip install -r requirements.txt``). 153 | 2. Run `pre-commit install` to install the git hook. 154 | 155 | Once you successfully install the ``pre-commit`` hook to this repo, the Black linter/formatter will be automatically triggered and run on this repo. Please make sure you followed the above steps, otherwise your commits might fail at the linting test! 156 | 157 | If you really want to manually trigger the linters and formatters on your code, make sure ``Black`` and ``flake8`` are installed in your Python environment and run ``flake8 DIR1 DIR2`` and ``black DIR1 DIR2 --skip-string-normalization`` respectively. 
158 | -------------------------------------------------------------------------------- /docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Update version when changes to Dockerfile are made 5 | DOCKER_IMAGE_VERSION=1.0.0 6 | TIMESTAMP=$(date +"%s") 7 | DIR=$(cd $(dirname $0) && pwd) 8 | 9 | # Registries and tags 10 | GCR_URL="us.gcr.io/broad-gotc-prod/sctools" 11 | 12 | # sctools version 13 | SCTOOLS_VERSION="v0.3.15" 14 | 15 | # Necessary tools and help text 16 | TOOLS=(docker gcloud) 17 | HELP="$(basename "$0") [-h|--help] [-v|--version] [-t|tools] -- script to build the sctools image and push to GCR & Quay 18 | 19 | where: 20 | -h|--help Show help text 21 | -v|--version Version of Samtools to use (default: $SCTOOLS_VERSION) 22 | -t|--tools Show tools needed to run script 23 | " 24 | 25 | function main(){ 26 | for t in "${TOOLS[@]}"; do which "$t" >/dev/null || ok=no; done 27 | if [[ $ok == no ]]; then 28 | echo "Missing one of the following tools: " 29 | for t in "${TOOLS[@]}"; do echo "$t"; done 30 | exit 1 31 | fi 32 | 33 | while [[ $# -gt 0 ]] 34 | do 35 | key="$1" 36 | case $key in 37 | -v|--version) 38 | SCTOOLS_VERSION="$2" 39 | shift 40 | shift 41 | ;; 42 | -h|--help) 43 | echo "$HELP" 44 | exit 0 45 | ;; 46 | -t|--tools) 47 | for t in "${TOOLS[@]}"; do echo "$t"; done 48 | exit 0 49 | ;; 50 | *) 51 | shift 52 | ;; 53 | esac 54 | done 55 | 56 | IMAGE_TAG="$DOCKER_IMAGE_VERSION-$SCTOOLS_VERSION-$TIMESTAMP" 57 | 58 | echo "building and pushing GCR Image - $GCR_URL:$IMAGE_TAG" 59 | docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \ 60 | --build-arg SCTOOLS_VERSION="$SCTOOLS_VERSION" "$DIR" 61 | docker push "$GCR_URL:$IMAGE_TAG" 62 | 63 | echo -e "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv" 64 | echo "done" 65 | } 66 | 67 | main "$@" 68 | -------------------------------------------------------------------------------- /docs/README.md: 
-------------------------------------------------------------------------------- 1 | # Build Docs 2 | 3 | 1. Make sure you have [Sphinx](http://www.sphinx-doc.org/en/stable/) installed. 4 | 2. Install the sctools package in advance following the instructions. 5 | 3. From the current directory (/docs/), type: 6 | 7 | ```bash 8 | make target 9 | ``` 10 | where `target` is one of {html, epub, latex, ...}. For more details about the sphinx builders, check [here](http://www.sphinx-doc.org/en/master/man/sphinx-build.html) 11 | 12 | Note that there are still some bugs to be worked out. 13 | - There are warnings about: 14 | ``` 15 | WARNING: [autosummary] failed to import 'sctools.metrics.CellMetrics': no module named sctools.metrics.CellMetrics 16 | WARNING: [autosummary] failed to import 'sctools.metrics.GeneMetrics': no module named sctools.metrics.GeneMetrics 17 | WARNING: [autosummary] failed to import 'sctools.metrics.MetricAggregatorBase': no module named sctools.metrics.MetricAggregatorBase 18 | ``` 19 | 20 | - There are a bunch of warnings: `WARNING: Unexpected section title.` 21 | - There are a bunch of warnings: `WARNING: toctree contains reference to nonexisting document` 22 | 23 | Most of the warnings can be solved by refactoring the docstrings and standardize the usages of `autosummary` later. 24 | -------------------------------------------------------------------------------- /docs/source/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SCTools 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('..')) 18 | from pkg_resources import get_distribution 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'SC Tools' 24 | copyright = '2018, Ambrose J. Carr' 25 | author = 'Ambrose J. Carr' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = get_distribution('sctools').version 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. 
They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.doctest', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.viewcode', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.autosummary', 49 | ] 50 | 51 | # Add any paths that contain templates here, relative to this directory. 52 | templates_path = ['_templates'] 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # 57 | # source_suffix = ['.rst', '.md'] 58 | source_suffix = ['.rst', '.md'] 59 | 60 | # The master toctree document. 61 | master_doc = 'index' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path . 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. See the documentation for 82 | # a list of builtin themes. 83 | # 84 | html_theme = 'sphinx_rtd_theme' 85 | 86 | # Theme options are theme-specific and customize the look and feel of a theme 87 | # further. For a list of options available for each theme, see the 88 | # documentation. 89 | # 90 | # html_theme_options = {} 91 | 92 | # Add any paths that contain custom static files (such as style sheets) here, 93 | # relative to this directory. 
They are copied after the builtin static files, 94 | # so a file named "default.css" will overwrite the builtin "default.css". 95 | html_static_path = ['_static'] 96 | 97 | # Custom sidebar templates, must be a dictionary that maps document names 98 | # to template names. 99 | # 100 | # The default sidebars (for documents that don't match any pattern) are 101 | # defined by theme itself. Builtin themes are using these templates by 102 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 103 | # 'searchbox.html']``. 104 | # 105 | # html_sidebars = {} 106 | 107 | 108 | # -- Options for HTMLHelp output --------------------------------------------- 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'SCToolsdoc' 112 | 113 | 114 | # -- Options for LaTeX output ------------------------------------------------ 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | # The font size ('10pt', '11pt' or '12pt'). 121 | # 122 | # 'pointsize': '10pt', 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'SCTools.tex', 'SC Tools Documentation', 'Ambrose J. Carr', 'manual') 136 | ] 137 | 138 | 139 | # -- Options for manual page output ------------------------------------------ 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 
143 | man_pages = [(master_doc, 'sctools', 'SC Tools Documentation', [author], 1)] 144 | 145 | 146 | # -- Options for Texinfo output ---------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | ( 153 | master_doc, 154 | 'SCTools', 155 | 'SC Tools Documentation', 156 | author, 157 | 'SCTools', 158 | 'One line description of project.', 159 | 'Miscellaneous', 160 | ) 161 | ] 162 | 163 | 164 | # -- Extension configuration ------------------------------------------------- 165 | numpydoc_show_class_members = False 166 | autosummary_generate = True 167 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 1 3 | :caption: Overview 4 | 5 | readme 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | :caption: API References 10 | 11 | sctools 12 | sctools.metrics 13 | sctools.test 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../README.rst 2 | -------------------------------------------------------------------------------- /docs/source/sctools.metrics.rst: -------------------------------------------------------------------------------- 1 | sctools.metrics package 2 | ======================= 3 | 4 | Submodules 5 | ~~~~~~~~~~ 6 | 7 | sctools.metrics.aggregator module 8 | --------------------------------- 9 | 10 | .. 
automodule:: sctools.metrics.aggregator 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | :inherited-members: 15 | 16 | sctools.metrics.gatherer module 17 | ------------------------------- 18 | 19 | .. automodule:: sctools.metrics.gatherer 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | :inherited-members: 24 | 25 | sctools.metrics.merge module 26 | ---------------------------- 27 | 28 | .. automodule:: sctools.metrics.merge 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | :inherited-members: 33 | 34 | sctools.metrics.writer module 35 | ----------------------------- 36 | 37 | .. automodule:: sctools.metrics.writer 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | :inherited-members: 42 | -------------------------------------------------------------------------------- /docs/source/sctools.rst: -------------------------------------------------------------------------------- 1 | sctools package 2 | =============== 3 | 4 | 5 | Submodules 6 | ~~~~~~~~~~ 7 | 8 | sctools.bam module 9 | ------------------ 10 | 11 | .. automodule:: sctools.bam 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | :inherited-members: 16 | 17 | sctools.barcode module 18 | ---------------------- 19 | 20 | .. automodule:: sctools.barcode 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | :inherited-members: 25 | 26 | sctools.encodings module 27 | ------------------------ 28 | 29 | .. automodule:: sctools.encodings 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | :inherited-members: 34 | 35 | sctools.fastq module 36 | -------------------- 37 | 38 | .. automodule:: sctools.fastq 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | :inherited-members: 43 | 44 | sctools.gtf module 45 | ------------------ 46 | 47 | .. 
automodule:: sctools.gtf 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | :inherited-members: 52 | 53 | sctools.platform module 54 | ----------------------- 55 | 56 | .. automodule:: sctools.platform 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | :inherited-members: 61 | 62 | sctools.reader module 63 | --------------------- 64 | 65 | .. automodule:: sctools.reader 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | :inherited-members: 70 | 71 | sctools.stats module 72 | -------------------- 73 | 74 | .. automodule:: sctools.stats 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | :inherited-members: 79 | -------------------------------------------------------------------------------- /docs/source/sctools.test.rst: -------------------------------------------------------------------------------- 1 | sctools.test package 2 | ==================== 3 | 4 | Submodules 5 | ~~~~~~~~~~ 6 | 7 | sctools.test.test\_bam module 8 | ----------------------------- 9 | 10 | .. automodule:: sctools.test.test_bam 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | sctools.test.test\_barcode module 16 | --------------------------------- 17 | 18 | .. automodule:: sctools.test.test_barcode 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | sctools.test.test\_encodings module 24 | ----------------------------------- 25 | 26 | .. automodule:: sctools.test.test_encodings 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | sctools.test.test\_entrypoints module 32 | ------------------------------------- 33 | 34 | .. automodule:: sctools.test.test_entrypoints 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | sctools.test.test\_fastq module 40 | ------------------------------- 41 | 42 | .. 
automodule:: sctools.test.test_fastq 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | sctools.test.test\_gtf module 48 | ----------------------------- 49 | 50 | .. automodule:: sctools.test.test_gtf 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | sctools.test.test\_metrics module 56 | --------------------------------- 57 | 58 | .. automodule:: sctools.test.test_metrics 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | sctools.test.test\_stats module 64 | ------------------------------- 65 | 66 | .. automodule:: sctools.test.test_stats 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /fastqpreprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.a 4 | *.bak 5 | dox/ 6 | dox_errors.txt 7 | *# 8 | *nohup.txt 9 | -------------------------------------------------------------------------------- /fastqpreprocessing/Makefile: -------------------------------------------------------------------------------- 1 | IDIR1 = libStatGen/include 2 | IDIR2 = htslib-1.13 3 | IDIR3 = gzstream 4 | 5 | CC = g++ -std=c++17 -fPIC -DHTSLIB -Wall -O4 -Wwrite-strings 6 | 7 | CFLAGS = -I$(IDIR1) -LlibStatGen -Lgzstream 8 | 9 | LIBS = -LlibStatGen -lStatGen -lz -lpthread -lstdc++fs -Lgzstream -lgzstream 10 | 11 | _DEPS = src/utilities.h src/input_options.h src/fastq_common.h 12 | 13 | TARGET1 = bin/fastqprocess 14 | TARGET1_OBJ = obj/fastqprocess.o 15 | 16 | TARGET2 = bin/TagSort 17 | TARGET2_OBJ = obj/tagsort.o obj/htslib_tagsort.o obj/metricgatherer.o 18 | 19 | TARGET3 = bin/fastq_slideseq 20 | TARGET3_OBJ = obj/fastq_slideseq.o 21 | 22 | TARGET4 = bin/fastq_metrics 23 | TARGET4_OBJ = obj/fastq_metrics.o 24 | 25 | TARGET5 = bin/samplefastq 26 | TARGET5_OBJ = obj/samplefastq.o 27 | 28 | install: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 29 | cp 
htslib-1.13/*.so.? bin/ 30 | 31 | all: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 32 | 33 | COMMON_OBJ = obj/utilities.o obj/input_options.o obj/fastq_common.o 34 | 35 | obj/%.o: src/%.cpp $(_DEPS) 36 | $(CC) -c -o $@ $< -I$(IDIR1) -I$(IDIR2) -I$(IDIR3) 37 | 38 | $(TARGET1): $(COMMON_OBJ) $(TARGET1_OBJ) 39 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 40 | 41 | $(TARGET2): $(COMMON_OBJ) $(TARGET2_OBJ) 42 | $(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. -o $@ $(COMMON_OBJ) $(TARGET2_OBJ) $(LIBS) -Lhtslib-1.13 -lhts 43 | 44 | $(TARGET3): $(COMMON_OBJ) $(TARGET3_OBJ) 45 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 46 | 47 | $(TARGET4): $(COMMON_OBJ) $(TARGET4_OBJ) 48 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 49 | 50 | $(TARGET5): $(COMMON_OBJ) $(TARGET5_OBJ) 51 | $(CC) -o $@ $^ $(CFLAGS) $(LIBS) 52 | 53 | .PHONY: clean 54 | clean: 55 | rm -f obj/*.o *~ core $(INCDIR)/*~ *.o *.so *.a 56 | rm -rf $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) 57 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/BgzfFileType.cpp.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen/general/BgzfFileType.cpp 2015-07-08 20:03:23.000000000 +0000 2 | +++ /tmp/BgzfFileType.cpp 2020-11-03 12:25:36.168474179 +0000 3 | @@ -23,7 +23,7 @@ 4 | #include "BgzfFileType.h" 5 | 6 | // Default to require the EOF block at the end of the file. 
7 | -bool BgzfFileType::ourRequireEofBlock = true; 8 | +bool BgzfFileType::ourRequireEofBlock = false; 9 | 10 | BgzfFileType::BgzfFileType(const char * filename, const char * mode) 11 | { 12 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/FastQFile.cpp.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/fastq/FastQFile.cpp 2015-07-08 20:03:23.000000000 +0000 2 | +++ ../libStatGen/FastQFile.cpp 2020-09-17 19:35:48.797593411 +0000 3 | @@ -489,6 +489,7 @@ 4 | // Check to see if the sequenceIdentifier is a repeat by adding 5 | // it to the set and seeing if it already existed. 6 | std::pair::iterator,bool> insertResult; 7 | + /* 8 | insertResult = 9 | myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 10 | myLineNum)); 11 | @@ -505,6 +506,7 @@ 12 | reportErrorOnLine(); 13 | return(false); 14 | } 15 | + */ 16 | } 17 | 18 | // Valid, return true. 19 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/Makefile.patch: -------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/Makefile 2015-07-08 20:03:23.000000000 +0000 2 | +++ ../libStatGen/Makefile 2020-09-03 14:15:41.904210140 +0000 3 | @@ -2,7 +2,8 @@ 4 | 5 | .PHONY: package 6 | 7 | -SUBDIRS=general bam fastq glf samtools vcf 8 | +#SUBDIRS=general bam fastq glf samtools vcf 9 | +SUBDIRS=general fastq samtools bam 10 | 11 | include Makefiles/Makefile.base 12 | 13 | @@ -16,7 +17,8 @@ 14 | general: samtools 15 | 16 | # other subdirectories depend on general 17 | -bam fastq glf vcf: general 18 | +#bam fastq glf vcf: general 19 | +bam fastq : general 20 | 21 | RELEASE_FILE?=libStatGen.$(VERSION).tgz 22 | 23 | -------------------------------------------------------------------------------- /fastqpreprocessing/patches/general.Makefile.patch: 
-------------------------------------------------------------------------------- 1 | --- libStatGen-1.0.14/general/Makefile 2020-09-17 20:29:00.320563968 +0000 2 | +++ ../libStatGen/Makefile.general 2020-09-17 20:57:47.982915972 +0000 3 | @@ -8,7 +8,7 @@ 4 | # an error, but allow unused results and variables for the 5 | # time being. 6 | # 7 | - USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) 8 | + USER_WARNINGS ?= $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) 9 | #-Wno-strict-overflow 10 | # -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi) 11 | endif 12 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/example-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./fastqprocess --verbose \ 3 | --bam-size 0.001 \ 4 | --barcode-length 16 \ 5 | --umi-length 10 \ 6 | --sample-id L8TX \ 7 | --white-list ../../../data/L8TX/737K-august-2016.txt \ 8 | --I1 ../../../data/L8TX/A_I1.fastq.gz \ 9 | --R1 ../../../data/L8TX/A_R1.fastq.gz \ 10 | --R2 ../../../data/L8TX/A_R2.fastq.gz \ 11 | --I1 ../../../data/L8TX/B_I1.fastq.gz \ 12 | --R1 ../../../data/L8TX/B_R1.fastq.gz \ 13 | --R2 ../../../data/L8TX/B_R2.fastq.gz \ 14 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_common.h: -------------------------------------------------------------------------------- 1 | #ifndef __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 2 | #define __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "FastQFile.h" 12 | #include "FastQStatus.h" 13 | #include "SamFile.h" 14 | #include "SamValidation.h" 15 | 16 | // A pointer to a valid SamRecord waiting to be written to disk, and the index 17 | // 
of the g_read_arenas that pointer should be released to after the write. 18 | using PendingWrite = std::pair; 19 | 20 | class WriteQueue 21 | { 22 | public: 23 | static constexpr int kShutdown = -1; 24 | PendingWrite dequeueWrite(); 25 | void enqueueWrite(PendingWrite write); 26 | void enqueueShutdownSignal(); 27 | private: 28 | std::mutex mutex_; 29 | std::condition_variable cv_; 30 | std::queue queue_; 31 | }; 32 | 33 | // This is a hack for the sake of samplefastq program. 34 | void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord); 35 | 36 | void fillSamRecordCommon(SamRecord* samRecord, FastQFile* fastQFileI1, 37 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 38 | bool has_I1_file_list, 39 | std::string const& barcode_seq, std::string const& barcode_quality, 40 | std::string const& umi_seq, std::string const& umi_quality); 41 | 42 | void mainCommon( 43 | std::string white_list_file, int num_writer_threads, std::string output_format, 44 | std::vector I1s, std::vector R1s, std::vector R2s, 45 | std::string sample_id, 46 | std::function sam_record_filler, 47 | std::function barcode_getter, 48 | std::function output_handler); 49 | 50 | #endif // __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ 51 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_metrics.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file fastq_metrics.cpp 3 | * @brief functions for computing metrics 4 | * @author Farzaneh Khajouei and Fred Douglas 5 | * @date 2022-05-25 6 | ***********************************************/ 7 | #include "FastQFile.h" 8 | #include "FastQStatus.h" 9 | #include "fastq_metrics.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using std::string; 16 | 17 | std::vector> parseReadStructure(std::string read_structure) 18 | { 19 | std::vector> ret; 20 | int next_ind = 0; 21 | while (next_ind < read_structure.size()) 22 | 
{ 23 | int type_ind = read_structure.find_first_not_of("0123456789", next_ind); 24 | assert(type_ind != std::string::npos); 25 | char type = read_structure[type_ind]; 26 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 27 | ret.emplace_back(type, len); 28 | next_ind = type_ind + 1; 29 | } 30 | return ret; 31 | } 32 | 33 | int getLengthOfType(string read_structure,char type) 34 | { 35 | int total_length = 0; 36 | for (auto [curr_type, length] : parseReadStructure(read_structure)) 37 | if (curr_type == type) 38 | total_length += length; 39 | return total_length; 40 | } 41 | 42 | void PositionWeightMatrix::recordChunk(string s) 43 | { 44 | for (int index = 0; index < s.size(); index++) 45 | { 46 | switch (s[index]) 47 | { 48 | case 'A': 49 | case 'a': 50 | A[index]++; 51 | break; 52 | case 'C': 53 | case 'c': 54 | C[index]++; 55 | break; 56 | case 'G': 57 | case 'g': 58 | G[index]++; 59 | break; 60 | case 'T': 61 | case 't': 62 | T[index]++; 63 | break; 64 | case 'N': 65 | case 'n': 66 | N[index]++; 67 | break; 68 | default: 69 | std::cerr<<"Unknown character:"<processShard(filenameR1, read_structure, white_list_data); 116 | } 117 | void FastQMetricsShard::processShard(String filenameR1, std::string read_structure, 118 | const WhiteListData* white_list_data) 119 | { 120 | /// setting the shortest sequence allowed to be read 121 | FastQFile fastQFileR1(4, 4); 122 | // open the R1 file 123 | if (fastQFileR1.openFile(filenameR1, BaseAsciiMap::UNKNOWN) != FastQStatus::FASTQ_SUCCESS) 124 | crash("Failed to open R1 file"); 125 | 126 | // Keep reading the file until there are no more fastq sequences to process. 
127 | int n_lines_read = 0; 128 | while (fastQFileR1.keepReadingFile()) 129 | { 130 | if (fastQFileR1.readFastQSequence() != FastQStatus::FASTQ_SUCCESS) 131 | break; 132 | 133 | ingestBarcodeAndUMI(std::string_view(fastQFileR1.myRawSequence.c_str(),fastQFileR1.myRawSequence.Length())); 134 | 135 | n_lines_read++; 136 | if (n_lines_read % 10000000 == 0) 137 | { 138 | printf("%d\n", n_lines_read); 139 | std::string a = std::string(fastQFileR1.myRawSequence.c_str()); 140 | printf("%s\n", fastQFileR1.mySequenceIdLine.c_str()); 141 | } 142 | } 143 | // Finished processing all of the sequences in the file. 144 | // Close the input files. 145 | fastQFileR1.closeFile(); 146 | } 147 | 148 | PositionWeightMatrix& PositionWeightMatrix::operator+=(const PositionWeightMatrix& rhs) 149 | { 150 | for (int i=0; i < A.size(); i++) 151 | { 152 | A[i] += rhs.A[i]; 153 | C[i] += rhs.C[i]; 154 | G[i] += rhs.G[i]; 155 | T[i] += rhs.T[i]; 156 | N[i] += rhs.N[i]; 157 | } 158 | return *this; 159 | } 160 | 161 | FastQMetricsShard& FastQMetricsShard::operator+=(const FastQMetricsShard& rhs) 162 | { 163 | for (auto [key,value] : rhs.barcode_counts_) 164 | barcode_counts_[key] += value; 165 | for (auto [key,value] : rhs.umi_counts_) 166 | umi_counts_[key] += value; 167 | 168 | barcode_+=rhs.barcode_; 169 | umi_+=rhs.umi_; 170 | return *this; 171 | } 172 | 173 | /** @copydoc process_inputs */ 174 | void process_inputs(const INPUT_OPTIONS_FASTQ_READ_STRUCTURE& options, 175 | const WhiteListData* white_list_data) 176 | { 177 | // number of files based on the input size 178 | int num_files = options.R1s.size(); 179 | 180 | // compute UMI and cell_barcode lengths 181 | 182 | int umi_length = getLengthOfType(options.read_structure,'M'); 183 | int CB_length = getLengthOfType(options.read_structure,'C'); 184 | 185 | // create the data for the threads 186 | vector fastqMetrics; 187 | for (int i = 0; i < num_files; i++) 188 | fastqMetrics.emplace_back(options.read_structure); 189 | 190 | // execute the 
fastq readers threads 191 | vector readers; // one metrics-gathering thread per input R1 file 192 | for (unsigned int i = 0; i < options.R1s.size(); i++) 193 | { 194 | readers.emplace_back(processShard, 195 | &fastqMetrics[i], 196 | options.R1s[i].c_str(), 197 | options.read_structure.c_str(), 198 | white_list_data); 199 | 200 | } 201 | 202 | // every reader thread joins. 203 | for (unsigned int i = 0; i < options.R1s.size(); i++) 204 | readers[i].join(); // block until every shard is fully tallied before merging 205 | 206 | std::cout << "Done reading all shards. Will now aggregate and write to file; " 207 | << "this will take a few minutes." << std::endl; 208 | FastQMetricsShard::mergeMetricsShardsToFile(options.sample_id, fastqMetrics, umi_length, CB_length); 209 | } 210 | // Writes one "count<TAB>sequence" line per distinct string, most frequent first. 211 | void writeCountsFile(std::unordered_map counts, std::string filename) 212 | { 213 | std::ofstream out(filename, std::ofstream::out); 214 | std::vector> sorted_counts; 215 | for (auto [str, count] : counts) 216 | sorted_counts.emplace_back(str, count); 217 | std::sort(sorted_counts.begin(), sorted_counts.end(), //sort counts from most to fewest!
218 | [](std::pair const& a, std::pair const& b) 219 | { 220 | return a.second > b.second; 221 | }); 222 | for (auto [str, count] : sorted_counts) 223 | out << count << "\t" << str << "\n"; 224 | } 225 | void PositionWeightMatrix::writeToFile(std::string filename) 226 | { 227 | std::ofstream out(filename, std::ofstream::out); 228 | out << "position\tA\tC\tG\tT\tN\n"; 229 | for (int i = 0; i < A.size(); i++) 230 | out << (i + 1) << "\t" << A[i] << "\t" << C[i] << "\t" << G[i] << "\t" << T[i] << "\t" << N[i] << "\n"; 231 | } 232 | void FastQMetricsShard::mergeMetricsShardsToFile(std::string filename_prefix, vector shards, int umi_length, int CB_length) 233 | { 234 | FastQMetricsShard total(shards[0].read_structure_); 235 | for (FastQMetricsShard const& shard : shards) 236 | total += shard; 237 | 238 | writeCountsFile(total.umi_counts_, filename_prefix + ".numReads_perCell_XM.txt"); 239 | writeCountsFile(total.barcode_counts_, filename_prefix + ".numReads_perCell_XC.txt"); 240 | total.barcode_.writeToFile(filename_prefix + ".barcode_distribution_XC.txt"); 241 | total.umi_.writeToFile(filename_prefix + ".barcode_distribution_XM.txt"); 242 | } 243 | 244 | int main(int argc, char** argv) 245 | { 246 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqMetrics(argc, argv); 247 | std::cout << "reading whitelist file " << options.white_list_file << "..."; 248 | WhiteListData white_list_data = readWhiteList(options.white_list_file); 249 | std::cout << "done" << std::endl; 250 | 251 | process_inputs(options, &white_list_data); 252 | return 0; 253 | } 254 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_metrics.h: -------------------------------------------------------------------------------- 1 | #ifndef __FASTQ_METRICS_H__ 2 | #define __FASTQ_METRICS_H__ 3 | /** 4 | * @file fastq_metrics.h 5 | * @brief functions for computing metrics 6 | * @author Farzaneh Khajouei and Fred Douglas 7 | * @date 
2022-05-25 8 | ***********************************************/ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "BaseAsciiMap.h" 14 | #include "utilities.h" 15 | #include "input_options.h" 16 | #include "FastQFile.h" 17 | #include "FastQStatus.h" 18 | 19 | class PositionWeightMatrix 20 | { 21 | public: 22 | PositionWeightMatrix(int length): A(length), C(length), G(length), T(length), N(length) {} 23 | void recordChunk(std::string s); 24 | PositionWeightMatrix& operator+=(const PositionWeightMatrix& rhs); 25 | void writeToFile(std::string filename); 26 | 27 | std::vector A; 28 | std::vector C; 29 | std::vector G; 30 | std::vector T; 31 | std::vector N; 32 | }; 33 | 34 | class FastQMetricsShard 35 | { 36 | public: 37 | FastQMetricsShard(std::string read_structure); 38 | void ingestBarcodeAndUMI(std::string_view raw_seq); 39 | void processShard(String filenameR1, std::string read_structure, 40 | const WhiteListData* white_list_data); 41 | static void mergeMetricsShardsToFile(std::string filename_prefix, 42 | std::vector shards, 43 | int umi_length, int CB_length); 44 | FastQMetricsShard& operator+=(const FastQMetricsShard& rhs); 45 | 46 | 47 | private: 48 | std::string read_structure_; 49 | int barcode_length_; 50 | int umi_length_; 51 | std::vector> tagged_lengths_; 52 | std::unordered_map barcode_counts_; 53 | std::unordered_map umi_counts_; 54 | PositionWeightMatrix barcode_; 55 | PositionWeightMatrix umi_; 56 | }; 57 | 58 | #endif // __FASTQ_METRICS_H__ 59 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastq_slideseq.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq_common.h" 2 | #include "input_options.h" 3 | 4 | std::vector> parseReadStructure(std::string const& read_structure) 5 | { 6 | std::vector> ret; 7 | int next_ind = 0; 8 | while (next_ind < read_structure.size()) 9 | { 10 | int type_ind = 
read_structure.find_first_not_of("0123456789", next_ind); 11 | assert(type_ind != std::string::npos); 12 | char type = read_structure[type_ind]; 13 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 14 | ret.emplace_back(type, len); 15 | next_ind = type_ind + 1; 16 | } 17 | return ret; 18 | } 19 | 20 | std::vector> g_parsed_read_structure; 21 | 22 | void fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, 23 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 24 | bool has_I1_file_list) 25 | { 26 | // check the sequence names matching 27 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 28 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 29 | // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string 30 | 31 | std::string barcode_seq, barcode_quality, umi_seq, umi_quality; 32 | int cur_ind = 0; 33 | for (auto [tag, length] : g_parsed_read_structure) 34 | { 35 | switch (tag) 36 | { 37 | case 'C': 38 | barcode_seq += a.substr(cur_ind, length); 39 | barcode_quality += b.substr(cur_ind, length); 40 | break; 41 | case 'M': 42 | umi_seq += a.substr(cur_ind, length); 43 | umi_quality += b.substr(cur_ind, length); 44 | break; 45 | default: 46 | break; 47 | } 48 | cur_ind += length; 49 | } 50 | fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 51 | barcode_seq, barcode_quality, umi_seq, umi_quality); 52 | } 53 | 54 | std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, 55 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 56 | bool has_I1_file_list) 57 | { 58 | return std::string(sam->getString("CR").c_str()); 59 | } 60 | 61 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 62 | { 63 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 64 | } 65 | 66 | int main(int argc, char** argv) 67 | { 68 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = 
readOptionsFastqSlideseq(argc, argv); 69 | // number of output bam files, and one writer thread per bam file 70 | int num_writer_threads = get_num_blocks(options); 71 | 72 | g_parsed_read_structure = parseReadStructure(options.read_structure); 73 | 74 | mainCommon(options.white_list_file, num_writer_threads, options.output_format, 75 | options.I1s, options.R1s, options.R2s, options.sample_id, 76 | fillSamRecordWithReadStructure, slideseqBarcodeGetter, outputHandler); 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/fastqprocess.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file fastqprocess.cpp 3 | * @brief functions for file processing 4 | * @author Kishori Konwar 5 | * @date 2020-08-27 6 | ***********************************************/ 7 | 8 | #include "fastq_common.h" 9 | #include "input_options.h" 10 | 11 | unsigned int g_barcode_length; 12 | unsigned int g_umi_length; 13 | 14 | void fillSamRecord(SamRecord* samRecord, FastQFile* fastQFileI1, 15 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 16 | bool has_I1_file_list) 17 | { 18 | // check the sequence names matching 19 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 20 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 21 | 22 | // extract the raw barcode and UMI 23 | std::string barcode_seq = a.substr(0, g_barcode_length); 24 | std::string umi_seq = a.substr(g_barcode_length, g_umi_length); 25 | 26 | // extract raw barcode and UMI quality string 27 | std::string barcode_quality = b.substr(0, g_barcode_length); 28 | std::string umi_quality = b.substr(g_barcode_length, g_umi_length); 29 | 30 | fillSamRecordCommon(samRecord, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 31 | barcode_seq, barcode_quality, umi_seq, umi_quality); 32 | } 33 | 34 | std::string barcodeGetter(SamRecord* samRecord, FastQFile* fastQFileI1, 35 | 
FastQFile* fastQFileR1, FastQFile* fastQFileR2, 36 | bool has_I1_file_list) 37 | { 38 | return std::string(fastQFileR1->myRawSequence.c_str()).substr(0, g_barcode_length); 39 | } 40 | 41 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 42 | { 43 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 44 | } 45 | 46 | int main(int argc, char** argv) 47 | { 48 | InputOptionsFastqProcess options = readOptionsFastqProcess(argc, argv); 49 | // number of output bam files, and one writer thread per bam file 50 | int num_writer_threads = get_num_blocks(options); 51 | 52 | g_barcode_length = options.barcode_length; 53 | g_umi_length = options.umi_length; 54 | 55 | mainCommon(options.white_list_file, num_writer_threads, options.output_format, 56 | options.I1s, options.R1s, options.R2s, options.sample_id, 57 | fillSamRecord, barcodeGetter, outputHandler); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/htslib_tagsort.h: -------------------------------------------------------------------------------- 1 | #ifndef __HTSLIB_TAG_SORT__ 2 | #define __HTSLIB_TAG_SORT__ 3 | 4 | /** 5 | * @file htslib_tagsort.h 6 | * @brief Utility functions for input options processing 7 | * @author Kishori Konwar 8 | * @date 2021-08-11 9 | ***********************************************/ 10 | 11 | #include 12 | #include "input_options.h" 13 | #include "utilities.h" 14 | 15 | 16 | /** 17 | * @brief From the input bam create a list of txt files with the records (lines) 18 | * sorted according to the * tags 19 | * 20 | * @details 21 | * The input bam file is read chunk by chunk, sorted by the tags and the written 22 | * out as a text file in the sorted manner. 
23 | * 24 | * @param options: INPUT_OPTIONS_TAGSORT the inputs to the program 25 | * @return a vector containing the file paths of the partial files 26 | */ 27 | std::vector create_sorted_file_splits_htslib(INPUT_OPTIONS_TAGSORT options); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/input_options.h: -------------------------------------------------------------------------------- 1 | #ifndef __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 2 | #define __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 3 | /** 4 | * @file input_options.h 5 | * @brief Utility functions for input options processing 6 | * @author Kishori Konwar 7 | * @date 2021-08-11 8 | ***********************************************/ 9 | 10 | #include "utilities.h" 11 | 12 | #include 13 | #include 14 | 15 | constexpr unsigned int kMaxTagsortThreads = 30; 16 | constexpr unsigned int kDefaultNumAlignsPerThread = 1000000; 17 | 18 | struct INPUT_OPTIONS_FASTQ_READ_STRUCTURE 19 | { 20 | // I1, R1 and R2 files name 21 | std::vector I1s, R1s, R2s; 22 | 23 | // Bead Barcode list 24 | std::string white_list_file; 25 | 26 | std::string output_format; 27 | 28 | // Bam file size to split by (in GB) 29 | double bam_size = 1.0; 30 | 31 | std::string read_structure; 32 | 33 | std::string sample_id; 34 | }; 35 | 36 | 37 | // Structure to hold input options for fastqprocess 38 | struct InputOptionsFastqProcess 39 | { 40 | // I1, R1 and R2 files name 41 | std::vector I1s, R1s, R2s; 42 | 43 | // Barcode white list file 44 | std::string white_list_file; 45 | 46 | std::string output_format; 47 | 48 | // chemistry dependent (V2/V3) barcode and UMI length 49 | int barcode_length = -1; 50 | int umi_length = -1; 51 | 52 | // Bam file size to split by (in GB) 53 | double bam_size = 1.0; 54 | 55 | std::string sample_id; 56 | }; 57 | 58 | 59 | // Structure to hold input options for tagsort 60 | struct INPUT_OPTIONS_TAGSORT 61 | { 62 | std::string 
metric_type; 63 | bool output_sorted_info = false; 64 | bool compute_metric = false; 65 | // name of the bam file 66 | std::string bam_input; 67 | // name of the gtf file 68 | std::string gtf_file; 69 | // temp folder for disk sorting 70 | std::string temp_folder = "/tmp/"; 71 | 72 | std::string metric_output_file; 73 | // sorted tsv output file 74 | std::string sorted_output_file; 75 | 76 | // Size (in number of alignments) of individual chunks to sort in a batch and 77 | // write to a partial file. Approximately 20 million alignments makes 1 GB bam file. 78 | unsigned int alignments_per_batch = kDefaultNumAlignsPerThread; 79 | unsigned int nthreads = 1; 80 | std::string barcode_tag; 81 | std::string umi_tag; 82 | std::string gene_tag; 83 | 84 | // order of the tags to sort by 85 | std::unordered_map tag_order; 86 | 87 | std::string mitochondrial_gene_names_filename; 88 | }; 89 | 90 | InputOptionsFastqProcess readOptionsFastqProcess(int argc, char** argv); 91 | 92 | INPUT_OPTIONS_TAGSORT readOptionsTagsort(int argc, char** argv); 93 | 94 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqSlideseq(int argc, char** argv); 95 | 96 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqMetrics(int argc, char** argv); 97 | 98 | int64_t get_num_blocks(InputOptionsFastqProcess const& options); 99 | int64_t get_num_blocks(INPUT_OPTIONS_FASTQ_READ_STRUCTURE const& options); 100 | 101 | #endif // __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ 102 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/metricgatherer.h: -------------------------------------------------------------------------------- 1 | #ifndef __METRIC_GATHERER__ 2 | #define __METRIC_GATHERER__ 3 | /** 4 | * @file metricgatherer.h 5 | * @brief functions for file processing 6 | * @author Kishori Konwar 7 | * @date 2021-08-11 8 | ***********************************************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 
// Which kind of metrics row a parsed line contributes to.
enum class MetricType { Cell, Gene };

/**
 * Streaming (online) accumulator of the sufficient statistics (count, sum,
 * sum of squares) needed to report the mean and sample variance of a
 * sequence of values without storing the values themselves.
 *
 * Methods
 * -------
 * update(new_value)        incorporate new_value into the running statistics
 * getMean()                mean of the values seen so far
 * calculate_variance()     sample variance, or -1.0 if fewer than 2 values
 * clear()                  reset to the just-constructed state
 */
class OnlineGaussianSufficientStatistic
{
private:
  // FIX: removed the dead private members `_mean_squared_error` (never read
  // or updated) and `_mean` (a write-only cache set by getMean but never
  // read); external behavior is unchanged.
  double sum_EX2 = 0.0;  // running sum of squared values
  double _sum = 0.0;     // running sum of values
  double _count = 0.0;   // number of values seen

public:
  // Incorporate new_value into the running statistics.
  void update(double new_value)
  {
    _count += 1.0;
    _sum += new_value;
    sum_EX2 += (new_value * new_value);
  }

  // Return the mean of the values seen so far (NaN if none were seen).
  double getMean()
  {
    return _sum / _count;
  }

  // Return the sample (n-1) variance of the values seen so far, or the
  // sentinel -1.0 when fewer than two values make the variance undefined.
  double calculate_variance()
  {
    if (_count < 2)
      return -1.0;
    // Algebraically equal to (sum_EX2 - _sum^2 / _count) / (_count - 1),
    // the standard one-pass sample-variance formula.
    return sum_EX2 / (_count - 1) - (_sum / _count) * (_sum / (_count - 1));
  }

  // Reset to the just-constructed state.
  void clear()
  {
    _count = 0;
    _sum = 0;
    sum_EX2 = 0.0;
  }
};
101 | // in future we can implement this when we have a gene model 102 | // self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR 103 | // self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() 104 | 105 | // alignment uniqueness information 106 | int reads_mapped_uniquely = 0; 107 | int reads_mapped_multiple = 0; 108 | int duplicate_reads = 0; 109 | 110 | // alignment splicing information 111 | int spliced_reads = 0; 112 | int antisense_reads = 0; 113 | int plus_strand_reads = 0; // strand balance 114 | 115 | // higher-order methods, filled in by finalize() when all data is extracted 116 | float molecule_barcode_fraction_bases_above_30_mean = -1; 117 | float molecule_barcode_fraction_bases_above_30_variance = -1; 118 | float genomic_reads_fraction_bases_quality_above_30_mean = -1; 119 | float genomic_reads_fraction_bases_quality_above_30_variance = -1; 120 | float genomic_read_quality_mean = -1; 121 | float genomic_read_quality_variance = -1; 122 | float n_molecules = -1; 123 | float n_fragments = -1; 124 | float reads_per_molecule = -1; 125 | float reads_per_fragment = -1; 126 | float fragments_per_molecule = -1; 127 | int fragments_with_single_read_evidence = -1; 128 | int molecules_with_single_read_evidence = -1; 129 | 130 | // TODO separate these 2 out from the above, all of which gets clear()d 131 | std::string prev_tag; 132 | char* record[20]; 133 | 134 | protected: 135 | std::string common_headers[24] = 136 | { 137 | "n_reads", 138 | "noise_reads", 139 | "perfect_molecule_barcodes", 140 | "reads_mapped_exonic", 141 | "reads_mapped_intronic", 142 | "reads_mapped_utr", 143 | "reads_mapped_uniquely", 144 | "reads_mapped_multiple", 145 | "duplicate_reads", 146 | "spliced_reads", 147 | "antisense_reads", 148 | "molecule_barcode_fraction_bases_above_30_mean", 149 | "molecule_barcode_fraction_bases_above_30_variance", 150 | "genomic_reads_fraction_bases_quality_above_30_mean", 151 | 
"genomic_reads_fraction_bases_quality_above_30_variance", 152 | "genomic_read_quality_mean", 153 | "genomic_read_quality_variance", 154 | "n_molecules", 155 | "n_fragments", 156 | "reads_per_molecule", 157 | "reads_per_fragment", 158 | "fragments_per_molecule", 159 | "fragments_with_single_read_evidence", 160 | "molecules_with_single_read_evidence" 161 | }; 162 | 163 | 164 | public: 165 | virtual ~Metrics() {} 166 | // get the headers 167 | virtual std::string getHeader() = 0; 168 | 169 | void parse_line(std::string& str, std::ofstream& fmetric_out, 170 | std::unordered_set& mitochondrial_genes, 171 | MetricType metric_type); 172 | 173 | void output_metrics(std::ofstream& fmetric_out); 174 | virtual void output_metrics_extra(std::ofstream& fmetric_out) = 0; 175 | virtual void parse_extra_fields(const std::string& first_tag, 176 | const std::string& second_tag, 177 | const std::string& third_tag, 178 | char** record) = 0; 179 | virtual void finalize(std::unordered_set& mitochondrial_genes); 180 | virtual void clear(); 181 | }; 182 | 183 | class CellMetrics: public Metrics 184 | { 185 | private: 186 | int perfect_cell_barcodes; // The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) 187 | int reads_mapped_intergenic; // The number of reads mapped to an intergenic region for this cell 188 | 189 | // reads unmapped 190 | int reads_unmapped; 191 | // The number of reads that were mapped to too many loci across the genome and as a 192 | // consequence, are reported unmapped by the aligner 193 | int reads_mapped_too_many_loci; 194 | 195 | // The variance of the fraction of Illumina base calls for the cell barcode sequence that 196 | // are greater than 30, across molecules 197 | float cell_barcode_fraction_bases_above_30_variance; 198 | 199 | // The average fraction of Illumina base calls for the cell barcode sequence that 200 | // are greater than 30, across molecules 201 | float cell_barcode_fraction_bases_above_30_mean; 202 | 203 | int 
n_genes; //The number of genes detected by this cell 204 | 205 | int genes_detected_multiple_observations; // The number of genes that are observed by more than one read in this cell 206 | int n_mitochondrial_genes; // The number of mitochondrial genes detected by this cell 207 | int n_mitochondrial_molecules; // The number of molecules from mitochondrial genes detected for this cell 208 | int pct_mitochondrial_molecules; // The percentage of molecules from mitoc 209 | 210 | OnlineGaussianSufficientStatistic _cell_barcode_fraction_bases_above_30; 211 | std::unordered_map _genes_histogram; 212 | 213 | std::string cell_specific_headers[11] = 214 | { 215 | "perfect_cell_barcodes", 216 | "reads_mapped_intergenic", 217 | "reads_unmapped", 218 | "reads_mapped_too_many_loci", 219 | "cell_barcode_fraction_bases_above_30_variance", 220 | "cell_barcode_fraction_bases_above_30_mean", 221 | "n_genes", 222 | "genes_detected_multiple_observations", 223 | "n_mitochondrial_genes", 224 | "n_mitochondrial_molecules", 225 | "pct_mitochondrial_molecules" 226 | }; 227 | 228 | public: 229 | std::string getHeader() override; 230 | void output_metrics_extra(std::ofstream& fmetric_out) override; 231 | void parse_extra_fields(const std::string& first_tag, 232 | const std::string& second_tag, 233 | const std::string& third_tag, 234 | char** record) override; 235 | 236 | void finalize(std::unordered_set& mitochondrial_genes); 237 | 238 | void clear(); 239 | }; 240 | 241 | 242 | class GeneMetrics: public Metrics 243 | { 244 | private: 245 | int number_cells_detected_multiple; 246 | int number_cells_expressing; 247 | 248 | std::unordered_map _cells_histogram; 249 | std::string gene_specific_headers[2] = 250 | { 251 | "number_cells_detected_multiple", 252 | "number_cells_expressing" 253 | }; 254 | 255 | public: 256 | GeneMetrics() 257 | { 258 | number_cells_detected_multiple = 0; 259 | number_cells_expressing = 0; 260 | } 261 | 262 | public: 263 | std::string getHeader() override; 264 | void 
output_metrics_extra(std::ofstream& fmetric_out) override; 265 | void parse_extra_fields(std::string const& first_tag, 266 | std::string const& second_tag, 267 | std::string const& third_tag, 268 | char** record) override; 269 | 270 | void finalize(std::unordered_set& mitochondrial_genes); 271 | void clear(); 272 | }; 273 | 274 | #endif 275 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/samplefastq.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq_common.h" 2 | #include "input_options.h" 3 | #include 4 | 5 | std::vector> parseReadStructure(std::string const& read_structure) 6 | { 7 | std::vector> ret; 8 | int next_ind = 0; 9 | while (next_ind < read_structure.size()) 10 | { 11 | int type_ind = read_structure.find_first_not_of("0123456789", next_ind); 12 | assert(type_ind != std::string::npos); 13 | char type = read_structure[type_ind]; 14 | int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); 15 | ret.emplace_back(type, len); 16 | next_ind = type_ind + 1; 17 | } 18 | return ret; 19 | } 20 | 21 | std::vector> g_parsed_read_structure; 22 | 23 | void fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, 24 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 25 | bool has_I1_file_list) 26 | { 27 | // check the sequence names matching 28 | std::string a = std::string(fastQFileR1->myRawSequence.c_str()); 29 | std::string b = std::string(fastQFileR1->myQualityString.c_str()); 30 | // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string 31 | 32 | std::string barcode_seq, barcode_quality, umi_seq, umi_quality; 33 | int cur_ind = 0; 34 | for (auto [tag, length] : g_parsed_read_structure) 35 | { 36 | switch (tag) 37 | { 38 | case 'C': 39 | barcode_seq += a.substr(cur_ind, length); 40 | barcode_quality += b.substr(cur_ind, length); 41 | break; 42 | case 'M': 43 | umi_seq += a.substr(cur_ind, 
length); 44 | umi_quality += b.substr(cur_ind, length); 45 | break; 46 | default: 47 | break; 48 | } 49 | cur_ind += length; 50 | } 51 | fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, 52 | barcode_seq, barcode_quality, umi_seq, umi_quality); 53 | } 54 | 55 | std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, 56 | FastQFile* fastQFileR1, FastQFile* fastQFileR2, 57 | bool has_I1_file_list) 58 | { 59 | return std::string(sam->getString("CR").c_str()); 60 | } 61 | 62 | void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) 63 | { 64 | cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); 65 | } 66 | 67 | 68 | int main(int argc, char** argv) 69 | { 70 | INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqSlideseq(argc, argv); 71 | // number of output bam files, and one writer thread per bam file 72 | int num_writer_threads = get_num_blocks(options); 73 | 74 | std::ofstream outfile_r1("sampled_down.R1"); 75 | if (!outfile_r1) 76 | crash("Failed to open output file sampled_down.R1"); 77 | std::ofstream outfile_r2("sampled_down.R2"); 78 | if (!outfile_r2) 79 | crash("Failed to open output file sampled_down.R2"); 80 | 81 | g_parsed_read_structure = parseReadStructure(options.read_structure); 82 | mainCommon(options.white_list_file, /*num_writer_threads=*/1, options.output_format, 83 | options.I1s, options.R1s, options.R2s, options.sample_id, 84 | fillSamRecordWithReadStructure, slideseqBarcodeGetter, 85 | [&outfile_r1, &outfile_r2](WriteQueue* ignored1, SamRecord* sam, int reader_thread_index) 86 | { 87 | if (sam->getStringTag("CB")) 88 | { 89 | // Assumed read structure of 8C18X6C9M1X with a fixed spacer sequence 90 | const char* barcode = sam->getString("CR").c_str(); 91 | const char* quality_score = sam->getString("CY").c_str(); 92 | outfile_r1 << "@" << sam->getReadName() << "\n" 93 | << std::string_view(barcode, 8) << "CTTCAGCGTTCCCGAGAG" << 
std::string_view(barcode+8, 6) << sam->getString("UR") <<"T\n" 94 | << "+\n" 95 | << std::string_view(quality_score, 8)<<"FFFFFFFFFFFFFFFFFF" << std::string_view(quality_score+8, 6) << sam->getString("UY") <<"F"<< "\n"; 96 | 97 | outfile_r2 << "@" << sam->getReadName() << "\n" 98 | << sam->getSequence() << "\n" 99 | << "+\n" 100 | << sam->getQuality() << "\n"; 101 | } 102 | releaseReaderThreadMemory(reader_thread_index,sam); 103 | }); 104 | return 0; 105 | } 106 | -------------------------------------------------------------------------------- /fastqpreprocessing/src/utilities.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file utilities.cpp 3 | * @brief Utility functions for file processing 4 | * @author Kishori Konwar 5 | * @date 2021-08-11 6 | ***********************************************/ 7 | 8 | #include "utilities.h" 9 | 10 | #include 11 | #include 12 | 13 | /** @copydoc readWhiteList */ 14 | WhiteListData readWhiteList(std::string const& white_list_file) 15 | { 16 | const char ATCG[] = {'A', 'C', 'G', 'T', 'N'}; 17 | 18 | std::ifstream file(white_list_file); 19 | if (!file.is_open()) 20 | crash("Couldn't open whitelist file " + white_list_file); 21 | 22 | WhiteListData white_list_data; 23 | int k = 0; 24 | // read data from file object and put it into string. 25 | for (std::string tp; getline(file, tp); ) 26 | { 27 | white_list_data.barcodes.push_back(tp); 28 | 29 | for (unsigned int i=0; i < tp.size(); i++) 30 | { 31 | for (int j=0; j < 5; j++) 32 | { 33 | char c = tp[i]; 34 | tp[i] = ATCG[j]; 35 | // If the mutation we're writing is already present, we just overwrite 36 | // what was there with the current. 37 | // This is done to have the same values for corrected barcodes 38 | // as in the python implementation. 
/** @copydoc crashWithPerror */
void crashWithPerror(std::string msg)
{
  // perror() appends the strerror() text for the current errno to msg.
  perror(msg.c_str());
  exit(1);
}

// Report a fatal error on both stdout and stderr, then terminate.
void crash(std::string msg)
{
  std::cout << msg << std::endl;
  std::cerr << msg << std::endl;
  exit(1);
}

// Structure for correcting the barcodes.
struct WhiteListData
{
  // Maps each whitelisted barcode and every 1-mutation of it to the index
  // of the correct barcode; an exact whitelist hit is stored as -1, which
  // signals that no correction is needed.
  // NOTE(review): template arguments were stripped in this dump; the mapped
  // type is assumed to be an integer index — confirm against readWhiteList().
  std::unordered_map<std::string, int> mutations;
  // Vector of whitelist barcodes, in file order.
  std::vector<std::string> barcodes;
};
"""Check that the cell barcodes (CB tags) across a set of BAM files are disjoint.

Each BAM's set of observed CB values is collected; the files form a partition
iff no barcode occurs in more than one file.
"""
import argparse

import pysam

parser = argparse.ArgumentParser()
parser.add_argument("--bam", nargs="+", dest="bams", help="BAM files")


def check_disjoint_cbs():
    """Print whether the --bam files partition the observed CB barcodes.

    Prints "is a partition" (plus the total alignment count) when every CB
    value appears in at most one file; prints "not a partition" and stops at
    the first barcode found in two files otherwise.

    Fixes over the original: removed the redundant ``global parser``
    statement (module-level names are readable without it), stored barcodes
    in sets instead of dicts mapping to True, and replaced the per-barcode
    membership loop with set intersection. Printed output is unchanged.
    """
    opts = parser.parse_args()
    barcodes = {}  # bam path -> set of CB values seen in that file
    tot_alignments = 0

    for bam in opts.bams:
        print("reading " + bam)
        seen = set()
        with pysam.AlignmentFile(bam, "rb", check_sq=False) as input_alignments:
            for alignment in input_alignments:
                tot_alignments += 1
                if alignment.has_tag("CB"):
                    seen.add(alignment.get_tag("CB"))
        barcodes[bam] = seen

    for bam in opts.bams:
        print("checking " + bam)
        for other in set(opts.bams) - {bam}:
            if barcodes[bam] & barcodes[other]:
                print("not a partition")
                return

    print("total alignments : ", tot_alignments)
    print("is a partition")
    return


if __name__ == "__main__":
    check_disjoint_cbs()
-------------------------------------------------------------------------------- /pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Purpose 2 | 3 | 4 | - No issue is linked to this PR. 5 | 6 | ### Changes 7 | 8 | 9 | - No changes. 10 | 11 | ### Review Instructions 12 | 13 | 14 | - No instructions. 15 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | 3 | build: 4 | image: latest 5 | 6 | python: 7 | version: 3.6 8 | use_system_site_packages: false # Set to true will let the virtualenv use the pre-installed packages such as numpy, which is not what we want 9 | setup_py_install: false 10 | pip_install: true 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | crimson==0.5.2 2 | pandas==0.25.3 3 | pysam==0.16.0.1 4 | pytest-cov==2.10.1 5 | pytest==5.1.1 6 | scipy==1.5.2 7 | black==19.3b0 8 | flake8==3.7.7 9 | gffutils==0.9 10 | numpy==1.19.1 11 | requests==2.20.0 12 | setuptools==40.4.3 13 | setuptools_scm==3.1.0 14 | h5py==2.10.0 15 | tables==3.4.4 -------------------------------------------------------------------------------- /security.txt: -------------------------------------------------------------------------------- 1 | If you'd like to report a security issue please contact us. 
"""Packaging configuration for sctools."""
from setuptools import setup

# PyPI trove classifiers describing the supported environments.
CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Natural Language :: English",
    "License :: OSI Approved :: BSD License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.6",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

setup(
    name="sctools",
    use_scm_version=True,
    setup_requires=["setuptools_scm"],
    # BUG FIX: the original built this as "...single cell" + "data processing",
    # which rendered as "single celldata processing" (missing space).
    description="Utilities for large-scale distributed single cell data processing",
    url="https://github.com/humancellatlas/sctools.git",
    author="Ambrose J. Carr",
    author_email="mail@ambrosejcarr.com",
    package_dir={"": "src"},
    packages=["sctools", "sctools/test", "sctools/metrics"],
    install_requires=[
        "gffutils",
        "numpy",
        "pandas",
        "pysam",
        "pytest",
        "pytest-cov",
        "sphinx",
        "sphinxcontrib-websupport",
        "sphinx_rtd_theme",
        "setuptools_scm>=3.1.0",
        "setuptools>=40.4.3",
        "scipy>=1.0.0",
        "crimson>=0.3.0",
    ],
    # Console-script entry points; values are identical to the original's
    # string concatenations, collapsed into single literals.
    entry_points={
        "console_scripts": [
            "AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes",
            "Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes",
            "SplitBam = sctools.platform:GenericPlatform.split_bam",
            "CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics",
            "CalculateCellMetrics = sctools.platform:GenericPlatform.calculate_cell_metrics",
            "MergeGeneMetrics = sctools.platform:GenericPlatform.merge_gene_metrics",
            "MergeCellMetrics = sctools.platform:GenericPlatform.merge_cell_metrics",
            "CreateCountMatrix = sctools.platform:GenericPlatform.bam_to_count_matrix",
            "MergeCountMatrices = sctools.platform:GenericPlatform.merge_count_matrices",
            "TagSortBam = sctools.platform:GenericPlatform.tag_sort_bam",
            "VerifyBamSort = sctools.platform:GenericPlatform.verify_bam_sort",
            "GroupQCs = sctools.platform:GenericPlatform.group_qc_outputs",
        ]
    },
    classifiers=CLASSIFIERS,
    include_package_data=True,
)
9 | """ 10 | 11 | # BAM tag constants 12 | 13 | RAW_SAMPLE_BARCODE_TAG_KEY = "SR" 14 | QUALITY_SAMPLE_BARCODE_TAG_KEY = "SY" 15 | 16 | MOLECULE_BARCODE_TAG_KEY = "UB" 17 | RAW_MOLECULE_BARCODE_TAG_KEY = "UR" 18 | QUALITY_MOLECULE_BARCODE_TAG_KEY = "UY" 19 | 20 | CELL_BARCODE_TAG_KEY = "CB" 21 | RAW_CELL_BARCODE_TAG_KEY = "CR" 22 | QUALITY_CELL_BARCODE_TAG_KEY = "CY" 23 | 24 | GENE_NAME_TAG_KEY = "GE" 25 | NUMBER_OF_HITS_TAG_KEY = "NH" 26 | 27 | ALIGNMENT_LOCATION_TAG_KEY = "XF" 28 | INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTRONIC" 29 | CODING_ALIGNMENT_LOCATION_TAG_VALUE = "CODING" 30 | UTR_ALIGNMENT_LOCATION_TAG_VALUE = "UTR" 31 | INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTERGENIC" 32 | 33 | # bam.py constants 34 | 35 | MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 36 | MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 37 | 38 | 39 | # modes of the count matrix runs 40 | SINGLE_CELL_COUNT_MATRIX = 0 41 | SINGLE_NUCLEI_COUNT_MATRIX = 1 42 | -------------------------------------------------------------------------------- /src/sctools/encodings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compressed Barcode Encoding Methods 3 | =================================== 4 | 5 | .. currentmodule:: sctools 6 | 7 | This module defines several classes to encode DNA sequences in memory-efficient forms, using 2 bits 8 | to encode bases of a 4-letter DNA alphabet (ACGT) or 3 bits to encode a 5-letter DNA alphabet 9 | that includes the ambiguous call often included by Illumina base calling software (ACGTN). The 10 | classes also contain several methods useful for efficient querying and manipulation of the encoded 11 | sequence. 
12 | 13 | Classes 14 | ------- 15 | Encoding Encoder base class 16 | ThreeBit Three bit DNA encoder / decoder 17 | TwoBit Two bit DNA encoder / decoder 18 | 19 | """ 20 | 21 | import random 22 | from typing import Mapping, AnyStr, Set 23 | 24 | 25 | class Encoding: 26 | """ 27 | 28 | Attributes 29 | ---------- 30 | encoding_map : TwoBitEncodingMap 31 | Class that mimics a Mapping[bytes, str] where bytes must be a single byte encoded character 32 | (encoder) 33 | decoding_map : Mapping[int, bytes] 34 | Dictionary that maps integers to bytes human-readable representations (decoder) 35 | bits_per_base : int 36 | number of bits used to encode each base 37 | 38 | Methods 39 | ------- 40 | encode(bytes_encoded: bytes) 41 | encode a DNA string in a compressed representation 42 | decode(integer_encoded: int) 43 | decode a compressed DNA string into a human readable bytes format 44 | gc_content(integer_encoded: int) 45 | calculate the GC content of an encoded DNA string 46 | hamming_distance(a: int, b: int) 47 | calculate the hamming distance between two encoded DNA strings 48 | 49 | """ 50 | 51 | encoding_map: Mapping[AnyStr, int] = NotImplemented 52 | decoding_map: Mapping[int, AnyStr] = NotImplemented 53 | bits_per_base: int = NotImplemented 54 | 55 | @classmethod 56 | def encode(cls, bytes_encoded: bytes) -> int: 57 | """Encode a DNA bytes string. 58 | 59 | Parameters 60 | ---------- 61 | bytes_encoded : bytes 62 | bytes DNA string 63 | 64 | Returns 65 | ------- 66 | encoded : int 67 | Encoded DNA sequence 68 | 69 | """ 70 | raise NotImplementedError 71 | 72 | def decode(self, integer_encoded: int) -> bytes: 73 | """Decode a DNA bytes string. 
74 | 75 | Parameters 76 | ---------- 77 | integer_encoded : bytes 78 | Integer encoded DNA string 79 | 80 | Returns 81 | ------- 82 | decoded : bytes 83 | Bytes decoded DNA sequence 84 | 85 | """ 86 | raise NotImplementedError 87 | 88 | def gc_content(self, integer_encoded: int) -> int: 89 | """Return the number of G or C nucleotides in `integer_encoded` 90 | 91 | Parameters 92 | ---------- 93 | integer_encoded : int 94 | Integer encoded DNA string 95 | 96 | Returns 97 | ------- 98 | gc_content, int 99 | number of bases in `integer_encoded` input that are G or C. 100 | 101 | """ 102 | raise NotImplementedError 103 | 104 | @staticmethod 105 | def hamming_distance(a, b) -> int: 106 | """Calculate the hamming distance between two DNA sequences 107 | 108 | The hamming distance counts the number of bases that are not the same nucleotide 109 | 110 | Parameters 111 | ---------- 112 | a, b : int 113 | integer encoded 114 | 115 | 116 | Returns 117 | ------- 118 | d : int 119 | hamming distance between a and b 120 | """ 121 | raise NotImplementedError 122 | 123 | 124 | class TwoBit(Encoding): 125 | """Encode a DNA sequence using a 2-bit encoding. 126 | 127 | Two-bit encoding uses 0 for an encoded nucleotide. As such, it cannot distinguish between 128 | the end of sequence and trailing A nucleotides, and thus decoding these strings requires 129 | knowledge of their length. Therefore, it is only appropriate for encoding fixed sequence 130 | lengths 131 | 132 | In addition, in order to encode in 2-bit, N-nucleotides must be randomized to one of A, C, 133 | G, and T. 
134 | 135 | Parameters 136 | ---------- 137 | sequence_length : int 138 | number of nucleotides that are being encoded 139 | 140 | """ 141 | 142 | __doc__ += Encoding.__doc__ 143 | 144 | def __init__(self, sequence_length: int): 145 | self.sequence_length: int = sequence_length 146 | 147 | class TwoBitEncodingMap: 148 | """Dict-like class that maps bytes to 2-bit integer representations 149 | 150 | Generates random nucleotides for ambiguous nucleotides e.g. N 151 | 152 | """ 153 | 154 | map_ = { 155 | ord("A"): 0, 156 | ord("C"): 1, 157 | ord("T"): 2, 158 | ord("G"): 3, 159 | ord("a"): 0, 160 | ord("c"): 1, 161 | ord("t"): 2, 162 | ord("g"): 3, 163 | } 164 | 165 | iupac_ambiguous: Set[int] = {ord(c) for c in "MRWSYKVHDBNmrwsykvhdbn"} 166 | 167 | def __getitem__(self, byte: int) -> int: 168 | try: 169 | return self.map_[byte] 170 | except KeyError: 171 | if byte not in self.iupac_ambiguous: 172 | raise KeyError(f"{chr(byte)} is not a valid IUPAC nucleotide code") 173 | return random.randint(0, 3) 174 | 175 | encoding_map: TwoBitEncodingMap = TwoBitEncodingMap() 176 | decoding_map: Mapping[int, bytes] = {0: b"A", 1: b"C", 2: b"T", 3: b"G"} 177 | bits_per_base: int = 2 178 | 179 | @classmethod 180 | def encode(cls, bytes_encoded: bytes) -> int: 181 | encoded = 0 182 | for character in bytes_encoded: 183 | encoded <<= 2 184 | encoded += cls.encoding_map[character] 185 | return encoded 186 | 187 | def decode(self, integer_encoded: int) -> bytes: 188 | decoded = b"" 189 | for _ in range(self.sequence_length): 190 | decoded = self.decoding_map[integer_encoded & 3] + decoded 191 | integer_encoded >>= 2 192 | return decoded 193 | 194 | def gc_content(self, integer_encoded: int) -> int: 195 | i = 0 196 | for _ in range(self.sequence_length): 197 | i += integer_encoded & 1 198 | integer_encoded >>= 2 199 | return i 200 | 201 | @staticmethod 202 | def hamming_distance(a: int, b: int) -> int: 203 | difference = a ^ b 204 | d_hamming = 0 205 | while difference: 206 | if 
difference & 3: 207 | d_hamming += 1 208 | difference >>= 2 209 | return d_hamming 210 | 211 | 212 | class ThreeBit(Encoding): 213 | """Encode a DNA sequence using a 3-bit encoding. 214 | 215 | Since no bases are encoded as 0, an empty triplet is interpreted as the end of the encoded 216 | string; Three-bit encoding can be used to encode and decode strings without knowledge of their 217 | length. 218 | 219 | """ 220 | 221 | __doc__ += Encoding.__doc__ 222 | 223 | def __init__(self, *args, **kwargs): 224 | """ 225 | Notes 226 | ----- 227 | args and kwargs are not used, but allow ThreeBit to be initialized the same way as TwoBit, 228 | despite not requiring a sequence length parameter. 229 | 230 | """ 231 | pass 232 | 233 | class ThreeBitEncodingMap: 234 | """Dict-like class that maps bytes to 3-bit integer representations 235 | 236 | All IUPAC ambiguous codes are treated as "N" 237 | 238 | """ 239 | 240 | # C: 1, A: 2, G: 3, T: 4, N: 6; # note, not using 0 241 | map_ = { 242 | ord("C"): 1, 243 | ord("A"): 2, 244 | ord("G"): 3, 245 | ord("T"): 4, 246 | ord("N"): 6, 247 | ord("c"): 1, 248 | ord("a"): 2, 249 | ord("g"): 3, 250 | ord("t"): 4, 251 | ord("n"): 6, 252 | } 253 | 254 | def __getitem__(self, byte: int) -> int: 255 | try: 256 | return self.map_[byte] 257 | except KeyError: 258 | return 6 # any non-standard nucleotide gets "N" 259 | 260 | encoding_map: ThreeBitEncodingMap = ThreeBitEncodingMap() 261 | decoding_map: Mapping[int, bytes] = {1: b"C", 2: b"A", 3: b"G", 4: b"T", 6: b"N"} 262 | bits_per_base: int = 3 263 | 264 | @classmethod 265 | def encode(cls, bytes_encoded: bytes) -> int: 266 | encoded = 0 267 | for character in bytes_encoded: 268 | encoded <<= 3 269 | encoded += cls.encoding_map[character] 270 | return encoded 271 | 272 | @classmethod 273 | def decode(cls, integer_encoded: int) -> bytes: 274 | decoded = b"" 275 | while integer_encoded: 276 | decoded = cls.decoding_map[integer_encoded & 7] + decoded 277 | integer_encoded >>= 3 278 | return decoded 
"""
Group QC outputs

"""

from crimson import picard
import os
import pandas as pd


def write_aggregated_picard_metrics_by_row(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write Picard row metrics.

    Parameters
    ----------
    file_names:
        array of files. The basename of inputs should be formatted
        as 'samplename_qc', such as
        "samplename_qc.alignment_summary_metrics.txt" and "samplename_qc.insert_size_metrics.txt"
    output_name:
        prefix of output file name without extension; writes '<output_name>.csv'.
    """
    # metrics: sample id -> {metric name: value}
    # d accumulates one DataFrame per parsed input file
    metrics = {}
    d = pd.DataFrame()
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split("_qc")[0]
        # NOTE(review): this resets anything recorded for cell_id by an earlier
        # file in the same call before re-populating it below — confirm this is
        # intended when several metric files share one sample name
        metrics[cell_id] = {}
        parsed = picard.parse(file_name)
        # e.g. "picard.analysis.AlignmentSummaryMetrics" -> "AlignmentSummaryMetrics"
        class_name = parsed["metrics"]["class"].split(".")[2]
        # Alignment metrics return multiple lines,
        # but only output PAIRED-READS/third line
        contents = parsed["metrics"]["contents"]
        if class_name == "AlignmentSummaryMetrics":
            # parse out PE, R1 and R2. If the reads are unpaired, the contents
            # will be a single dict rather than a list of dicts.
            if isinstance(contents, dict):
                contents = [contents]
            rows = {}
            for m in contents:
                cat = m["CATEGORY"]
                # key every metric by its read category, e.g. "PF_READS.PAIR"
                rows.update(
                    {
                        k + "." + cat: v
                        for k, v in m.items()
                        if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"]
                    }
                )
        # sometimes(very rare), insertion metrics also return multiple lines
        # results to include TANDEM repeats. but we only output the first line.
        elif class_name == "InsertSizeMetrics":
            # if the element counts is less than 21,
            # it means insertion metrics returns multiple line results.
            # NOTE(review): len(contents) < 21 is used to tell a list of row
            # dicts (few elements) apart from a single dict of ~21+ metric keys
            # — confirm against crimson's output shape
            if len(contents) < 21:
                rows = contents[0]
            else:
                rows = contents
        else:
            # other metrics(so far) only return one line results.
            rows = contents
        # drop identifying columns; keep only numeric/metric fields
        metrics[cell_id].update(
            {
                k: rows[k]
                for k in rows
                if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"]
            }
        )
        df = pd.DataFrame.from_dict(metrics, orient="columns")
        df.insert(0, "Class", class_name)
        d = pd.concat([d, df])
    # transpose so that rows are samples and columns are metric names
    d_T = d.T
    d_T.to_csv(output_name + ".csv")
def write_aggregated_picard_metrics_by_table(file_names, output_name):
    """Command line entrypoint to parse and write Picard table metrics.

    Parameters
    ----------
    file_names:
        array of files. The basename of inputs should be formatted as 'samplename_qc'.
    output_name:
        prefix of output file name; the basename of each output also
        includes the Picard metrics class name.
    """
    for file_name in file_names:
        # "sample_qc.error_summary_metrics.txt" -> sample id and metric class
        cell_id = os.path.basename(file_name).split("_qc")[0]
        class_name = os.path.basename(file_name).split(".")[1]
        parsed = picard.parse(file_name)
        dat = pd.DataFrame.from_dict(parsed["metrics"]["contents"])
        dat.insert(0, "Sample", cell_id)
        dat.to_csv(output_name + "_" + class_name + ".csv", index=False)


def write_aggregated_qc_metrics(file_names, output_name):
    """Command line entrypoint to merge Picard metrics along with RSEM and HISAT2 logs.

    Parameters
    ----------
    file_names:
        array of csv files to merge column-wise, such as Picard row metrics
        and hisat2 metrics, each indexed by sample id.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    df = pd.DataFrame()
    for file_name in file_names:
        dat = pd.read_csv(file_name, index_col=0)
        # outer join keeps samples that appear in only some of the inputs
        # (debug print statements removed)
        df = pd.concat([df, dat], axis=1, join="outer")
    df.to_csv(output_name + ".csv", index=True)
def parse_hisat2_log(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write HISAT2 logs.

    Parameters
    ----------
    file_names:
        array of HISAT2 log files. The basename indicates the alignment
        reference: 'samplename_qc.log' is the genome reference and
        'samplename_rsem.log' is the transcriptome reference alignment.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    metrics = {}
    tag = "NONE"
    for file_name in file_names:
        if "_qc" in file_name:
            cell_id = os.path.basename(file_name).split("_qc")[0]
            tag = "HISAT2G"
        elif "_rsem" in file_name:
            cell_id = os.path.basename(file_name).split("_rsem")[0]
            tag = "HISAT2T"
        else:
            # previously an unrecognized file name reused the previous file's
            # cell_id (overwriting its metrics) or raised NameError on the
            # first file; skip such files instead
            continue
        with open(file_name) as f:
            dat = f.readlines()
            # each "key: value ..." line becomes one metric; keep only the
            # first whitespace-delimited token of the value
            d = [x.strip().split(":") for x in dat]
            # remove the first row of each section.
            d.pop(0)
            metrics[cell_id] = {x[0]: x[1].strip().split(" ")[0] for x in d}
    df = pd.DataFrame.from_dict(metrics, orient="columns")
    df.insert(0, "Class", tag)
    df_T = df.T
    df_T.to_csv(output_name + ".csv")


def parse_rsem_cnt(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write RSEM cnt files.

    Parameters
    ----------
    file_names:
        array of RSEM cnt files. The basename of inputs should be
        'samplename_rsem.cnt'.
    output_name:
        prefix of output file name; writes '<output_name>.csv'.
    """
    metrics = {}
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split("_rsem")[0]
        with open(file_name) as f:
            # only the first three space-delimited lines of a .cnt file are used
            N0, N1, N2, N_tot = f.readline().strip().split(" ")
            n_unique, n_multi, n_uncertain = f.readline().strip().split(" ")
            n_hits, read_type = f.readline().strip().split(" ")
            metrics[cell_id] = {
                "unalignable reads": N0,
                "alignable reads": N1,
                "filtered reads": N2,
                "total reads": N_tot,
                "unique aligned": n_unique,
                "multiple mapped": n_multi,
                "total alignments": n_hits,
                "strand": read_type,
                "uncertain reads": n_uncertain,
            }
    df = pd.DataFrame.from_dict(metrics, orient="columns")
    df.insert(0, "Class", "RSEM")
    df_T = df.T
    df_T.to_csv(output_name + ".csv")
Sort each chunk by cell, gene, and molecule tags to ensure that all the reads associated with
   a molecule are stored sequentially by cell (`CalculateCellMetrics`) or by gene
   (`CalculateGeneMetrics`)
3. For each cell or gene, parse the information by molecule, which typically loads fewer than
   10,000 records into memory at a time.
4. Merge data across chunks using `MergeCellMetrics` or `MergeGeneMetrics`.

This map-reduce approach is currently implemented by the
[HCA 3' pipeline](https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/Optimus.wdl),
but an abbreviated WDL could be made in the future which would contain:

```
1. SplitBamByCellBarcode
2. scatter[CalculateMetrics]
3. MergeMetrics
```

## Implementation Details:

This module implements 4 base classes that carry out metric processing. These are:

```
MetricAggregator:
- CellMetricAggregator
- GeneMetricAggregator

MetricGatherer:
- CellMetricGatherer
- GeneMetricGatherer

MetricCSVWriter

MergeMetrics:
- MergeCellMetrics
- MergeGeneMetrics
```
MetricGatherer defines generator functions to group records into molecules, the bam parsing pattern
necessary to process data iteratively.

MetricAggregator stores the information for a unit of the relevant data (cell, gene),
and processes all the records with the `.parse_records()` method.

When all records of a single unit (cell, gene) have been processed, `.finalize()` is called to
calculate any higher-order metrics (for example, the variance in quality scores across reads of the
cell or gene), and it is written to file by `MetricCSVWriter`.

MergeMetrics merges multiple metric outputs from the scattered chunks. This is a trivial
concatenation in the case of cell metrics, and a more complex merge in the case of gene metrics.
class MetricGatherer:
    """Gathers Metrics from an experiment

    Because molecules tend to have relatively small numbers of reads, the memory footprint of
    this method is typically small (tens of megabytes).

    Parameters
    ----------
    bam_file : str
        the bam file containing the reads that metrics should be calculated from. Can be a chunk
        of cells or an entire experiment
    output_stem : str
        the file stem for the gzipped csv output
    mitochondrial_gene_ids : Set[str], optional
        gene ids counted as mitochondrial during finalization (default = empty set)
    compress : bool, optional
        whether the csv output is gzip-compressed (default = True)

    Methods
    -------
    extract_metrics
        extracts metrics from ``bam_file`` and writes them to output_stem.csv.gz

    """

    def __init__(
        self,
        bam_file: str,
        output_stem: str,
        # frozenset() instead of set(): an immutable default cannot suffer the
        # shared-mutable-default-argument pitfall
        mitochondrial_gene_ids: Set[str] = frozenset(),
        compress: bool = True,
    ):
        self._bam_file = bam_file
        self._output_stem = output_stem
        self._compress = compress
        # copy into a plain set so a caller-supplied collection is not aliased
        self._mitochondrial_gene_ids = set(mitochondrial_gene_ids)

    @property
    def bam_file(self) -> str:
        """the bam file that metrics are generated from"""
        return self._bam_file

    def extract_metrics(self, mode="rb") -> None:
        """extract metrics from the provided bam file and write the results to csv.

        Parameters
        ----------
        mode : {'r', 'rb'}, default 'rb'
            the open mode for pysam.AlignmentFile. 'r' indicates the input is a sam file, and 'rb'
            indicates a bam file.

        """
        # subclasses implement the actual gathering strategy
        raise NotImplementedError
class GatherCellMetrics(MetricGatherer):

    # NOTE(review): this class defines no docstring of its own, so the
    # `__doc__ += extra_docs` below resolves __doc__ from the module globals;
    # the class docstring becomes module docstring + extra_docs — confirm intended.
    # NOTE(review): the doctest below builds the path by concatenating '../'
    # onto os.path.abspath(__file__) without dirname — likely not a usable path.
    extra_docs = """
    Notes
    -----
    ``bam_file`` must be sorted by gene (``GE``), molecule (``UB``), and cell (``CB``), where gene
    varies fastest.

    Examples
    --------
    >>> from sctools.metrics.gatherer import GatherCellMetrics
    >>> import os, tempfile

    >>> # example data
    >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
    >>> temp_dir = tempfile.mkdtemp()
    >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
    >>> g.extract_metrics()

    See Also
    --------
    GatherGeneMetrics

    """

    __doc__ += extra_docs

    def extract_metrics(self, mode: str = "rb") -> None:
        """Extract cell metrics from self.bam_file

        Parameters
        ----------
        mode : str, optional
            Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').

        """
        # open the files; closing() guarantees the csv writer is closed even if
        # parsing raises part-way through
        with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
            MetricCSVWriter(self._output_stem, self._compress)
        ) as cell_metrics_output:

            # write the header
            cell_metrics_output.write_header(vars(CellMetrics()))

            # break up the bam file into sub-iterators over cell barcodes;
            # one aggregator accumulates all molecules of one cell
            for cell_iterator, cell_tag in iter_cell_barcodes(
                bam_iterator=bam_iterator
            ):
                metric_aggregator = CellMetrics()

                # break up cell barcodes by molecule barcodes
                for molecule_iterator, molecule_tag in iter_molecule_barcodes(
                    bam_iterator=cell_iterator
                ):

                    # break up molecule barcodes by gene ids
                    for gene_iterator, gene_tag in iter_genes(
                        bam_iterator=molecule_iterator
                    ):

                        # process the data
                        metric_aggregator.parse_molecule(
                            tags=(cell_tag, molecule_tag, gene_tag),
                            records=gene_iterator,
                        )

                # write a record for each cell
                metric_aggregator.finalize(
                    mitochondrial_genes=self._mitochondrial_gene_ids
                )
                cell_metrics_output.write(cell_tag, vars(metric_aggregator))
class GatherGeneMetrics(MetricGatherer):

    # NOTE(review): the Examples block below was copy-pasted from
    # GatherCellMetrics (it imports and constructs GatherCellMetrics), and the
    # See Also section references this class itself instead of
    # GatherCellMetrics — both look like copy-paste slips; confirm and update.
    extra_docs = """
    Notes
    -----
    ``bam_file`` must be sorted by molecule (``UB``), cell (``CB``), and gene (``GE``), where
    molecule varies fastest.

    Examples
    --------
    >>> from sctools.metrics.gatherer import GatherCellMetrics
    >>> import os, tempfile

    >>> # example data
    >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam'
    >>> temp_dir = tempfile.mkdtemp()
    >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True)
    >>> g.extract_metrics()

    See Also
    --------
    GatherGeneMetrics

    """

    __doc__ += extra_docs

    def extract_metrics(self, mode: str = "rb") -> None:
        """Extract gene metrics from self.bam_file

        Parameters
        ----------
        mode : str, optional
            Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb').

        """
        # open the files; closing() guarantees the csv writer is closed even if
        # parsing raises part-way through
        with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing(
            MetricCSVWriter(self._output_stem, self._compress)
        ) as gene_metrics_output:

            # write the header
            gene_metrics_output.write_header(vars(GeneMetrics()))

            # break up the bam file into sub-iterators over gene ids;
            # one aggregator accumulates all reads of one gene
            for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
                metric_aggregator = GeneMetrics()

                # in case of multi-genes ignore as in the counting stage
                # (a comma-separated gene tag means the read maps to several genes)
                if gene_tag and len(gene_tag.split(",")) > 1:
                    continue

                # break up gene ids by cell barcodes
                for cell_iterator, cell_tag in iter_cell_barcodes(
                    bam_iterator=gene_iterator
                ):

                    # break up cell barcodes by molecular barcodes
                    for molecule_iterator, molecule_tag in iter_molecule_barcodes(
                        bam_iterator=cell_iterator
                    ):

                        # process the data
                        metric_aggregator.parse_molecule(
                            tags=(gene_tag, cell_tag, molecule_tag),
                            records=molecule_iterator,
                        )

                # write a record for each gene id
                metric_aggregator.finalize()
                gene_metrics_output.write(gene_tag, vars(metric_aggregator))
class MergeMetrics:
    """Merge multiple metrics files into one gzip-compressed csv file.

    Parameters
    ----------
    metric_files : Sequence[str]
        metrics files to merge
    output_file : str
        file name for the merged output; a ``.csv.gz`` suffix is appended when
        it is not already present

    Methods
    -------
    execute
        merge metrics files
        # todo this should probably be wrapped into __init__ to make this more like a function

    """

    def __init__(self, metric_files: Sequence[str], output_file: str):
        self._metric_files = metric_files
        suffix = ".csv.gz"
        self._output_file = (
            output_file if output_file.endswith(suffix) else output_file + suffix
        )

    def execute(self) -> None:
        # subclasses implement the actual merge
        raise NotImplementedError
class MergeCellMetrics(MergeMetrics):
    def execute(self) -> None:
        """Concatenate the input cell metric files.

        The source bam chunks contain disjoint sets of cells, so the per-cell
        metric tables can simply be stacked on top of one another.

        """
        frames: List[pd.DataFrame] = [
            pd.read_csv(name, index_col=0) for name in self._metric_files
        ]
        stacked: pd.DataFrame = pd.concat(frames, axis=0)
        stacked.to_csv(self._output_file, compression="gzip")
class MergeGeneMetrics(MergeMetrics):
    def execute(self) -> None:
        """Merge input gene metric files

        The bam files that metrics are calculated from contain disjoint sets of cells, each
        of which can measure the same genes.
        As a result, the metric values must be summed (count based metrics) averaged over
        (fractional, averge, or variance metrics) or recalculated (metrics that depend on other
        metrics).

        """

        # count-valued metrics: merging two chunks of the same gene is a plain sum
        count_data_to_sum = [
            "n_reads",
            "noise_reads",
            "perfect_molecule_barcodes",
            "reads_mapped_exonic",
            "reads_mapped_intronic",
            "reads_mapped_utr",
            "reads_mapped_uniquely",
            "reads_mapped_multiple",
            "duplicate_reads",
            "spliced_reads",
            "antisense_reads",
            "n_molecules",
            "n_fragments",
            "fragments_with_single_read_evidence",
            "molecules_with_single_read_evidence",
            "number_cells_detected_multiple",
            "number_cells_expressing",
        ]

        sum_operations = {c: "sum" for c in count_data_to_sum}

        def weighted_average(data_frame: pd.DataFrame) -> pd.Series:
            """Calculate the average of each metric, weighted by number of reads per chunk

            Parameters
            ----------
            data_frame : pd.DataFrame
                chunks x metrics data frame

            Returns
            -------
            weighted_average_metrics : pd.Series
                The average of each metric across chunks, weighted by the number of reads per chunk

            """
            # NOTE(review): weighting variances by read count is an
            # approximation, not an exact pooled variance — confirm acceptable.
            weights = data_frame["n_reads"].values

            columns_to_average_by_read = [
                "molecule_barcode_fraction_bases_above_30_mean",
                "molecule_barcode_fraction_bases_above_30_variance",
                "genomic_reads_fraction_bases_quality_above_30_mean",
                "genomic_reads_fraction_bases_quality_above_30_variance",
                "genomic_read_quality_mean",
                "genomic_read_quality_variance",
            ]

            return pd.Series(
                {
                    c: np.average(data_frame[c], weights=weights)
                    for c in columns_to_average_by_read
                }
            )

        def recalculate_operation(data_frame) -> pd.DataFrame:
            """Recalculate metrics that are dependent on other metric values

            Other metrics should be merged before this function is executed

            Parameters
            ----------
            data_frame : pd.DataFrame
                chunks x metrics data frame

            Returns
            -------
            recalculated_metrics : pd.DataFrame
                data frame containing recalculated metrics

            """
            return pd.DataFrame(
                data={
                    "reads_per_molecule": data_frame["n_reads"]
                    / data_frame["n_molecules"],
                    "fragments_per_molecule": data_frame["n_fragments"]
                    / data_frame["n_molecules"],
                    "reads_per_fragment": data_frame["n_reads"]
                    / data_frame["n_fragments"],
                }
            )

        # pick one file as a nucleus and merge each subsequent dataframe into it;
        # because n_reads is itself summed on each pass, the next pass's weighted
        # average re-weights by the accumulated read counts
        nucleus = pd.read_csv(self._metric_files[0], index_col=0)
        for filename in self._metric_files[1:]:
            leaf = pd.read_csv(filename, index_col=0)

            # concatenate this leaf with the nucleus metrics file
            concatenated = pd.concat([nucleus, leaf], axis=0)

            # group all duplicate gene names together
            # NOTE(review): the explicit axis= argument to groupby is deprecated
            # in pandas 2.x — confirm the pandas version this targets
            grouped = concatenated.groupby(level=0, axis=0)

            # execute the merging operations
            summed_columns = grouped.agg(sum_operations)
            averaged_columns = grouped.apply(weighted_average)

            # stitch the columns back together, add the metrics that need to be recalculated
            merged = pd.concat([summed_columns, averaged_columns], axis=1)
            recalculated_columns = recalculate_operation(merged)
            merged = pd.concat([merged, recalculated_columns], axis=1)

            # set as nucleus and continue
            nucleus = merged

        # write the data
        nucleus.to_csv(self._output_file, compression="gzip")
class MetricCSVWriter:
    """Writes metric information iteratively to (optionally compressed) csv.

    Parameters
    ----------
    output_stem : str
        File stem for the output file; the matching extension (``.csv`` or
        ``.csv.gz``) is appended when missing.
    compress : bool, optional
        Whether or not to compress the output file (default = True).

    Methods
    -------
    write_header
        Write the metric header to file.
    write
        Write an array of cell or gene metrics to file.
    close
        Close the metric file.

    """

    def __init__(self, output_stem: str, compress: bool = True):

        # check and fix extension:
        if compress:
            if not output_stem.endswith(".csv.gz"):
                output_stem += ".csv.gz"
        else:
            if not output_stem.endswith(".csv"):
                output_stem += ".csv"
        self._filename: str = output_stem

        # open the file
        if compress:
            self._open_fid: TextIO = gzip.open(self._filename, "wt")
        else:
            self._open_fid: TextIO = open(self._filename, "w")
        # populated by write_header(); None until then (was annotated as
        # List[str], which was wrong for the initial None value)
        self._header = None

    @property
    def filename(self) -> str:
        """filename with correct suffix added"""
        return self._filename

    def write_header(self, record: Mapping[str, Any]) -> None:
        """Write the metric keys to file, producing the header line of the csv file.

        Parameters
        ----------
        record : Mapping[str, Any]
            Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance,
            producing a dictionary of keys to metric values.

        """
        # keys with a leading underscore are private and are not written
        self._header = [key for key in record.keys() if not key.startswith("_")]
        self._open_fid.write("," + ",".join(self._header) + "\n")

    def write(self, index: str, record: Mapping[str, Number]) -> None:
        """Write the array of metric values for a cell or gene to file.

        Parameters
        ----------
        index : str
            The name of the cell or gene that these metrics summarize
        record : Mapping[str, Number]
            Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance,
            producing a dictionary of keys to metric values.

        Raises
        ------
        RuntimeError
            if called before ``write_header`` (previously an opaque TypeError
            from iterating None).

        """
        if self._header is None:
            raise RuntimeError("write_header must be called before write")
        ordered_fields = [str(record[k]) for k in self._header]

        # genes and cells can be None, call repr to convert to string when this induces a TypeError
        try:
            self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n")
        except TypeError:
            index = repr(index)
            self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n")

    def close(self) -> None:
        """Close the metrics file."""
        self._open_fid.close()
"""
Sequence File Iterators
=======================

.. currentmodule:: sctools

This module defines a general iterator and some helper functions for iterating over files
that contain sequencing data

Methods
-------
infer_open(file_: str, mode: str)
    helper function that determines the compression type of a file without relying on its extension
zip_readers(*readers, indices=None)
    helper function that iterates over one or more readers, optionally extracting only the records
    that correspond to indices

Classes
-------
Reader                        Basic reader that loops over one or more input files.

See Also
--------
sctools.gtf.Reader
sctools.fastq.Reader

"""

import os
import gzip
import bz2
from copy import copy
from functools import partial
from typing import Callable, Iterable, Generator, Set, List


def infer_open(file_: str, mode: str) -> Callable:
    """Helper function to infer the correct compression type of an input file

    Identifies files that are .gz or .bz2 compressed without requiring file extensions

    Parameters
    ----------
    file_ : str
        the file to open
    mode : {'r', 'rb'}
        the mode to open the file in. 'r' returns strings, 'rb' returns bytes

    Returns
    -------
    open_function : Callable
        the correct open function for the file's compression with mode pre-set through functools
        partial

    """
    # sniff the first three bytes: enough for both magic numbers below
    with open(file_, "rb") as f:
        data: bytes = f.read(3)

    # gz and bzip treat 'r' = bytes, 'rt' = string
    if data[:2] == b"\x1f\x8b":  # gzip magic number
        inferred_openhook: Callable = gzip.open
        inferred_mode: str = "rt" if mode == "r" else mode

    elif data == b"BZh":  # bz2 magic number
        inferred_openhook: Callable = bz2.open
        inferred_mode: str = "rt" if mode == "r" else mode

    else:
        inferred_openhook: Callable = open
        inferred_mode: str = mode

    return partial(inferred_openhook, mode=inferred_mode)


class Reader:
    """Basic reader object that seamlessly loops over multiple input files.

    Is subclassed to create readers for specific file types (e.g. fastq, gtf, etc.)

    Parameters
    ----------
    files : Union[str, List], optional
        The file(s) to read. If '-', read sys.stdin (default = '-')
    mode : {'r', 'rb'}, optional
        The open mode for files. If 'r', yield string data, if 'rb', yield bytes data
        (default = 'r').
    header_comment_char : str, optional
        If not None, skip lines beginning with this character (default = None).

    """

    def __init__(self, files="-", mode="r", header_comment_char=None):
        if isinstance(files, str):
            self._files = [files]
        elif isinstance(files, Iterable):  # test items of iterable
            files = list(files)
            if all(isinstance(f, str) for f in files):
                self._files = files
            else:
                raise TypeError("All passed files must be type str")
        else:
            raise TypeError("Files must be a string filename or a list of such names.")

        # set open mode:
        if mode not in {"r", "rb"}:
            raise ValueError("Mode must be one of 'r', 'rb'")
        self._mode = mode

        # in binary mode the comment prefix must be bytes to match the records
        if isinstance(header_comment_char, str) and mode == "rb":
            self._header_comment_char = header_comment_char.encode()
        else:
            self._header_comment_char = header_comment_char

    @property
    def filenames(self) -> List[str]:
        return self._files

    def __len__(self):
        """Return the length of the Reader object.

        Notes
        -----
        This function requires reading the complete file, and should typically not be
        used with sys.stdin, as it will consume the input.

        """
        return sum(1 for _ in self)

    def __iter__(self):
        for file_ in self._files:

            f = infer_open(file_, self._mode)(file_)

            # iterate over the file, dropping header lines if requested
            try:
                file_iterator = iter(f)
                if self._header_comment_char is not None:
                    # the None default stops cleanly on a file that contains
                    # nothing but header lines; a bare next() would raise
                    # StopIteration, which PEP 479 converts to RuntimeError
                    # inside this generator
                    first_record = next(file_iterator, None)
                    while first_record is not None and first_record.startswith(
                        self._header_comment_char
                    ):
                        first_record = next(file_iterator, None)
                    if first_record is None:
                        continue  # entire file was header lines

                    yield first_record  # avoid loss of first non-comment line

                yield from file_iterator  # now, run to exhaustion
            finally:  # clean up
                f.close()

    @property
    def size(self) -> int:
        """return the collective size of all files being read in bytes"""
        return sum(os.stat(f).st_size for f in self._files)

    def select_record_indices(self, indices: Set) -> Generator:
        """Iterate over provided indices only, skipping other records.

        Parameters
        ----------
        indices : Set[int]
            indices to include in the output

        Yields
        ------
        record, str
            records from file corresponding to indices

        """
        indices = copy(
            indices
        )  # passed indices is a reference, need own copy to modify
        for idx, record in enumerate(self):
            if idx in indices:
                yield record
                indices.remove(idx)

                # stopping condition
                if not indices:
                    break
def zip_readers(*readers, indices=None) -> Generator:
    """Zip together multiple reader objects, yielding records simultaneously.

    If indices is passed, only return lines in file that correspond to indices

    Parameters
    ----------
    *readers : List[Reader]
        Reader objects to simultaneously iterate over
    indices : Set[int], optional
        indices to include in the output

    Yields
    ------
    records : Tuple[str]
        one record per reader passed

    """
    # when indices are requested, restrict each reader to those records;
    # otherwise iterate the readers directly
    if indices:
        sources = (reader.select_record_indices(indices) for reader in readers)
    else:
        sources = readers
    yield from zip(*sources)
def base4_entropy(x, axis=1):
    """Calculate entropy in base four of a data matrix x

    Useful for measuring DNA entropy (with 4 nucleotides) as the output is restricted to [0, 1]

    Parameters
    ----------
    x : np.ndarray
        array of dimension one or more containing numeric types
    axis : int, optional
        axis to calculate entropy across. Values in this axis are treated as observation
        frequencies (default = 1)

    Returns
    -------
    entropy : np.ndarray
        array of input dimension - 1 containing entropy values bounded in [0, 1]

    """

    # convert to probabilities; keepdims makes the division broadcast correctly
    # for any axis and array rank (the previous code special-cased axis == 1 and
    # relied on implicit broadcasting otherwise, which fails for rank >= 3 with
    # a non-trailing axis). Identical results for the 2-D axis 0/1 cases.
    x = np.divide(x, np.sum(x, axis=axis, keepdims=True))

    with np.errstate(divide="ignore"):
        r = np.log(x) / np.log(4)

    # convention: 0 * log(0) = 0, != -INF.
    r[np.isinf(r)] = 0

    return np.abs(-1 * np.sum(x * r, axis=axis))


class OnlineGaussianSufficientStatistic:
    """
    Implementation of Welford's online mean and variance algorithm

    Methods
    -------
    update(new_value: float)
        incorporate new_value into the online estimate of mean and variance
    mean()
        return the mean value
    calculate_variance()
        calculate and return the variance
    mean_and_variance()
        return both mean and variance

    """

    __slots__ = ["_count", "_mean", "_mean_squared_error"]

    def __init__(self):
        # running sum of squared deviations from the current mean (M2 in
        # Welford's formulation)
        self._mean_squared_error: float = 0.0
        self._mean: float = 0.0
        self._count: int = 0

    def update(self, new_value: float) -> None:
        """Incorporate new_value into the running mean and squared error."""
        self._count += 1
        delta = new_value - self._mean
        self._mean += delta / self._count
        # delta2 uses the *updated* mean; delta * delta2 is Welford's
        # numerically stable increment for the sum of squared deviations
        delta2 = new_value - self._mean
        self._mean_squared_error += delta * delta2

    @property
    def mean(self) -> float:
        """return the mean value"""
        return self._mean

    def calculate_variance(self) -> float:
        """calculate and return the sample variance (NaN with < 2 observations)"""
        if self._count < 2:
            return float("nan")
        else:
            return self._mean_squared_error / (self._count - 1)

    def mean_and_variance(self) -> Tuple[float, float]:
        """calculate and return the mean and variance"""
        return self.mean, self.calculate_variance()
-------------------------------------------------------------------------------- /src/sctools/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/__init__.py -------------------------------------------------------------------------------- /src/sctools/test/data/cell-gene-umi-queryname-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-gene-umi-queryname-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell-sorted-missing-cb.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-sorted-missing-cb.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/cell_metrics_missing_cb.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/cell_metrics_missing_cb.csv.gz -------------------------------------------------------------------------------- /src/sctools/test/data/chr1.30k_records.gtf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/chr1.30k_records.gtf.gz -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/expected_picard_group.csv: -------------------------------------------------------------------------------- 1 | ,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_
READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.SECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 2 | 
Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMet
rics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics 3 | test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2.csv: -------------------------------------------------------------------------------- 1 | ,Aligned 0 time,Aligned 1 time,Aligned >1 
times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads 2 | Class,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G 3 | test,478,240,106,4414,652,412,1,95.64%,5479,824 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log: -------------------------------------------------------------------------------- 1 | HISAT2 summary stats: 2 | Total pairs: 5479 3 | Aligned concordantly or discordantly 0 time: 412 (7.52%) 4 | Aligned concordantly 1 time: 4414 (80.56%) 5 | Aligned concordantly >1 times: 652 (11.90%) 6 | Aligned discordantly 1 time: 1 (0.02%) 7 | Total unpaired reads: 824 8 | Aligned 0 time: 478 (58.01%) 9 | Aligned 1 time: 240 (29.13%) 10 | Aligned >1 times: 106 (12.86%) 11 | Overall alignment rate: 95.64% 12 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_trans.csv: -------------------------------------------------------------------------------- 1 | ,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads 2 | Class,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T 3 | test,7270,0,0,360,1484,3635,0,33.66%,5479,7270 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log: -------------------------------------------------------------------------------- 1 | HISAT2 summary stats: 2 | Total pairs: 5479 3 | Aligned concordantly or discordantly 0 time: 3635 (66.34%) 4 | Aligned concordantly 1 time: 360 (6.57%) 5 | Aligned 
concordantly >1 times: 1484 (27.09%) 6 | Aligned discordantly 1 time: 0 (0.00%) 7 | Total unpaired reads: 7270 8 | Aligned 0 time: 7270 (100.00%) 9 | Aligned 1 time: 0 (0.00%) 10 | Aligned >1 times: 0 (0.00%) 11 | Overall alignment rate: 33.66% 12 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_picard_group.csv: -------------------------------------------------------------------------------- 1 | ,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_
OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.SECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 2 | 
Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMet
rics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics 3 | test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## 
htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.AlignmentSummaryMetrics 7 | CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP 8 | FIRST_OF_PAIR 5479 5479 1 0 5256 0.959299 131124 4650 116063 115095 0 0.0009 0.000922 0.000069 25 5080 0.966514 190 0.036149 0 0.494292 0.006141 0 9 | SECOND_OF_PAIR 5479 5479 1 0 5224 0.953459 130281 4629 115487 114015 0 0.000852 0.000849 0.000038 25 5080 0.972435 158 0.030245 0 0.508806 0.006165 
0 10 | PAIR 10958 10958 1 0 10480 0.956379 261405 9279 231550 229110 0 0.000876 0.000885 0.000054 25 10160 0.969466 348 0.033206 0 0.501527 0.006153 0 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # MarkDuplicates INPUT=[/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam] OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.MarkDuplicated.bam METRICS_FILE=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:51:46 UTC 2018 5 | 6 | ## METRICS CLASS picard.sam.DuplicationMetrics 7 | LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE 8 | d20fb2dd-3d98-4516-a648-dee5e1917bd7 320 5080 
4393 478 33 21 0 0.007156 612743 9 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics 7 | REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE 8 | A C A>C 231512 16 0.000069 9 | A G A>G 231512 156 0.000673 10 | A T A>T 231512 16 0.000069 11 | C A C>A 173880 16 0.000092 12 | C G C>G 173880 14 0.000081 13 | C T C>T 173880 82 0.000471 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt: 
-------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.GcBiasSummaryMetrics 7 | ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP 8 | All Reads ALL 100 7701 14873 10.733266 1.82225 0.112713 0.817807 1.086361 2.181453 0.143318 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics 
INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:52:21 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.InsertSizeMetrics 7 | MEDIAN_INSERT_SIZE MEDIAN_ABSOLUTE_DEVIATION MIN_INSERT_SIZE MAX_INSERT_SIZE MEAN_INSERT_SIZE STANDARD_DEVIATION READ_PAIRS PAIR_ORIENTATION WIDTH_OF_10_PERCENT WIDTH_OF_20_PERCENT WIDTH_OF_30_PERCENT WIDTH_OF_40_PERCENT WIDTH_OF_50_PERCENT WIDTH_OF_60_PERCENT WIDTH_OF_70_PERCENT WIDTH_OF_80_PERCENT WIDTH_OF_90_PERCENT WIDTH_OF_99_PERCENT SAMPLE LIBRARY READ_GROUP 8 | 191 63 33 2725787 207.219528 106.256303 5067 FR 25 49 73 99 127 157 195 267 641 87835 9 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectRnaSeqMetrics 
REF_FLAT=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Thu Aug 30 20:51:55 UTC 2018 5 | 6 | ## METRICS CLASS picard.analysis.RnaSeqMetrics 7 | PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP 8 | 273950 261405 0 56934 37664 101238 65569 0 0 0 719 795 60 0.474901 0.525099 0 0.2178 0.144083 0.387284 0.250833 0.361883 0.345311 0 0.939679 0.680576 0.705663 0.496023 9 | -------------------------------------------------------------------------------- 
/src/sctools/test/data/group_metrics/test_rsem.cnt: -------------------------------------------------------------------------------- 1 | 3635 1844 0 5479 2 | 1652 192 1484 3 | 6599 3 4 | 0 3635 5 | 1 360 6 | 2 327 7 | 3 416 8 | 4 243 9 | 5 185 10 | 6 85 11 | 7 76 12 | 8 53 13 | 9 16 14 | 10 83 15 | Inf 0 16 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics/test_rsem.csv: -------------------------------------------------------------------------------- 1 | ,alignable reads,filtered reads,multiple mapped,strand,total alignments,total reads,unalignable reads,uncertain reads,unique aligned 2 | Class,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM 3 | test,1844,0,192,3,6599,5479,3635,1484,1652 4 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false 
USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.AlignmentSummaryMetrics 7 | CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP 8 | UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json 
USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:17 UTC 2019 5 | 6 | ## METRICS CLASS picard.sam.DuplicationMetrics 7 | LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE 8 | SRR6258488 770963 0 473100 315689 345396 0 0 0.448006 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.GcBiasSummaryMetrics 7 | ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 
GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP 8 | All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt: -------------------------------------------------------------------------------- 1 | ## htsjdk.samtools.metrics.StringHeader 2 | # CollectRnaSeqMetrics REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false 3 | ## htsjdk.samtools.metrics.StringHeader 4 | # Started on: Tue May 14 15:45:18 UTC 2019 5 | 6 | ## METRICS CLASS picard.analysis.RnaSeqMetrics 7 | PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES 
PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP 8 | 54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0 9 | 10 | ## HISTOGRAM java.lang.Integer 11 | normalized_position All_Reads.normalized_coverage 12 | 0 1.252653 13 | 1 1.146108 14 | 2 1.065068 15 | 3 1.122433 16 | 4 1.234516 17 | 5 1.247113 18 | 6 1.2191 19 | 7 1.08917 20 | 8 1.101883 21 | 9 1.130302 22 | 10 1.082888 23 | 11 1.146879 24 | 12 1.173149 25 | 13 1.084206 26 | 14 1.035169 27 | 15 1.169359 28 | 16 1.278125 29 | 17 1.298059 30 | 18 1.418038 31 | 19 1.468055 32 | 20 1.306559 33 | 21 1.210198 34 | 22 0.953958 35 | 23 0.806139 36 | 24 0.815513 37 | 25 0.887045 38 | 26 0.763414 39 | 27 0.737914 40 | 28 0.702678 41 | 29 0.689913 42 | 30 0.633512 43 | 31 0.665368 44 | 32 0.682949 45 | 33 0.848599 46 | 34 0.941722 47 | 35 1.082228 48 | 36 1.113449 49 | 37 1.049003 50 | 38 0.97788 51 | 39 0.989931 52 | 40 0.92986 53 | 41 0.874432 54 | 42 0.87788 55 | 43 0.868871 56 | 44 0.92942 57 | 45 1.015775 58 | 46 1.070114 59 | 47 1.023889 60 | 48 1.023103 61 | 49 0.988576 62 | 50 0.931694 63 | 51 0.794716 64 | 52 0.765784 65 | 53 0.721218 66 | 54 0.723223 67 | 55 0.711507 68 | 56 0.704034 69 | 57 0.694139 70 | 58 0.741844 71 | 59 0.831505 72 | 60 0.806244 73 | 61 0.869419 74 | 62 0.987354 75 | 63 0.954176 76 | 64 0.925553 77 | 65 0.951851 78 | 66 0.906269 79 | 67 0.85666 80 | 68 0.985052 81 | 69 0.947861 82 | 70 0.98528 83 | 71 0.873541 84 | 72 0.87925 85 | 73 0.956294 86 | 74 1.137028 87 | 75 1.206313 88 | 76 1.148145 89 | 77 1.159051 90 | 78 1.207689 91 | 79 1.170334 92 | 80 1.199969 93 | 81 1.391121 94 | 82 1.243649 95 | 83 1.235795 96 | 84 1.227105 97 | 85 1.278662 98 | 86 1.298065 99 | 87 1.201038 100 | 88 1.2361 101 | 89 1.098932 102 | 90 1.042881 103 | 91 1.037875 104 | 92 0.95545 105 | 93 0.969215 
106 | 94 1.059149 107 | 95 0.857316 108 | 96 0.792585 109 | 97 0.817511 110 | 98 0.880909 111 | 99 0.786114 112 | 100 0.548663 113 | 114 | -------------------------------------------------------------------------------- /src/sctools/test/data/small-cell-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/small-cell-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/small-gene-sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/small-gene-sorted.bam -------------------------------------------------------------------------------- /src/sctools/test/data/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.bam -------------------------------------------------------------------------------- /src/sctools/test/data/test.gtf.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.gtf.bz2 -------------------------------------------------------------------------------- /src/sctools/test/data/test.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HumanCellAtlas/sctools/7876b1e5a6463856cac2609ab8bea42eb1d638e2/src/sctools/test/data/test.gtf.gz -------------------------------------------------------------------------------- /src/sctools/test/data/test_i7.fastq: 
-------------------------------------------------------------------------------- 1 | @ST-K00126:308:HFLYFBBXX:1:1101:5172:1173 1:N:0:NCACAATG 2 | NCACAATG 3 | + 4 | #AAFFJFF 5 | @ST-K00126:308:HFLYFBBXX:1:1101:31314:1173 1:N:0:NCACAATG 6 | NCACAATG 7 | + 8 | #AAFFJA- 9 | @ST-K00126:308:HFLYFBBXX:1:1101:31984:1173 1:N:0:NCACAATG 10 | NCACAATG 11 | + 12 | #AAFFJAA 13 | @ST-K00126:308:HFLYFBBXX:1:1101:4574:1191 1:N:0:NCACAATG 14 | NCACAATG 15 | + 16 | #AAAFJFJ 17 | @ST-K00126:308:HFLYFBBXX:1:1101:8186:1191 1:N:0:NCACAATG 18 | NCACAATG 19 | + 20 | #AAFFJFJ 21 | @ST-K00126:308:HFLYFBBXX:1:1101:13159:1191 1:N:0:NCACAATG 22 | NCACAATG 23 | + 24 | #AAFFJJJ 25 | @ST-K00126:308:HFLYFBBXX:1:1101:17969:1191 1:N:0:NCACAATG 26 | NCACAATG 27 | + 28 | #AAFFJFJ 29 | @ST-K00126:308:HFLYFBBXX:1:1101:22942:1191 1:N:0:NCACAATG 30 | NCACAATG 31 | + 32 | #AAFFJAF 33 | @ST-K00126:308:HFLYFBBXX:1:1101:13190:1209 1:N:0:NCACAATG 34 | NCACAATG 35 | + 36 | #AAFFJJJ 37 | @ST-K00126:308:HFLYFBBXX:1:1101:15300:1209 1:N:0:NCACAATG 38 | NCACAATG 39 | + 40 | #AAFFJJF 41 | @ST-K00126:308:HFLYFBBXX:1:1101:19603:1209 1:N:0:NCACAATG 42 | NCACAATG 43 | + 44 | #AAAFJAA 45 | @ST-K00126:308:HFLYFBBXX:1:1101:20111:1209 1:N:0:NCACAATG 46 | NCACAATG 47 | + 48 | #AAFFJJJ 49 | @ST-K00126:308:HFLYFBBXX:1:1101:23926:1209 1:N:0:NCACAATG 50 | NCACAATG 51 | + 52 | #AAFFJJJ 53 | @ST-K00126:308:HFLYFBBXX:1:1101:2828:1226 1:N:0:NCACAATG 54 | NCACAATG 55 | + 56 | #AA-FFFF 57 | @ST-K00126:308:HFLYFBBXX:1:1101:8004:1226 1:N:0:NCACAATG 58 | NCACAATG 59 | + 60 | #AAFFJJJ 61 | @ST-K00126:308:HFLYFBBXX:1:1101:12814:1226 1:N:0:NCACAATG 62 | NCACAATG 63 | + 64 | #AAAFJJF 65 | @ST-K00126:308:HFLYFBBXX:1:1101:13240:1226 1:N:0:NCACAATG 66 | NCACAATG 67 | + 68 | #AAFFJJJ 69 | @ST-K00126:308:HFLYFBBXX:1:1101:15818:1226 1:N:0:NCACAATG 70 | NCACAATG 71 | + 72 | #AAFFJJJ 73 | @ST-K00126:308:HFLYFBBXX:1:1101:23652:1226 1:N:0:NCACAATG 74 | NCACAATG 75 | + 76 | #AAFFJJF 77 | @ST-K00126:308:HFLYFBBXX:1:1101:27793:1226 1:N:0:NCACAATG 78 | 
NCACAATG 79 | + 80 | #AAFFJJJ 81 | @ST-K00126:308:HFLYFBBXX:1:1101:29904:1226 1:N:0:NCACAATG 82 | NCACAATG 83 | + 84 | #AAAFJ-A 85 | @ST-K00126:308:HFLYFBBXX:1:1101:2920:1244 1:N:0:NCACAATG 86 | NCACAATG 87 | + 88 | #AAFFJAA 89 | @ST-K00126:308:HFLYFBBXX:1:1101:5010:1244 1:N:0:NCACAATG 90 | NCACAATG 91 | + 92 | #AAFFJJ< 93 | @ST-K00126:308:HFLYFBBXX:1:1101:11667:1244 1:N:0:NCACAATG 94 | NCACAATG 95 | + 96 | #AAF= 0) 59 | assert np.all(bd <= 1) 60 | 61 | 62 | def test_summarize_hamming_distances_gives_reasonable_results( 63 | short_barcode_set_from_iterable, 64 | ): 65 | 66 | hamming_summary = short_barcode_set_from_iterable.summarize_hamming_distances() 67 | 68 | # we know 10x barcodes have at least this much distance 69 | assert hamming_summary["minimum"] >= 2 70 | # no barcode can have more hamming distance than length 71 | assert all(v <= 16 for v in hamming_summary.values()) 72 | 73 | 74 | # TEST HashErrorsToCorrectBarcodes 75 | 76 | 77 | @pytest.fixture(scope="module") 78 | def trivial_whitelist(): 79 | barcode_iterable = ["A" * 8] 80 | error_mapping = barcode.ErrorsToCorrectBarcodesMap._prepare_single_base_error_hash_table( 81 | barcode_iterable 82 | ) 83 | return barcode.ErrorsToCorrectBarcodesMap(error_mapping) 84 | 85 | 86 | @pytest.fixture(scope="module") 87 | def truncated_whitelist_from_10x(): 88 | # note that this whitelist contains 1 non-10x barcode to ensure the presence of a matching 89 | # target in the test data. 
90 | error_mapping = barcode.ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( 91 | data_dir + "1k-august-2016.txt" 92 | ) 93 | return error_mapping 94 | 95 | 96 | def test_incorrect_input_raises_errors(trivial_whitelist): 97 | with pytest.raises(TypeError): 98 | barcode.ErrorsToCorrectBarcodesMap("not_a_mapping") 99 | with pytest.raises(TypeError): 100 | barcode.ErrorsToCorrectBarcodesMap({"not_a_mapping"}) 101 | with pytest.raises(TypeError): 102 | barcode.ErrorsToCorrectBarcodesMap(["not_a_mapping", "sldkf"]) 103 | assert isinstance(trivial_whitelist, barcode.ErrorsToCorrectBarcodesMap) 104 | 105 | 106 | def test_correct_barcode_finds_and_corrects_1_base_errors(trivial_whitelist): 107 | assert trivial_whitelist.get_corrected_barcode("TAAAAAAA") == "AAAAAAAA" 108 | assert trivial_whitelist.get_corrected_barcode("AAAACAAA") == "AAAAAAAA" 109 | assert trivial_whitelist.get_corrected_barcode("AAAGAAAA") == "AAAAAAAA" 110 | assert trivial_whitelist.get_corrected_barcode("AAAAAAAA") == "AAAAAAAA" 111 | 112 | 113 | def test_correct_barcode_raises_keyerror_when_barcode_not_correct_length( 114 | trivial_whitelist, 115 | ): 116 | with pytest.raises(KeyError): 117 | trivial_whitelist.get_corrected_barcode("AAA") 118 | with pytest.raises(KeyError): 119 | trivial_whitelist.get_corrected_barcode("AAAAAAAAA") 120 | with pytest.raises(KeyError): 121 | trivial_whitelist.get_corrected_barcode("AAAAAAAAAA") 122 | 123 | 124 | def test_correct_barcode_raises_keyerror_when_barcode_has_more_than_one_error( 125 | trivial_whitelist, 126 | ): 127 | with pytest.raises(KeyError): 128 | trivial_whitelist.get_corrected_barcode("AAAAAATT") 129 | with pytest.raises(KeyError): 130 | trivial_whitelist.get_corrected_barcode("TTAAAAAA") 131 | 132 | 133 | @pytest.fixture(scope="module") 134 | def tagged_bamfile(): 135 | outbam = data_dir + "bam_with_tags_test.bam" 136 | args = [ 137 | "--r1", 138 | data_dir + "test_r1.fastq", 139 | "--i1", 140 | data_dir + "test_i7.fastq", 141 | 
"--u2", 142 | data_dir + "test.bam", 143 | "--output-bamfile", 144 | outbam, 145 | ] 146 | platform.TenXV2.attach_barcodes(args) 147 | return outbam 148 | 149 | 150 | def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x): 151 | outbam = data_dir + "bam_with_cb_tags.bam" 152 | truncated_whitelist_from_10x.correct_bam(tagged_bamfile, outbam) 153 | success = False 154 | with pysam.AlignmentFile(outbam, "rb") as f: 155 | for record in f: 156 | try: 157 | success = record.get_tag(consts.CELL_BARCODE_TAG_KEY) 158 | except KeyError: 159 | continue 160 | assert success 161 | os.remove(outbam) 162 | -------------------------------------------------------------------------------- /src/sctools/test/test_encodings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .. import encodings 3 | from itertools import combinations 4 | 5 | 6 | @pytest.fixture(scope="module") 7 | def sequence(): 8 | return b"ACGTTTGAGATGAGATATAGANNNN" 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def encoder_2bit(sequence): 13 | length = len(sequence) 14 | return encodings.TwoBit(length) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def encoder_3bit(): 19 | return encodings.ThreeBit() 20 | 21 | 22 | @pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit]) 23 | def encoder(request): 24 | return request.param 25 | 26 | 27 | def test_two_bit_encode_decode_produces_same_string_except_for_N( 28 | sequence, encoder_2bit 29 | ): 30 | encoded = encoder_2bit.encode(sequence) 31 | decoded = encoder_2bit.decode(encoded) 32 | assert sequence[:4] == decoded[:4] # last 4 are N, which get randomized 33 | 34 | 35 | def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit): 36 | encoded = encoder_3bit.encode(sequence) 37 | decoded = encoder_3bit.decode(encoded) 38 | assert sequence == decoded 39 | 40 | 41 | def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit): 42 | 
sequence_no_n = b"AGCGCGAT" 43 | gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G") 44 | encoded = encoder_2bit.encode(sequence_no_n) 45 | assert encoder_2bit.gc_content(encoded) == gc_content 46 | 47 | 48 | def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit): 49 | encoded = encoder_3bit.encode(sequence) 50 | assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count( 51 | b"G" 52 | ) 53 | 54 | 55 | def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit): 56 | with pytest.raises(KeyError): 57 | encoder_2bit.encode(b"ACGTP") # P is not a valid code 58 | 59 | 60 | def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit): 61 | encoded = encoder_3bit.encode(b"ACGTP") # P is not a valid code 62 | decoded = encoder_3bit.decode(encoded) 63 | assert decoded == b"ACGTN" 64 | 65 | 66 | @pytest.fixture 67 | def simple_barcodes(): 68 | """simple barcode set with min_hamming = 1, max_hamming = 2""" 69 | return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"] 70 | 71 | 72 | @pytest.fixture 73 | def simple_hamming_distances(simple_barcodes): 74 | simple_hamming_distances = [] 75 | for a, b in combinations(simple_barcodes, 2): 76 | d_hamming = 0 77 | for i, j in zip(a, b): 78 | if i != j: 79 | d_hamming += 1 80 | simple_hamming_distances.append(d_hamming) 81 | return simple_hamming_distances 82 | 83 | 84 | def test_encoded_hamming_distance_is_accurate( 85 | simple_hamming_distances, simple_barcodes, encoder 86 | ): 87 | # encode simple barcodes 88 | tbe = encoder(4) 89 | encoded = [tbe.encode(b) for b in simple_barcodes] 90 | encoded_hamming_distances = [] 91 | 92 | # use hamming distance function 93 | for a, b in combinations(encoded, 2): 94 | encoded_hamming_distances.append(tbe.hamming_distance(a, b)) 95 | 96 | # verify they are the same as the simple function used in this file 97 | assert simple_hamming_distances == encoded_hamming_distances 98 | 
-------------------------------------------------------------------------------- /src/sctools/test/test_entrypoints.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import tempfile 4 | 5 | import numpy as np 6 | import pysam 7 | import pytest 8 | import scipy.sparse as sp 9 | 10 | from sctools import bam, platform, count, consts 11 | 12 | data_dir = os.path.split(__file__)[0] + "/data/" 13 | 14 | 15 | def test_Attach10XBarcodes_entrypoint(): 16 | args = [ 17 | "--r1", 18 | data_dir + "test_r1.fastq", 19 | "--i1", 20 | data_dir + "test_i7.fastq", 21 | "--u2", 22 | data_dir + "test.bam", 23 | "--output-bamfile", 24 | "test_tagged_bam.bam", 25 | ] 26 | 27 | rc = platform.TenXV2.attach_barcodes(args) 28 | assert rc == 0 29 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 30 | for alignment in f: 31 | # each alignment should now have a tag, and that tag should be a string 32 | assert isinstance( 33 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 34 | ) 35 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 36 | assert isinstance( 37 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 38 | ) 39 | assert isinstance( 40 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 41 | ) 42 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 43 | assert isinstance( 44 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 45 | ) 46 | os.remove("test_tagged_bam.bam") # clean up 47 | 48 | 49 | def test_Attach10XBarcodes_entrypoint_with_whitelist(): 50 | args = [ 51 | "--r1", 52 | data_dir + "test_r1.fastq", 53 | "--i1", 54 | data_dir + "test_i7.fastq", 55 | "--u2", 56 | data_dir + "test.bam", 57 | "--output-bamfile", 58 | "test_tagged_bam.bam", 59 | "--whitelist", 60 | data_dir + "1k-august-2016.txt", 61 | ] 62 | 63 | return_call = platform.TenXV2.attach_barcodes(args) 64 | assert 
return_call == 0 65 | success = False 66 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 67 | for alignment in f: 68 | if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): 69 | success = True 70 | # each alignment should now have a tag, and that tag should be a string 71 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 72 | assert isinstance( 73 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 74 | ) 75 | assert isinstance( 76 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 77 | ) 78 | assert isinstance( 79 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 80 | ) 81 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 82 | assert isinstance( 83 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 84 | ) 85 | assert success 86 | os.remove("test_tagged_bam.bam") # clean up 87 | 88 | 89 | def test_AttachBarcodes_entrypoint_with_whitelist(): 90 | # test of the BarcodePlatform.attach_barcodes entry point with 91 | # sample, cell, and molecule barcodes all specified 92 | args = [ 93 | "--r1", 94 | data_dir + "test_r1.fastq", 95 | "--i1", 96 | data_dir + "test_i7.fastq", 97 | "--u2", 98 | data_dir + "test.bam", 99 | "--output-bamfile", 100 | "test_tagged_bam.bam", 101 | "--whitelist", 102 | data_dir + "1k-august-2016.txt", 103 | "--sample-barcode-start-position", 104 | "0", 105 | "--sample-barcode-length", 106 | "8", 107 | "--cell-barcode-start-position", 108 | "0", 109 | "--cell-barcode-length", 110 | "16", 111 | "--molecule-barcode-start-position", 112 | "16", 113 | "--molecule-barcode-length", 114 | "7", # changed 10>7 intentionally for test 115 | ] 116 | 117 | return_call = platform.BarcodePlatform.attach_barcodes(args) 118 | assert return_call == 0 119 | success = False 120 | with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: 121 | for alignment in f: 122 | if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): 
123 | success = True 124 | # each alignment should now have a tag, and that tag should be a string 125 | assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) 126 | assert isinstance( 127 | alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str 128 | ) 129 | assert isinstance( 130 | alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str 131 | ) 132 | assert len(alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY)) == 7 133 | assert isinstance( 134 | alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str 135 | ) 136 | assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) 137 | assert isinstance( 138 | alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str 139 | ) 140 | assert success 141 | os.remove("test_tagged_bam.bam") # clean up 142 | 143 | 144 | def test_split_bam(): 145 | tag_args = [ 146 | "--r1", 147 | data_dir + "test_r1.fastq", 148 | "--i1", 149 | data_dir + "test_i7.fastq", 150 | "--u2", 151 | data_dir + "test.bam", 152 | "--output-bamfile", 153 | "test_tagged_bam.bam", 154 | "--whitelist", 155 | data_dir + "1k-august-2016.txt", 156 | ] 157 | 158 | platform.TenXV2.attach_barcodes(tag_args) 159 | 160 | split_args = [ 161 | "--bamfile", 162 | "test_tagged_bam.bam", 163 | "--output-prefix", 164 | "test_tagged", 165 | "--subfile-size", 166 | "0.005", 167 | "--tags", 168 | consts.CELL_BARCODE_TAG_KEY, 169 | consts.RAW_CELL_BARCODE_TAG_KEY, 170 | ] 171 | 172 | return_call = platform.GenericPlatform.split_bam(split_args) 173 | assert return_call == 0 174 | 175 | for f in glob.glob("test_tagged*"): 176 | os.remove(f) 177 | 178 | 179 | def test_tag_sort_bam(): 180 | args = [ 181 | "-i", 182 | data_dir + "unsorted.bam", 183 | "-o", 184 | "test_sorted.bam", 185 | "-t", 186 | consts.CELL_BARCODE_TAG_KEY, 187 | consts.GENE_NAME_TAG_KEY, 188 | consts.MOLECULE_BARCODE_TAG_KEY, 189 | ] 190 | 191 | return_call = platform.GenericPlatform.tag_sort_bam(args) 192 | assert return_call == 0 193 | 194 | 
tag_keys = [ 195 | consts.CELL_BARCODE_TAG_KEY, 196 | consts.GENE_NAME_TAG_KEY, 197 | consts.MOLECULE_BARCODE_TAG_KEY, 198 | ] 199 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 200 | segments = f.fetch(until_eof=True) 201 | tag_sortable_records = ( 202 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 203 | ) 204 | bam.verify_sort(tag_sortable_records, tag_keys) 205 | 206 | for f in glob.glob("test_sorted*"): 207 | os.remove(f) 208 | 209 | 210 | def test_tag_sort_bam_dash_t_specified_multiple_times(): 211 | args = [ 212 | "-i", 213 | data_dir + "unsorted.bam", 214 | "-o", 215 | "test_sorted.bam", 216 | "-t", 217 | consts.CELL_BARCODE_TAG_KEY, 218 | "-t", 219 | consts.GENE_NAME_TAG_KEY, 220 | "-t", 221 | consts.MOLECULE_BARCODE_TAG_KEY, 222 | ] 223 | 224 | return_call = platform.GenericPlatform.tag_sort_bam(args) 225 | assert return_call == 0 226 | 227 | tag_keys = [ 228 | consts.CELL_BARCODE_TAG_KEY, 229 | consts.GENE_NAME_TAG_KEY, 230 | consts.MOLECULE_BARCODE_TAG_KEY, 231 | ] 232 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 233 | segments = f.fetch(until_eof=True) 234 | tag_sortable_record_generator = ( 235 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 236 | ) 237 | bam.verify_sort(tag_sortable_record_generator, tag_keys) 238 | 239 | for f in glob.glob("test_sorted*"): 240 | os.remove(f) 241 | 242 | 243 | def test_tag_sort_bam_no_tags(): 244 | args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"] 245 | 246 | return_call = platform.GenericPlatform.tag_sort_bam(args) 247 | assert return_call == 0 248 | 249 | tag_keys = [] 250 | with pysam.AlignmentFile("test_sorted.bam", "rb") as f: 251 | segments = f.fetch(until_eof=True) 252 | tag_sortable_records = ( 253 | bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments 254 | ) 255 | bam.verify_sort(tag_sortable_records, tag_keys) 256 | 257 | for f in glob.glob("test_sorted*"): 258 | os.remove(f) 259 | 260 | 261 
| def test_verify_bam_sort(): 262 | args = [ 263 | "-i", 264 | data_dir + "cell-gene-umi-queryname-sorted.bam", 265 | "-t", 266 | consts.CELL_BARCODE_TAG_KEY, 267 | consts.GENE_NAME_TAG_KEY, 268 | consts.MOLECULE_BARCODE_TAG_KEY, 269 | ] 270 | 271 | return_call = platform.GenericPlatform.verify_bam_sort(args) 272 | assert return_call == 0 273 | 274 | 275 | def test_verify_bam_sort_raises_error_on_unsorted(): 276 | args = [ 277 | "-i", 278 | data_dir + "unsorted.bam", 279 | "-t", 280 | consts.CELL_BARCODE_TAG_KEY, 281 | consts.GENE_NAME_TAG_KEY, 282 | consts.MOLECULE_BARCODE_TAG_KEY, 283 | ] 284 | 285 | with pytest.raises(bam.SortError): 286 | platform.GenericPlatform.verify_bam_sort(args) 287 | 288 | 289 | def test_count_merge(): 290 | tmp = tempfile.mkdtemp() 291 | 292 | data, ind, col = [np.arange(10)] * 3 293 | matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr() 294 | # be lazy and reuse the inds as the col and row index 295 | counts = count.CountMatrix(matrix, ind, col) 296 | counts.save(tmp + "/test_input_1") 297 | counts.save(tmp + "/test_input_2") 298 | 299 | merge_args = [ 300 | "-o", 301 | tmp + "/test_merged_counts", 302 | "-i", 303 | tmp + "/test_input_2", 304 | tmp + "/test_input_1", 305 | ] 306 | return_call = platform.GenericPlatform.merge_count_matrices(merge_args) 307 | assert return_call == 0 308 | -------------------------------------------------------------------------------- /src/sctools/test/test_fastq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | from functools import partial 4 | from itertools import product 5 | 6 | import pytest 7 | 8 | from .. 
import os
import string
from functools import partial
from itertools import product

import pytest

from .. import fastq, consts
from ..reader import zip_readers

# useful module-level globals for the tests below
data_dir = os.path.split(__file__)[0] + "/data/"


def _data_paths(*names):
    """Prefix each test-data basename with the data directory."""
    return [data_dir + name for name in names]


_i7_files = _data_paths("test_i7.fastq", "test_i7.fastq.gz", "test_i7.fastq.bz2")
_files = _data_paths("test_i7.fastq", "test_r1.fastq", "test_r2.fastq")
_gz_files = _data_paths("test_i7.fastq.gz", "test_r1.fastq.gz", "test_r2.fastq.gz")
_bz2_files = _data_paths("test_i7.fastq.bz2", "test_r1.fastq.bz2", "test_r2.fastq.bz2")

_modes = ("r", "rb")
_files_and_modes = list(product(_i7_files, _modes))
_multifiles_and_modes = list(product((_files, _gz_files, _bz2_files), _modes))
_map_encoder = {"r": str, "rb": partial(bytes, encoding="utf-8")}


# TEST READER


@pytest.fixture(scope="module", params=_files_and_modes)
def i7_files_compressions_and_modes(request):
    """Yield (filename, mode) pairs across compression types and read modes."""
    filename, mode = request.param
    return filename, mode


@pytest.fixture(scope="module", params=_multifiles_and_modes)
def reader_all_compressions(request):
    """Yield an open fastq Reader for each compression type and read mode."""
    file_group, mode = request.param
    return fastq.Reader(file_group, mode)


@pytest.fixture(scope="module")
def bytes_fastq_record():
    return [b"@name\n", b"ACTACAAT\n", b"+\n", b"%%%%AAAA\n"]


@pytest.fixture(scope="module")
def string_fastq_record():
    return ["@name\n", "ACTACAAT\n", "+\n", "%%%%AAAA\n"]


def test_reader_stores_filenames():
    expected_names = ["notreal", "fake"]
    reader = fastq.Reader(files=expected_names)
    assert reader.filenames == expected_names


def test_reader_reads_first_record(reader_all_compressions):
    """The first record of every reader matches the known first sequence."""
    for record in reader_all_compressions:
        assert isinstance(record, fastq.Record)
        if isinstance(record.sequence, str):
            expected_sequence = "NCACAATG\n"
        else:
            expected_sequence = b"NCACAATG\n"
        assert record.sequence == expected_sequence
        break  # only the first record matters here
| "NCACAATG\n" if isinstance(record.sequence, str) else b"NCACAATG\n" 67 | ) 68 | assert record.sequence == expected_result 69 | break # just first record 70 | 71 | 72 | def test_reader_skips_header_character_raises_value_error( 73 | i7_files_compressions_and_modes, 74 | ): 75 | """ 76 | test should skip the first name line, shifting each record up 1. As a result, the 77 | first sequence should be found in the name field 78 | """ 79 | filename, mode = i7_files_compressions_and_modes 80 | rd = fastq.Reader(filename, mode=mode, header_comment_char="@") 81 | with pytest.raises(ValueError): 82 | next(iter(rd)) 83 | 84 | 85 | def test_reader_reads_correct_number_of_records_across_multiple_files( 86 | reader_all_compressions, 87 | ): 88 | assert len(reader_all_compressions) == 300 # 3 files 89 | 90 | 91 | def test_mixed_filetype_read_gets_correct_record_number(): 92 | rd = fastq.Reader([_gz_files[0], _bz2_files[0]], mode="r", header_comment_char="#") 93 | 94 | assert len(rd) == 200 95 | 96 | 97 | def test_non_string_filename_raises_typeerror(): 98 | with pytest.raises(TypeError): 99 | _ = fastq.Reader(10, "r") 100 | 101 | 102 | def test_non_string_filename_in_iterable_raises_typeerror(): 103 | with pytest.raises(TypeError): 104 | _ = fastq.Reader(("works", 10), "r") 105 | 106 | 107 | def test_invalid_open_mode_raises_valueerror(): 108 | with pytest.raises(ValueError): 109 | _ = fastq.Reader("works", "not_acceptable_open_mode") 110 | 111 | 112 | def test_fastq_returns_correct_filesize_for_single_and_multiple_files(): 113 | rd = fastq.Reader( 114 | _i7_files[0], mode="r", header_comment_char="#" # mode irrelevant 115 | ) 116 | assert rd.size == 7774 117 | 118 | rd = fastq.Reader(_i7_files, mode="r", header_comment_char="#") # mode irrelevant 119 | assert rd.size == 7774 + 853 + 802 # three file sizes 120 | 121 | 122 | def test_reader_properly_subsets_based_on_indices(): 123 | rd = fastq.Reader(_i7_files[0], mode="r") 124 | indices = {0, 5, 10, 12} 125 | n_records = sum(1 
def test_zipping_readers_generates_expected_output():
    """Two readers over the same file yield identical records in lockstep."""
    first_reader = fastq.Reader(_files[0], "r")
    second_reader = fastq.Reader(_files[0], "r")
    for r1, r2 in zip_readers(first_reader, second_reader):
        assert isinstance(r1, fastq.Record)
        assert isinstance(r2, fastq.Record)
        assert r1.sequence == r2.sequence == "NCACAATG\n"
        break  # only the first record matters here


def test_zipping_readers_with_indices_generates_expected_output():
    """zip_readers honors an index subset while keeping readers in lockstep."""
    first_reader = fastq.Reader(_files[0], "r")
    second_reader = fastq.Reader(_files[0], "r")
    for r1, r2 in zip_readers(first_reader, second_reader, indices={0, 1, 2, 3}):
        assert isinstance(r1, fastq.Record)
        assert isinstance(r2, fastq.Record)
        assert r1.sequence == r2.sequence == "NCACAATG\n"
        break  # only the first record matters here


def test_printing_bytes_record_generates_valid_fastq_record(bytes_fastq_record):
    record = fastq.Record(bytes_fastq_record)
    joined = b"".join(bytes_fastq_record)
    assert str(record) == joined.decode()
    assert bytes(record) == joined


def test_bytes_fastq_record_quality_score_parsing(bytes_fastq_record):
    record = fastq.Record(bytes_fastq_record)
    assert record.average_quality() == 18


def test_printing_string_record_generates_valid_fastq_record(string_fastq_record):
    record = fastq.StrRecord(string_fastq_record)
    joined = "".join(string_fastq_record)
    assert str(record) == joined
    assert bytes(record) == joined.encode()


def test_string_fastq_record_quality_score_parsing(string_fastq_record):
    record = fastq.StrRecord(string_fastq_record)
    assert record.average_quality() == 18


# TEST RECORD


def test_fields_populate_properly(reader_all_compressions):
    """Every record's four fields conform to the fastq format, in str or bytes."""
    encoder = _map_encoder[reader_all_compressions._mode]
    name_prefix = encoder("@")
    alphabet = set(encoder("ACGTN"))
    name2_string = encoder("+\n")
    ascii_chars = set(encoder(string.printable))
    for record in reader_all_compressions:
        assert record.name.startswith(name_prefix)
        assert all(base in alphabet for base in record.sequence.strip())
        assert record.name2 == name2_string
        assert all(char in ascii_chars for char in record.quality.strip())
# TEST BarcodeGeneratorWithCorrectedCellbarcodes


@pytest.fixture(scope="function")
def embedded_barcode_generator():
    """Generator yielding cell (0:16) and molecule (16:26) barcodes from test_r1."""
    cell_barcode = fastq.EmbeddedBarcode(
        start=0,
        end=16,
        quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
    )
    molecule_barcode = fastq.EmbeddedBarcode(
        start=16,
        end=26,
        quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
    )
    return fastq.EmbeddedBarcodeGenerator(
        data_dir + "test_r1.fastq.gz", [cell_barcode, molecule_barcode]
    )


@pytest.fixture(scope="function")
def barcode_generator_with_corrected_cell_barcodes():
    """Same as embedded_barcode_generator, but correcting cell barcodes
    against the 1k-august-2016 whitelist."""
    cell_barcode = fastq.EmbeddedBarcode(
        start=0,
        end=16,
        quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY,
    )
    molecule_barcode = fastq.EmbeddedBarcode(
        start=16,
        end=26,
        quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
        sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY,
    )
    return fastq.BarcodeGeneratorWithCorrectedCellBarcodes(
        data_dir + "test_r1.fastq.gz",
        cell_barcode,
        data_dir + "1k-august-2016.txt",
        [molecule_barcode],
    )


def test_embedded_barcode_generator_produces_outputs_of_expected_size(
    embedded_barcode_generator,
):
    """Each emitted tag tuple carries the configured barcode length and type."""
    for cell_seq, cell_qual, umi_seq, umi_qual in embedded_barcode_generator:

        # correct values
        correct_cell_barcode_length = 16
        correct_umi_length = 10

        # note that all barcodes are strings and therefore should get 'Z' values

        # test cell tags
        assert cell_seq[0] == consts.RAW_CELL_BARCODE_TAG_KEY
        assert len(cell_seq[1]) == correct_cell_barcode_length
        assert all(v in "ACGTN" for v in cell_seq[1])
        assert cell_seq[2] == "Z"
        assert cell_qual[0] == consts.QUALITY_CELL_BARCODE_TAG_KEY
        assert len(cell_qual[1]) == correct_cell_barcode_length
        assert all(v in string.printable for v in cell_qual[1])
        # fixed copy-paste bug: this previously re-checked cell_seq[2],
        # leaving the quality tag's value type unverified
        assert cell_qual[2] == "Z"

        # test umi tags
        assert umi_seq[0] == consts.RAW_MOLECULE_BARCODE_TAG_KEY
        assert len(umi_seq[1]) == correct_umi_length
        assert all(v in "ACGTN" for v in umi_seq[1])
        assert umi_seq[2] == "Z"
        assert umi_qual[0] == consts.QUALITY_MOLECULE_BARCODE_TAG_KEY
        assert len(umi_qual[1]) == correct_umi_length
        assert all(v in string.printable for v in umi_qual[1])
        # fixed copy-paste bug: previously re-checked umi_seq[2]
        assert umi_qual[2] == "Z"

        break  # just the first tag is fine


def test_corrects_barcodes(barcode_generator_with_corrected_cell_barcodes):
    """At least one emitted tag set carries the corrected cell barcode key."""
    success = False
    for barcode_sets in barcode_generator_with_corrected_cell_barcodes:
        for barcode_set in barcode_sets:
            if barcode_set[0] == consts.CELL_BARCODE_TAG_KEY:
                success = True
                break
    assert success
import os
from itertools import chain

import pytest

from .. import gtf

_data_dir = os.path.split(__file__)[0] + "/data"
_files = [
    f"{_data_dir}/{name}" for name in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")
]


@pytest.fixture(scope="module", params=_files)
def files(request):
    """Return a single gtf filename (plain, gzip, or bz2 compressed)."""
    return request.param


def test_opens_file_reads_first_line(files):
    reader = gtf.Reader(files, "r", header_comment_char="#")
    first_record = next(iter(reader))
    assert isinstance(first_record, gtf.GTFRecord)


def test_opens_file_populates_fields_properly(files):
    """The first record's fixed fields and attributes match the known test data."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))

    assert record.seqname == "chr19"
    assert record.chromosome == "chr19"
    assert record.source == "HAVANA"
    assert record.feature == "gene"
    assert record.start == 60951
    assert record.end == 71626
    assert record.score == "."
    assert record.strand == "-"
    assert record.frame == "."

    expected_features = {
        "gene_id": "ENSG00000282458.1",
        "gene_type": "transcribed_processed_pseudogene",
        "gene_status": "KNOWN",
        "gene_name": "WASH5P",
        "level": "2",
        "havana_gene": "OTTHUMG00000180466.8",
    }
    assert record._attributes == expected_features

    # every attribute key and value must appear in the serialized record
    record_string = str(record)
    assert all(
        item in record_string
        for item in chain(expected_features.keys(), expected_features.values())
    )
def test_set_attribute_verify_included_in_output_string(files):
    """A newly set attribute round-trips through get_attribute and str()."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))

    record.set_attribute("test_attr", "foo")
    assert record.get_attribute("test_attr") == "foo"
    # the new value must also appear in the serialized record
    assert "foo" in str(record)


def test_opens_file_parses_size(files):
    """size is end - start; a record with swapped coordinates raises ValueError."""
    reader = gtf.Reader(files, "r", header_comment_char="#")
    record = next(iter(reader))
    assert record.size == 71626 - 60951

    # mangle the record by swapping start and end, then confirm size raises
    record._fields[3:5] = [record.end, record.start]
    with pytest.raises(ValueError):
        getattr(record, "size")
data_dir = os.path.split(__file__)[0] + "/data/"


def test_attach_barcodes():
    """High-level test of the AttachBarcodes command.

    Runs the CLI entry point and checks that each read in the output BAM
    carries sample/cell/molecule barcode tags of the configured lengths.
    """
    temp_dir_name = tempfile.mkdtemp()

    # Construct cli arguments to pass to the command.
    # os.path.join keeps the output INSIDE the temp directory; the previous
    # string concatenation (mkdtemp returns no trailing separator) dropped the
    # file next to the directory instead.
    temp_output_bam = os.path.join(temp_dir_name, "output.bam")

    args = [
        "--r1",
        data_dir + "test_r1.fastq",
        "--u2",
        data_dir + "test_r2.bam",
        "--i1",
        # NOTE(review): the checked-in test data contains test_i7.fastq, not
        # test_i1.fastq -- confirm this fixture file actually exists
        data_dir + "test_i1.fastq",
        "--o",
        temp_output_bam,
        "--sample-barcode-start-pos",
        "0",
        "--sample-barcode-length",
        "8",
        "--cell-barcode-start-pos",
        "0",
        "--cell-barcode-length",
        "16",
        "--molecule-barcode-start-pos",
        "16",
        "--molecule-barcode-length",
        "4",
    ]

    platform.BarcodePlatform.attach_barcodes(args)

    with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile:
        for read in samfile:
            # each attached tag must have the configured barcode length
            assert len(read.get_tag("CR")) == 16  # raw cell barcode
            assert len(read.get_tag("CY")) == 16  # cell barcode quality
            assert len(read.get_tag("UR")) == 4  # raw molecule barcode
            assert len(read.get_tag("UY")) == 4  # molecule barcode quality
            assert len(read.get_tag("SR")) == 8  # raw sample barcode
            assert len(read.get_tag("SY")) == 8  # sample barcode quality
def test_concentrated_data_produces_entropy_0():
    """All probability mass on a single base yields zero entropy."""
    assert stats.base4_entropy([1, 0, 0, 0], axis=0) == 0


def test_concentrated_unnormalized_data_produces_entropy_0():
    """Scaling a concentrated distribution must not change its (zero) entropy."""
    assert stats.base4_entropy([1000, 0, 0, 0], axis=0) == 0


def test_balanced_data_produces_entropy_1():
    """A uniform distribution over four bases yields maximal base-4 entropy."""
    assert stats.base4_entropy([0.25, 0.25, 0.25, 0.25], axis=0) == 1


def test_balanced_unnormalized_data_produces_entropy_1():
    """Scaling a uniform distribution must not change its (maximal) entropy."""
    assert stats.base4_entropy([20, 20, 20, 20], axis=0) == 1