├── .gitattributes ├── python ├── rnaseqc │ ├── __init__.py │ ├── __main__.py │ ├── create_notebook.py │ ├── run.py │ ├── legacy_exon_remap.py │ ├── nb_encode.py │ ├── insert_size_intervals.py │ ├── aggregate.py │ ├── report.py │ └── plot.py ├── setup.py └── README.md ├── .gitmodules ├── test_data ├── chr1.output │ ├── chr1.bam.gene_tpm.gct.gz │ ├── chr1.bam.exon_reads.gct.gz │ ├── chr1.bam.gene_reads.gct.gz │ ├── chr1.bam.gene_fragments.gct.gz │ ├── chr1.cram.gc_content.tsv │ ├── chr1.bam.metrics.tsv │ └── chr1.cram.metrics.tsv ├── legacy.output │ ├── legacy.exon_reads.gct.gz │ ├── legacy.gene_reads.gct.gz │ ├── downsampled.bam.gene_tpm.gct.gz │ ├── downsampled.bam.exon_reads.gct.gz │ ├── downsampled.bam.gene_reads.gct.gz │ ├── downsampled.bam.gene_fragments.gct.gz │ ├── downsampled.bam.metrics.tsv │ └── downsampled.bam.fragmentSizes.txt ├── downsampled.output │ ├── downsampled.bam.gene_tpm.gct.gz │ ├── downsampled.bam.exon_reads.gct.gz │ ├── downsampled.bam.gene_reads.gct.gz │ ├── downsampled.bam.gene_fragments.gct.gz │ ├── downsampled.bam.metrics.tsv │ └── downsampled.bam.fragmentSizes.txt ├── single_pair.output │ ├── single_pair.bam.gene_tpm.gct.gz │ ├── single_pair.bam.exon_reads.gct.gz │ ├── single_pair.bam.gene_reads.gct.gz │ ├── single_pair.bam.gene_fragments.gct.gz │ └── single_pair.bam.metrics.tsv ├── approx_diff.py ├── Makefile.osx └── Makefile.linux ├── .gitignore ├── src ├── BED.h ├── BamReader.cpp ├── BED.cpp ├── Expression.h ├── GTF.h ├── Fasta.h ├── BamReader.h ├── Fasta.cpp ├── Metrics.h ├── GTF.cpp └── Metrics.cpp ├── THIRD-PARTY-LICENSES.md ├── cloudbuild.yaml ├── LICENSE ├── Dockerfile ├── .github └── workflows │ ├── CI.yml │ └── Deployment.yml ├── Metrics.md ├── Makefile └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/rnaseqc/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '0.0.3' 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "SeqLib"] 2 | path = SeqLib 3 | url = https://github.com/walaj/SeqLib.git 4 | -------------------------------------------------------------------------------- /test_data/chr1.output/chr1.bam.gene_tpm.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_tpm.gct.gz -------------------------------------------------------------------------------- /test_data/chr1.output/chr1.bam.exon_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.exon_reads.gct.gz -------------------------------------------------------------------------------- /test_data/chr1.output/chr1.bam.gene_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_reads.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/legacy.exon_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/legacy.exon_reads.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/legacy.gene_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/legacy.gene_reads.gct.gz 
-------------------------------------------------------------------------------- /test_data/chr1.output/chr1.bam.gene_fragments.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_fragments.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/downsampled.bam.exon_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.exon_reads.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/downsampled.bam.gene_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_reads.gct.gz -------------------------------------------------------------------------------- /test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz -------------------------------------------------------------------------------- /test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz -------------------------------------------------------------------------------- /test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz -------------------------------------------------------------------------------- /test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz -------------------------------------------------------------------------------- /test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz -------------------------------------------------------------------------------- /test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz -------------------------------------------------------------------------------- /test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz -------------------------------------------------------------------------------- /test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz -------------------------------------------------------------------------------- /test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.xcodeproj/ 2 | __pycache__/ 3 | src/*.o 4 | rnaseqc 5 | !python/rnaseqc 6 | rnaseqc.a 7 | build/ 8 | *.egg-info/ 9 | *.bam 10 | *.cram 11 | *.gtf 12 | *.fasta 13 | *.fa 14 | *.fai 15 | test_data/test_inputs.tar.gz 16 | -------------------------------------------------------------------------------- /src/BED.h: -------------------------------------------------------------------------------- 1 | // 2 | // BED.hpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 7/11/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 
7 | // 8 | 9 | #ifndef BED_h 10 | #define BED_h 11 | 12 | #include "GTF.h" 13 | 14 | namespace rnaseqc { 15 | struct bedException : public std::exception { 16 | std::string error; 17 | bedException(std::string msg) : error(msg) {}; 18 | }; 19 | 20 | std::ifstream& extractBED(std::ifstream&, Feature&); 21 | } 22 | #endif /* BED_h */ 23 | -------------------------------------------------------------------------------- /THIRD-PARTY-LICENSES.md: -------------------------------------------------------------------------------- 1 | RNA-SeQC third-party code notice: 2 | 3 | In addition, RNA-SeQC is distributed, in part, under and subject to the licenses for: 4 | [SeqLib](https://github.com/walaj/SeqLib) - Copyright © 2016 Jeremiah A. Wala. All Rights Reserved. 5 | [Apache 2.0 License](https://github.com/walaj/SeqLib/blob/master/LICENSE) (as of 7e1f982). 6 | 7 | [Args](https://github.com/Taywee/args/) - Copyright © 2016 – 2017 Taylor C. Richberger and Pavel Belikov. All Rights Reserved. 8 | [MIT License](https://github.com/Taywee/args/blob/master/LICENSE) (as of 7bf17000). 9 | 10 | [BioIO](https://github.com/dancooke/bioio/) - Copyright © 2017 Daniel Cooke. All Rights Reserved. 11 | [MIT License](https://github.com/dancooke/bioio/blob/master/LICENSE) (as of 99978e1). 12 | -------------------------------------------------------------------------------- /src/BamReader.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // BamReader.cpp 3 | // RNA-SeQC 4 | // 5 | // Created by Aaron Graubert on 10/3/18. 6 | // Copyright © 2018 Aaron Graubert. All rights reserved.
7 | // 8 | 9 | #include "BamReader.h" 10 | 11 | namespace rnaseqc { 12 | bool SeqlibReader::next(SeqLib::BamRecord &read) 13 | { 14 | // Must uncomment before adding multithreading 15 | // std::lock_guard guard(*this); 16 | try { 17 | bool ok = this->bam.GetNextRecord(read); 18 | if (ok) this->read_count++; 19 | return ok; 20 | } 21 | catch (std::runtime_error &e) { 22 | if (this->user_cram_reference) throw referenceHTSMismatch(std::string("HTSLib was unable to find a suitable reference while decoding a cram: ")+e.what()); 23 | throw; 24 | } 25 | return false; // No way to get here 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from setuptools import setup, find_packages 4 | with open("rnaseqc/__init__.py") as reader: 5 | __version__ = re.search( 6 | r'__version__ ?= ?[\'\"]([\w.]+)[\'\"]', 7 | reader.read() 8 | ).group(1) 9 | with open(os.path.join(os.path.dirname(__file__), 'README.md')) as r: 10 | long_description = r.read() 11 | 12 | # Setup information 13 | setup( 14 | name = 'rnaseqc', 15 | version = __version__, 16 | packages = find_packages(), 17 | description = 'Multi-sample visualization of metrics from RNA-SeQC', 18 | long_description = long_description, 19 | long_description_content_type='text/markdown', 20 | install_requires = [ 21 | 'numpy', 22 | 'pandas', 23 | 'matplotlib', 24 | 'seaborn', 25 | 'qtl', 26 | 'agutil', 27 | 'nbformat' 28 | ], 29 | classifiers = [ 30 | "Programming Language :: Python :: 3", 31 | "Intended Audience :: Science/Research", 32 | "Topic :: Scientific/Engineering :: Bio-Informatics", 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /test_data/chr1.output/chr1.cram.gc_content.tsv: -------------------------------------------------------------------------------- 1 | Content Bin Count 2 | 0 0 3 
| 0.01 0 4 | 0.02 0 5 | 0.03 0 6 | 0.04 0 7 | 0.05 0 8 | 0.06 0 9 | 0.07 0 10 | 0.08 0 11 | 0.09 0 12 | 0.1 0 13 | 0.11 0 14 | 0.12 0 15 | 0.13 0 16 | 0.14 2 17 | 0.15 0 18 | 0.16 1 19 | 0.17 1 20 | 0.18 6 21 | 0.19 14 22 | 0.2 17 23 | 0.21 34 24 | 0.22 128 25 | 0.23 225 26 | 0.24 531 27 | 0.25 1027 28 | 0.26 1816 29 | 0.27 2880 30 | 0.28 4199 31 | 0.29 5730 32 | 0.3 6819 33 | 0.31 7925 34 | 0.32 8547 35 | 0.33 9823 36 | 0.34 10260 37 | 0.35 10631 38 | 0.36 11507 39 | 0.37 11568 40 | 0.38 11731 41 | 0.39 11664 42 | 0.4 12154 43 | 0.41 11357 44 | 0.42 11028 45 | 0.43 10721 46 | 0.44 10951 47 | 0.45 10661 48 | 0.46 10776 49 | 0.47 11125 50 | 0.48 11119 51 | 0.49 10503 52 | 0.5 10200 53 | 0.51 10029 54 | 0.52 9791 55 | 0.53 9357 56 | 0.54 9196 57 | 0.55 8883 58 | 0.56 8474 59 | 0.57 7835 60 | 0.58 6694 61 | 0.59 5991 62 | 0.6 5467 63 | 0.61 5127 64 | 0.62 4517 65 | 0.63 4205 66 | 0.64 3741 67 | 0.65 3197 68 | 0.66 2617 69 | 0.67 2049 70 | 0.68 1596 71 | 0.69 1198 72 | 0.7 959 73 | 0.71 727 74 | 0.72 498 75 | 0.73 335 76 | 0.74 238 77 | 0.75 157 78 | 0.76 61 79 | 0.77 38 80 | 0.78 8 81 | 0.79 7 82 | 0.8 1 83 | 0.81 1 84 | 0.82 1 85 | 0.83 0 86 | 0.84 0 87 | 0.85 0 88 | 0.86 0 89 | 0.87 0 90 | 0.88 0 91 | 0.89 0 92 | 0.9 0 93 | 0.91 0 94 | 0.92 0 95 | 0.93 0 96 | 0.94 0 97 | 0.95 0 98 | 0.96 0 99 | 0.97 0 100 | 0.98 0 101 | 0.99 0 102 | -------------------------------------------------------------------------------- /src/BED.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // BED.cpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 7/11/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 
7 | // 8 | 9 | #include "BED.h" 10 | #include 11 | #include 12 | #include 13 | 14 | using std::ifstream; 15 | using std::string; 16 | 17 | namespace rnaseqc { 18 | ifstream& extractBED(ifstream &input, Feature &out) 19 | { 20 | try 21 | { 22 | string line; 23 | while(getline(input, line)) 24 | { 25 | if(line[0] == '#') continue; //Do beds even have comment lines? 26 | std::istringstream tokenizer(line); 27 | string buffer; 28 | tokenizer >> buffer; //chromosome name 29 | out.chromosome = chromosomeMap(buffer); 30 | tokenizer >> buffer; //start 31 | out.start = std::stoull(buffer) + 1; 32 | tokenizer >> buffer; //stop 33 | out.end = std::stoull(buffer) + 1; 34 | out.feature_id = line; // add a dummy exon_id for mapping interval intersections later 35 | out.type = FeatureType::Exon; 36 | break; 37 | } 38 | } 39 | catch (std::exception &e) 40 | { 41 | throw bedException(std::string("Encountered an unknown error while parsing the BED: ") + e.what()); 42 | } 43 | return input; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: gcr.io/cloud-builders/docker 3 | args: 4 | - pull 5 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest 6 | - name: gcr.io/cloud-builders/docker 7 | args: 8 | - build 9 | - -t 10 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA 11 | - --cache-from 12 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest 13 | - . 
14 | timeout: 900s 15 | - name: gcr.io/cloud-builders/docker 16 | args: 17 | - push 18 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA 19 | - name: gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA 20 | args: 21 | - bash 22 | - -c 23 | - > 24 | apt-get update && 25 | apt-get install git wget -y && 26 | git clone https://github.com/getzlab/rnaseqc.git && 27 | mv rnaseqc/test_data /opt/rnaseqc && 28 | cd /opt/rnaseqc/test_data && 29 | wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz && 30 | tar xzf test_inputs.tar.gz && 31 | cd .. && 32 | make && make -f test_data/Makefile.linux test 33 | timeout: 900s 34 | - name: gcr.io/cloud-builders/docker 35 | args: 36 | - tag 37 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA 38 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest 39 | - name: gcr.io/cloud-builders/docker 40 | args: 41 | - push 42 | - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest 43 | images: 44 | - gcr.io/broad-cga-aarong-gtex/rnaseqc 45 | timeout: 1800s 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | RNA-SeQC is licensed under the following BSD 3-clause license: 2 | 3 | Copyright © 2018 The Broad Institute, Inc. and The General Hospital Corporation. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /src/Expression.h: -------------------------------------------------------------------------------- 1 | // 2 | // Expression.hpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 8/2/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved.
7 | // 8 | 9 | #ifndef Expression_h 10 | #define Expression_h 11 | 12 | #include "Metrics.h" 13 | #include "BamReader.h" 14 | #include 15 | #include 16 | 17 | namespace rnaseqc { 18 | //Utility functions 19 | unsigned int extractBlocks(Alignment&, std::vector&, chrom, bool); 20 | //unsigned int legacyExtractBlocks(BamTools::BamAlignment&, std::vector&, chrom); 21 | std::list* intersectBlock(Feature&, std::list&); 22 | void trimFeatures(Alignment&, std::list&); 23 | void trimFeatures(Alignment&, std::list&, BaseCoverage&); 24 | void dropFeatures(std::list&, BaseCoverage&); 25 | 26 | // Definitions for fragment tracking 27 | typedef std::tuple FragmentMateEntry; // Used to record mate end point (exon name, read end position) 28 | const std::size_t EXON = 0, ENDPOS = 1; 29 | 30 | //Metrics functions 31 | void fragmentSizeMetrics(unsigned int&, std::map>*, std::map&, std::map&,std::vector&, Alignment&, SeqLib::HeaderSequenceVector&); 32 | 33 | double exonAlignmentMetrics(std::map>&, Metrics&, std::vector&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool, std::map&, Fasta&); 34 | 35 | void legacyExonAlignmentMetrics(unsigned int, std::map>&, Metrics&, std::vector&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool); 36 | 37 | Strand feature_strand(Alignment&, Strand); 38 | } 39 | 40 | #endif /* Expression_h */ 41 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # RNA-SeQC Python utilities 2 | 3 | This module contains utility code for RNA-SeQC 4 | 5 | ## Installing 6 | 7 | * From pip: `pip install rnaseqc` 8 | * From the git repo: `pip install -e python` (Invoke from root of git repo) 9 | 10 | ## Usage 11 | 12 | This does not install a console entrypoint. 
You can invoke the utilities in one of three ways: 13 | 14 | * From the main module: `python3 -m rnaseqc ...` 15 | * Calling the target module: `python3 -m rnaseqc.example ...` 16 | * Calling scripts directly: `python3 python/rnaseqc/example.py` 17 | 18 | ## Utilities 19 | 20 | The `rnaseqc` module contains 5 main utilities. To get more help with each utility, 21 | invoke the utility with the `-h` or `--help` option. 22 | 23 | ### Aggregation 24 | 25 | Aggregates RNA-SeQC outputs from multiple samples 26 | 27 | ``` 28 | python3 -m rnaseqc aggregate [-h] [--parquet] [-o OUTPUT_DIR] results_dir prefix 29 | ``` 30 | 31 | ### Jupyter Notebooks 32 | 33 | Creates a Jupyter notebook with several figures for comparing samples 34 | 35 | ``` 36 | python3 -m rnaseqc notebook [-h] [-t TPM] [-i INSERT_SIZE] [-c COHORT] [-d DATE] metrics output 37 | ``` 38 | 39 | ### Figures 40 | 41 | Generates figures from an aggregated RNA-SeQC metrics table 42 | 43 | ``` 44 | python3 -m rnaseqc report [-h] [--tpm TPM] [--insert-size INSERT_SIZE] [--cohort COHORT] [--output-dir OUTPUT_DIR] [--dpi DPI] metrics prefix 45 | ``` 46 | 47 | ### Insert Size distributions 48 | 49 | Generates a BED file with intervals used by RNA-SeQC for estimating a sample's insert size distribution 50 | 51 | ``` 52 | python3 -m rnaseqc insert-size [-h] [--min-length MIN_LENGTH] [--min-mappability MIN_MAPPABILITY] [--output-dir OUTPUT_DIR] gtf_path mappability_bigwig prefix 53 | ``` 54 | 55 | ### Exon remapping 56 | 57 | Converts exon names in an `*.exon_reads.gct` file from RNA-SeQC 2.X.X to match names 58 | as reported by RNA-SeQC 1.1.9 59 | 60 | ``` 61 | python3 -m rnaseqc legacy-exons gct gtf 62 | ``` 63 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for RNASeQC 2 | FROM ubuntu:20.04 3 | MAINTAINER Aaron Graubert 4 | 5 | RUN apt-get update && apt-get install -y
software-properties-common && \ 6 | apt-get update && apt-get install -y \ 7 | build-essential \ 8 | cmake \ 9 | git \ 10 | python3 \ 11 | python3-pip \ 12 | libboost-filesystem-dev \ 13 | libboost-regex-dev \ 14 | libboost-system-dev \ 15 | libbz2-dev \ 16 | libcurl3-dev \ 17 | liblzma-dev \ 18 | libpthread-stubs0-dev \ 19 | wget \ 20 | zlib1g-dev \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | # Python 24 | RUN python3 -m pip install --upgrade pip setuptools pyarrow jupyter 25 | 26 | # SeqLib 27 | COPY Makefile /opt/rnaseqc/Makefile 28 | RUN cd /opt/rnaseqc && git clone --recursive https://github.com/walaj/SeqLib.git && \ 29 | cd SeqLib && git checkout 7e1f98267b5057f9505dbff119308137a0e006db && cd .. && \ 30 | make SeqLib/lib/libseqlib.a 31 | 32 | # python 33 | RUN cd /opt && git clone https://github.com/francois-a/rnaseq-utils rnaseq && cd rnaseq && \ 34 | git checkout f1c6a5677bbca465ea1edd06c2293a5d1078a18b && python3 -m pip install --upgrade pip setuptools && \ 35 | python3 -m pip install numpy && python3 -m pip install pandas matplotlib scipy pyBigWig bx-python \ 36 | agutil nbformat seaborn sklearn qtl && mkdir -p /root/.config/matplotlib && echo "backend : Agg" > /root/.config/matplotlib/matplotlibrc 37 | ENV PYTHONPATH $PYTHONPATH:/opt/ 38 | 39 | #RNASeQC 40 | COPY src /opt/rnaseqc/src 41 | COPY python /opt/rnaseqc/python 42 | COPY args.hxx /opt/rnaseqc 43 | COPY bioio.hpp /opt/rnaseqc 44 | RUN cd /opt/rnaseqc && make && ln -s /opt/rnaseqc/rnaseqc /usr/local/bin/rnaseqc && make clean && python3 -m pip install -e /opt/rnaseqc/python 45 | 46 | # clean up 47 | RUN apt-get clean && \ 48 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 49 | apt-get autoclean && \ 50 | apt-get autoremove -y && \ 51 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 52 | -------------------------------------------------------------------------------- /src/GTF.h: -------------------------------------------------------------------------------- 1 | // 2 | // GTF.hpp 3 | // IntervalTree 4 
| // 5 | // Created by Aaron Graubert on 6/28/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 7 | // 8 | 9 | #ifndef GTF_h 10 | #define GTF_h 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "Fasta.h" 20 | 21 | namespace rnaseqc { 22 | struct gtfException : public std::exception { 23 | std::string error; 24 | gtfException(std::string msg) : error(msg) {}; 25 | }; 26 | 27 | enum FeatureType {Gene, Transcript, Exon, Other}; 28 | 29 | struct Feature { 30 | //Represents arbitrary genome features 31 | coord start, end; 32 | chrom chromosome; 33 | Strand strand; 34 | FeatureType type; 35 | std::string feature_id, gene_id, transcript_type; 36 | bool ribosomal; 37 | }; 38 | 39 | //For comparing features 40 | bool operator==(const Feature &a, const Feature &b); 41 | bool compIntervalStart(const Feature&, const Feature&); 42 | bool compIntervalEnd(const Feature&, const Feature&); 43 | bool intersectPoint(const Feature&, const coord); 44 | bool intersectInterval(const Feature&, const Feature&); 45 | int partialIntersect(const Feature&, const Feature&); 46 | 47 | struct FeatureSpan { 48 | chrom chromosome; 49 | coord start, length; 50 | }; 51 | 52 | 53 | extern std::map geneNames, geneSeqs; 54 | extern std::map geneLengths, geneCodingLengths; 55 | extern std::map exonLengths; 56 | extern std::vector geneList, exonList; 57 | extern std::map> exonsForGene; 58 | 59 | std::ifstream& operator>>(std::ifstream&, Feature&); 60 | std::map& parseAttributes(std::string&, std::map&); 61 | } 62 | 63 | #endif /* GTF_h */ 64 | -------------------------------------------------------------------------------- /test_data/single_pair.output/single_pair.bam.metrics.tsv: -------------------------------------------------------------------------------- 1 | Sample single_pair.bam 2 | Mapping Rate 1 3 | Unique Rate of Mapped 1 4 | Duplicate Rate of Mapped 0 5 | Duplicate Rate of Mapped, excluding Globins 0 6 | Base 
Mismatch 0 7 | End 1 Mapping Rate 1 8 | End 2 Mapping Rate 1 9 | End 1 Mismatch Rate 0 10 | End 2 Mismatch Rate 0 11 | Expression Profiling Efficiency 1 12 | High Quality Rate 1 13 | Exonic Rate 1 14 | Intronic Rate 0 15 | Intergenic Rate 0 16 | Intragenic Rate 1 17 | Ambiguous Alignment Rate 0 18 | High Quality Exonic Rate 1 19 | High Quality Intronic Rate 0 20 | High Quality Intergenic Rate 0 21 | High Quality Intragenic Rate 1 22 | High Quality Ambiguous Alignment Rate 0 23 | Discard Rate 0 24 | rRNA Rate 0 25 | End 1 Sense Rate 1 26 | End 2 Sense Rate 0 27 | Avg. Splits per Read 0 28 | Alternative Alignments 0 29 | Chimeric Fragments 0 30 | Chimeric Alignment Rate 0 31 | Duplicate Reads 0 32 | End 1 Antisense 0 33 | End 2 Antisense 1 34 | End 1 Bases 76 35 | End 2 Bases 76 36 | End 1 Mapped Reads 1 37 | End 2 Mapped Reads 1 38 | End 1 Mismatches 0 39 | End 2 Mismatches 0 40 | End 1 Sense 1 41 | End 2 Sense 0 42 | Exonic Reads 2 43 | Failed Vendor QC 0 44 | High Quality Reads 2 45 | Intergenic Reads 0 46 | Intragenic Reads 2 47 | Ambiguous Reads 0 48 | Intronic Reads 0 49 | Low Mapping Quality 0 50 | Low Quality Reads 0 51 | Mapped Duplicate Reads 0 52 | Mapped Reads 2 53 | Mapped Unique Reads 2 54 | Mismatched Bases 0 55 | Non-Globin Reads 2 56 | Non-Globin Duplicate Reads 0 57 | Reads used for Intron/Exon counts 2 58 | rRNA Reads 0 59 | Total Bases 152 60 | Total Mapped Pairs 1 61 | Total Reads 2 62 | Unique Mapping, Vendor QC Passed Reads 2 63 | Unpaired Reads 0 64 | Read Length 76 65 | Genes Detected 0 66 | Estimated Library Complexity 0 67 | Genes used in 3' bias 0 68 | Mean 3' bias 0 69 | Median 3' bias 0 70 | 3' bias Std 0 71 | 3' bias MAD_Std 0 72 | 3' Bias, 25th Percentile 0 73 | 3' Bias, 75th Percentile 0 74 | Median of Avg Transcript Coverage 0 75 | Median of Transcript Coverage Std 0 76 | Median of Transcript Coverage CV 0 77 | Median Exon CV nan 78 | Exon CV MAD nan 79 | 
-------------------------------------------------------------------------------- /test_data/chr1.output/chr1.bam.metrics.tsv: -------------------------------------------------------------------------------- 1 | Sample chr1.bam 2 | Mapping Rate 1 3 | Unique Rate of Mapped 1 4 | Duplicate Rate of Mapped 0 5 | Duplicate Rate of Mapped, excluding Globins 0 6 | Base Mismatch 0.00968147 7 | End 1 Mapping Rate 1.01474 8 | End 2 Mapping Rate 0.985262 9 | End 1 Mismatch Rate 0.00253608 10 | End 2 Mismatch Rate 0.0170406 11 | Expression Profiling Efficiency 0.807719 12 | High Quality Rate 0.884446 13 | Exonic Rate 0.807719 14 | Intronic Rate 0.131935 15 | Intergenic Rate 0.0274077 16 | Intragenic Rate 0.939654 17 | Ambiguous Alignment Rate 0.0329382 18 | High Quality Exonic Rate 0.835902 19 | High Quality Intronic Rate 0.108092 20 | High Quality Intergenic Rate 0.0240545 21 | High Quality Intragenic Rate 0.943994 22 | High Quality Ambiguous Alignment Rate 0.0319513 23 | Discard Rate 0 24 | rRNA Rate 4.18304e-06 25 | End 1 Sense Rate 0.495471 26 | End 2 Sense Rate 0.503206 27 | Avg. 
Splits per Read 0.261769 28 | Alternative Alignments 229158 29 | Chimeric Fragments 0 30 | Chimeric Alignment Rate 0 31 | Duplicate Reads 0 32 | End 1 Antisense 498854 33 | End 2 Antisense 477326 34 | End 1 Bases 82963728 35 | End 2 Bases 80553844 36 | End 1 Mapped Reads 1091628 37 | End 2 Mapped Reads 1059919 38 | End 1 Mismatches 210403 39 | End 2 Mismatches 1372688 40 | End 1 Sense 489897 41 | End 2 Sense 483486 42 | Exonic Reads 1737846 43 | Failed Vendor QC 122510 44 | High Quality Reads 1902928 45 | Intergenic Reads 58969 46 | Intragenic Reads 2021710 47 | Ambiguous Reads 70868 48 | Intronic Reads 283864 49 | Low Mapping Quality 186732 50 | Low Quality Reads 248619 51 | Mapped Duplicate Reads 0 52 | Mapped Reads 2151547 53 | Mapped Unique Reads 2151547 54 | Mismatched Bases 1583091 55 | Non-Globin Reads 2151547 56 | Non-Globin Duplicate Reads 0 57 | Reads used for Intron/Exon counts 2151547 58 | rRNA Reads 9 59 | Total Bases 163517572 60 | Total Mapped Pairs 1052544 61 | Total Reads 2503215 62 | Unique Mapping, Vendor QC Passed Reads 2151547 63 | Unpaired Reads 0 64 | Read Length 76 65 | Genes Detected 1842 66 | Estimated Library Complexity 0 67 | Genes used in 3' bias 236 68 | Mean 3' bias 0.511031 69 | Median 3' bias 0.5 70 | 3' bias Std 0.293274 71 | 3' bias MAD_Std 0.353802 72 | 3' Bias, 25th Percentile 0.266282 73 | 3' Bias, 75th Percentile 0.75 74 | Median of Avg Transcript Coverage 0.629976 75 | Median of Transcript Coverage Std 0.905514 76 | Median of Transcript Coverage CV 1.06256 77 | Median Exon CV 0.313763 78 | Exon CV MAD 0.308991 79 | -------------------------------------------------------------------------------- /src/Fasta.h: -------------------------------------------------------------------------------- 1 | // 2 | // Fasta.h 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 5/23/18. 6 | // Copyright © 2018 Aaron Graubert. All rights reserved. 
//

#ifndef Fasta_h
#define Fasta_h

// NOTE(review): the header names of the eight #include directives below are
// missing in this copy of the file (the <...> part appears to have been
// stripped). Restore them from version control before building.
#include
#include
#include
#include
#include
#include
#include
#include

namespace rnaseqc {
    // Exception that carries its message in the public member `error`.
    // By its name it is meant for file-level failures (confirm at the throw sites).
    // NOTE(review): what() is not overridden, so the message is only reachable
    // through `error`, not through std::exception::what().
    struct fileException : public std::exception {
        std::string error;
        fileException(std::string msg) : error(msg) {};
    };

    // Exception that carries its message in the public member `error`.
    // By its name it signals an unknown/unsupported contig (confirm at the throw sites).
    // NOTE(review): what() is not overridden here either.
    struct invalidContigException : public std::exception {
        std::string error;
        invalidContigException(std::string msg) : error(msg) {};
    };

    typedef long long coord;         // Coordinate type used by getSeq/readSeq/pageForCoord
    typedef unsigned long indexType; // Page index type used by the cache (see pageForCoord/pageOffset)
    typedef unsigned short chrom;    // Numeric chromosome/contig identifier (see chromosomeMap)

    static const double PAGE_SIZE = 1e6; // Size of each cache page (in bases)
    static const unsigned short CACHE_SIZE = 10u; // How many pages are stored in the cache

    // Declared here, defined in another translation unit.
    // NOTE(review): the template arguments of std::map are missing in this copy;
    // presumably it relates contig names to `chrom` ids. TODO confirm.
    extern std::map chromosomes;

    enum Strand {Forward, Reverse, Unknown};
    // Converts a contig name to its `chrom` id (definition not in this header).
    chrom chromosomeMap(std::string);

    class Fasta {
        // Represents an entire fasta file
        // Uses the bioio library for quickly retrieving sequences
        // Uses an internal LRU cache to minimize required reading
        bool _open;
        std::ifstream reader;
        // NOTE(review): template arguments of the three containers below are
        // missing in this copy. From the member function signatures they
        // presumably hold cached pages, the LRU order of page indices and a
        // per-contig index. TODO confirm against the original header.
        std::unordered_map pageCache;
        std::list lru;
        std::unordered_map contigIndex;
        void updateLRU(indexType);
        indexType pageForContig(chrom);
        std::string readSeq(chrom, coord);
        // Counters; by their names cache lookups and cache misses (confirm in Fasta.cpp).
        unsigned long calls, misses;
    public:
        // Value-initialises every member: `_open` starts false, counters start at 0.
        Fasta() : _open(), reader(), pageCache(), lru(), contigIndex(), calls(), misses() {};
        ~Fasta();
        void open(std::string&);
        // Two overloads: without and with an explicit Strand.
        std::string getSeq(chrom, coord, coord);
        std::string getSeq(chrom, coord, coord, Strand);
        indexType pageForCoord(chrom, coord);
        coord pageOffset(indexType);
        bool isOpen() const;
        bool hasContig(chrom) const;

    };

    // Takes a sequence string by reference and returns a double; by its name
    // the GC content of the sequence (definition not in this header).
    double gc(std::string&);
}

#endif /* Fasta_h */
-------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | build-macos: 12 | 13 | runs-on: 14 | - macos-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | submodules: recursive 20 | lfs: true 21 | - name: Brew Update 22 | run: brew update >/dev/null 23 | - name: Reinstall 24 | run: brew reinstall xz curl 25 | continue-on-error: true 26 | - name: Install Deps 27 | run: brew install boost zlib curl samtools bzip2 xz && brew link --overwrite python@3.9 28 | - name: Pip install 29 | run: sudo python3 -m pip install --user --upgrade pip setuptools && sudo python3 -m pip install --user numpy && sudo python3 -m pip install --user -e ./python 30 | - name: make 31 | run: > 32 | export ZLIB_PATH=$(ls /usr/local/Cellar/zlib/*/lib/libz.a) 33 | LZMA_PATH=$(ls /usr/local/Cellar/xz/*/lib/liblzma.a) && 34 | make -f test_data/Makefile.osx 35 | - name: Download Tests 36 | run: > 37 | cd test_data && 38 | wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz && 39 | tar xzf test_inputs.tar.gz && 40 | cd .. 
41 | - name: Run Tests 42 | run: sudo bash -c "PYTHONPATH=$(pwd) make -f test_data/Makefile.osx test" 43 | 44 | build-linux: 45 | 46 | runs-on: 47 | - ubuntu-latest 48 | 49 | steps: 50 | - uses: actions/checkout@v2 51 | with: 52 | submodules: recursive 53 | lfs: true 54 | - name: Install deps 55 | run: > 56 | sudo apt-get update && sudo apt-get install -y cmake python3 python3-dev 57 | libboost-filesystem-dev libboost-regex-dev libboost-system-dev libbz2-dev 58 | liblzma-dev libpthread-stubs0-dev wget zlib1g-dev g++ && 59 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && sudo 60 | python3 get-pip.py && python3 -m pip install --upgrade pip && 61 | python3 -m pip install numpy && python3 -m pip install -e ./python && 62 | python3 -m pip install --force-reinstall matplotlib 63 | - name: make 64 | run: make -f test_data/Makefile.linux 65 | - name: Download Tests 66 | run: > 67 | cd test_data && 68 | wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz && 69 | tar xzf test_inputs.tar.gz && 70 | cd .. 
71 | - name: Run Tests 72 | run: make -f test_data/Makefile.linux test 73 | -------------------------------------------------------------------------------- /test_data/chr1.output/chr1.cram.metrics.tsv: -------------------------------------------------------------------------------- 1 | Sample chr1.cram 2 | Mapping Rate 1 3 | Unique Rate of Mapped 1 4 | Duplicate Rate of Mapped 0 5 | Duplicate Rate of Mapped, excluding Globins 0 6 | Base Mismatch 0.00968147 7 | End 1 Mapping Rate 1.01474 8 | End 2 Mapping Rate 0.985262 9 | End 1 Mismatch Rate 0.00253608 10 | End 2 Mismatch Rate 0.0170406 11 | Expression Profiling Efficiency 0.807719 12 | High Quality Rate 0.884446 13 | Exonic Rate 0.807719 14 | Intronic Rate 0.131935 15 | Intergenic Rate 0.0274077 16 | Intragenic Rate 0.939654 17 | Ambiguous Alignment Rate 0.0329382 18 | High Quality Exonic Rate 0.835902 19 | High Quality Intronic Rate 0.108092 20 | High Quality Intergenic Rate 0.0240545 21 | High Quality Intragenic Rate 0.943994 22 | High Quality Ambiguous Alignment Rate 0.0319513 23 | Discard Rate 0 24 | rRNA Rate 4.18304e-06 25 | End 1 Sense Rate 0.495471 26 | End 2 Sense Rate 0.503206 27 | Avg. 
Splits per Read 0.261769 28 | Alternative Alignments 229158 29 | Chimeric Fragments 0 30 | Chimeric Alignment Rate 0 31 | Duplicate Reads 0 32 | End 1 Antisense 498854 33 | End 2 Antisense 477326 34 | End 1 Bases 82963728 35 | End 2 Bases 80553844 36 | End 1 Mapped Reads 1091628 37 | End 2 Mapped Reads 1059919 38 | End 1 Mismatches 210403 39 | End 2 Mismatches 1372688 40 | End 1 Sense 489897 41 | End 2 Sense 483486 42 | Exonic Reads 1737846 43 | Failed Vendor QC 122510 44 | High Quality Reads 1902928 45 | Intergenic Reads 58969 46 | Intragenic Reads 2021710 47 | Ambiguous Reads 70868 48 | Intronic Reads 283864 49 | Low Mapping Quality 186732 50 | Low Quality Reads 248619 51 | Mapped Duplicate Reads 0 52 | Mapped Reads 2151547 53 | Mapped Unique Reads 2151547 54 | Mismatched Bases 1583091 55 | Non-Globin Reads 2151547 56 | Non-Globin Duplicate Reads 0 57 | Reads used for Intron/Exon counts 2151547 58 | rRNA Reads 9 59 | Total Bases 163517572 60 | Total Mapped Pairs 1052544 61 | Total Reads 2503215 62 | Unique Mapping, Vendor QC Passed Reads 2151547 63 | Unpaired Reads 0 64 | Read Length 76 65 | Genes Detected 1842 66 | Estimated Library Complexity 0 67 | Genes used in 3' bias 236 68 | Mean 3' bias 0.511031 69 | Median 3' bias 0.5 70 | 3' bias Std 0.293274 71 | 3' bias MAD_Std 0.353802 72 | 3' Bias, 25th Percentile 0.266282 73 | 3' Bias, 75th Percentile 0.75 74 | Median of Avg Transcript Coverage 0.629976 75 | Median of Transcript Coverage Std 0.905514 76 | Median of Transcript Coverage CV 1.06256 77 | Median Exon CV 0.313763 78 | Exon CV MAD 0.308991 79 | Fragment GC Content Mean 0.453382 80 | Fragment GC Content Std 0.104727 81 | Fragment GC Content Skewness 0.25671 82 | Fragment GC Content Kurtosis -0.673033 83 | -------------------------------------------------------------------------------- /test_data/approx_diff.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import pandas as pd 
def main(args):
    """Compare two RNA-SeQC output files and fail when they disagree.

    Both inputs are read as tab-separated tables indexed by their first
    column and outer-joined; columns of the second input whose name collides
    with a column of the first get a trailing ``_``.  The two columns named
    in ``args.columns`` are then checked:

    * a value may be missing from both columns, but not from only one;
    * values present in both may differ by at most ``args.tolerance``;
    * in ``fragments`` mode the two inputs must also share the same index.

    Args:
        args: Namespace with ``input1`` and ``input2`` (paths), ``mode``
            (``'tables'``, ``'metrics'`` or ``'fragments'``), ``columns``
            (two column names of the joined table) and ``tolerance`` (float).

    Raises:
        AssertionError: if a check fails.  The offending rows are attached.
            Raised explicitly so the checks survive ``python -O``.
        ValueError: if ``args.mode`` is not one of the supported modes.
    """
    if args.mode == 'tables':
        # header=2: the column names sit on the third line of the file.
        read_options = {'index_col': 0, 'header': 2, 'sep': '\t'}
    elif args.mode in ('metrics', 'fragments'):
        read_options = {'index_col': 0, 'sep': '\t'}
    else:
        raise ValueError('Unsupported mode: {!r}'.format(args.mode))

    first = pd.read_csv(args.input1, **read_options)
    second = pd.read_csv(args.input2, **read_options)
    df = first.join(second, how='outer', rsuffix='_')

    column_a, column_b = args.columns

    # A value that exists on one side only is always a failure.
    one_sided = df[column_a].isna() ^ df[column_b].isna()
    if one_sided.any():
        # axis must be passed by keyword: newer pandas rejects any(1).
        raise AssertionError(df[df.isna().any(axis=1)])

    # NaN differences compare False here, so rows missing on both sides pass.
    out_of_tolerance = df[(df[column_a] - df[column_b]).abs() > args.tolerance]
    if len(out_of_tolerance):
        raise AssertionError(out_of_tolerance.head())

    if args.mode == 'fragments':
        unmatched = set(first.index) ^ set(second.index)
        if unmatched:
            raise AssertionError(unmatched)


if __name__ == '__main__':
    parser = argparse.ArgumentParser('legacy-test')
    parser.add_argument(
        'input1',
        help='First input file'
    )
    parser.add_argument(
        'input2',
        help='Second input file'
    )
    parser.add_argument(
        '-t', '--tolerance',
        nargs='?',
        type=float,
        help="Tolerance for differing values. If not provided, this defaults to 1e-6,"
        " for near-exact comparison. If provided without argument, this defaults to .01,"
        " which is usually good for checking modern vs legacy counts (which vary"
        " slightly within Java's default precision). You can also provide a floating"
        " point number to manually specify tolerance",
        default=0.000001,
        const=0.01
    )
    parser.add_argument(
        '-m', '--mode',
        choices=[
            'metrics',
            'tables',
            'fragments'
        ],
        default='metrics',
        help="What type of input file is being compared. Default: metrics"
    )
    parser.add_argument(
        '-c', '--columns',
        nargs=2,
        help="Column names to load for 'tables'",
        # Must be a tuple: argparse repeats any other value once per argument.
        metavar=('COLUMN-A', 'COLUMN-B'),
        default=('Counts', 'RNA-SeQC')
    )
    args = parser.parse_args()
    main(args)
Splits per Read 0.256852 28 | Alternative Alignments 275296 29 | Chimeric Fragments 0 30 | Chimeric Alignment Rate 0 31 | Duplicate Reads 0 32 | End 1 Antisense 437593 33 | End 2 Antisense 413825 34 | End 1 Bases 76069312 35 | End 2 Bases 73877928 36 | End 1 Mapped Reads 1000912 37 | End 2 Mapped Reads 972078 38 | End 1 Mismatches 203603 39 | End 2 Mismatches 1298496 40 | End 1 Sense 427694 41 | End 2 Sense 426143 42 | Exonic Reads 1536112 43 | Failed Vendor QC 541350 44 | High Quality Reads 1739471 45 | Intergenic Reads 138194 46 | Intragenic Reads 1762602 47 | Ambiguous Reads 72194 48 | Intronic Reads 226490 49 | Low Mapping Quality 3766708 50 | Low Quality Reads 233519 51 | Mapped Duplicate Reads 0 52 | Mapped Reads 1972990 53 | Mapped Unique Reads 1972990 54 | Mismatched Bases 1502099 55 | Non-Globin Reads 1961273 56 | Non-Globin Duplicate Reads 0 57 | Reads used for Intron/Exon counts 1972990 58 | rRNA Reads 14752 59 | Total Bases 149947240 60 | Total Mapped Pairs 963702 61 | Total Reads 6384776 62 | Unique Mapping, Vendor QC Passed Reads 5568130 63 | Unpaired Reads 0 64 | Read Length 76 65 | Genes Detected 11590 66 | Estimated Library Complexity 0 67 | Genes used in 3' bias 153 68 | Mean 3' bias 0.601911 69 | Median 3' bias 0.591125 70 | 3' bias Std 0.314833 71 | 3' bias MAD_Std 0.445799 72 | 3' Bias, 25th Percentile 0.375 73 | 3' Bias, 75th Percentile 0.928571 74 | Average Fragment Length 182.276 75 | Fragment Length Median 164 76 | Fragment Length Std 69.7425 77 | Fragment Length MAD_Std 56.3388 78 | Median of Avg Transcript Coverage 0.0201045 79 | Median of Transcript Coverage Std 0.141663 80 | Median of Transcript Coverage CV 1.44311 81 | Median Exon CV 0.587046 82 | Exon CV MAD 0.495332 83 | -------------------------------------------------------------------------------- /test_data/legacy.output/downsampled.bam.metrics.tsv: -------------------------------------------------------------------------------- 1 | Sample downsampled.bam 2 | Mapping Rate 
0.354336 3 | Unique Rate of Mapped 1 4 | Duplicate Rate of Mapped 0 5 | Duplicate Rate of Mapped, excluding Globins nan 6 | Base Mismatch 0.0100151 7 | End 1 Mapping Rate 0.359421 8 | End 2 Mapping Rate 0.34904 9 | End 1 Mismatch Rate 0.00267648 10 | End 2 Mismatch Rate 0.017572 11 | Expression Profiling Efficiency 0.272945 12 | High Quality Rate 0.881406 13 | Exonic Rate 0.770301 14 | Intronic Rate 0.159378 15 | Intergenic Rate 0.0700237 16 | Intragenic Rate 0.929679 17 | Ambiguous Alignment Rate 0 18 | High Quality Exonic Rate 0.803019 19 | High Quality Intronic Rate 0.157213 20 | High Quality Intergenic Rate 0.0397675 21 | High Quality Intragenic Rate 0.960232 22 | High Quality Ambiguous Alignment Rate 0 23 | Discard Rate 0.000297518 24 | rRNA Rate 0.00747647 25 | End 1 Sense Rate 0.494116 26 | End 2 Sense Rate 0.507167 27 | Avg. Splits per Read 0.256225 28 | Alternative Alignments 275296 29 | Chimeric Fragments 0 30 | Chimeric Alignment Rate 0 31 | Duplicate Reads 0 32 | End 1 Antisense 435833 33 | End 2 Antisense 412263 34 | End 1 Bases 76049552 35 | End 2 Bases 73853076 36 | End 1 Mapped Reads 1000652 37 | End 2 Mapped Reads 971751 38 | End 1 Mismatches 203545 39 | End 2 Mismatches 1297746 40 | End 1 Sense 425694 41 | End 2 Sense 424254 42 | Exonic Reads 1519796 43 | Failed Vendor QC 541350 44 | High Quality Reads 1739006 45 | Intergenic Reads 138156 46 | Intragenic Reads 1834247 47 | Ambiguous Reads 0 48 | Intronic Reads 314451 49 | Low Mapping Quality 3766708 50 | Low Quality Reads 233397 51 | Mapped Duplicate Reads 0 52 | Mapped Reads 1972990 53 | Mapped Unique Reads 1972990 54 | Mismatched Bases 1501291 55 | Non-Globin Reads 0 56 | Non-Globin Duplicate Reads 0 57 | Reads used for Intron/Exon counts 1972403 58 | rRNA Reads 14751 59 | Split Reads 395803 60 | Total Bases 149902628 61 | Total Mapped Pairs 963448 62 | Total Reads 6384776 63 | Unique Mapping, Vendor QC Passed Reads 5568130 64 | Unpaired Reads 0 65 | Read Length 76 66 | Genes Detected 11614 67 | 
def get_filepath(name):
    """Return the dotted module path of the ``rnaseqc`` submodule *name*.

    The result is what gets handed to ``python -m`` by the dispatcher below.
    """
    module_path = 'rnaseqc.{}'.format(name)
    return module_path


if __name__ == '__main__':
    # One row per subcommand, in the order they are listed by the usage text:
    # (subcommand, submodule that implements it, help/description text).
    subcommand_table = (
        ('run', 'run',
         'A light wrapper with some convenience functions to run RNA-SeQC'),
        ('aggregate', 'aggregate',
         'Aggregate RNA-SeQC outputs from multiple samples'),
        ('notebook', 'create_notebook',
         'Generate a notebook with figures comparing outputs from multiple samples'),
        ('report', 'report',
         'Generate PDF figures from aggregated RNA-SeQC results'),
        ('insert-size', 'insert_size_intervals',
         'Generate a BED file with long (>1000bp), high-mappability intervals for estimating insert sizes'),
        ('legacy-exons', 'legacy_exon_remap',
         'Renames exons in exon_reads.gct file from RNA-SeQC 2 to use naming convention from RNA-SeQC 1.1.x'),
    )

    parser = argparse.ArgumentParser('rnaseqc')
    subparsers = parser.add_subparsers(dest='command')

    commands = {}
    for subcommand, module, summary in subcommand_table:
        commands[subcommand] = get_filepath(module)
        # add_help=False: -h/--help after the subcommand is left in the
        # remainder and forwarded to the submodule's own parser.
        subparsers.add_parser(
            subcommand,
            help=summary,
            description=summary,
            add_help=False
        )

    args, remainder = parser.parse_known_args()
    if args.command not in commands:
        parser.print_usage()
        sys.exit('A valid subcommand must be provided.')
    # Replace this process with `python -m rnaseqc.<module> <remaining args>`.
    os.execvp(sys.executable, [sys.executable, '-m', commands[args.command]] + remainder)
def main(args):
    """Write a notebook that loads RNA-SeQC results and plots the QC figures.

    The notebook has a title cell, an import cell, a cell that loads the
    inputs, a cell that calls ``rnaseqc.report.plot_qc_figures`` and a
    trailing empty code cell.  Optional inputs that were not supplied are
    passed to the plotting call as ``None``.

    Args:
        args: Namespace with ``metrics`` (path), the optional paths ``tpm``,
            ``cohort``, ``date`` and ``insert_size`` (``None`` when absent),
            and ``output``, which is handed to ``Notebook.write``.
    """
    nb = nbe.Notebook()

    nb.add_markdown_cell(
        '# RNA-SeQC metrics report',
    )

    nb.add_code_cell([
        'import pandas as pd',
        'import qtl.io',
        'import rnaseqc.report'
    ])

    # Paths are embedded with {!r} so that the generated source holds a valid
    # Python string literal even when a path contains quotes or backslashes.
    # For ordinary paths the output is the same as with hand-written quotes.
    cell = [
        "# load inputs",
        "metrics_df = pd.read_csv({!r}, sep='\\t', index_col=0)".format(args.metrics),
    ]
    if args.tpm is not None:
        cell.append("tpm_df = qtl.io.read_gct({!r})".format(args.tpm))
    if args.cohort is not None:
        cell.append("cohort_s = pd.read_csv({!r}, sep='\\t', index_col=0, header=None).squeeze('columns')".format(args.cohort))
    if args.date is not None:
        cell.append("date_s = pd.read_csv({!r}, sep='\\t', index_col=0, header=None).squeeze('columns')".format(args.date))
    if args.insert_size is not None:
        cell.append("insertsize_df = pd.read_csv({!r}, sep='\\t', index_col=0)".format(args.insert_size))
    nb.add_code_cell(cell)

    # Name of the notebook variable for each optional input, or the literal
    # None when that input was not loaded above.
    cohort_ref = 'cohort_s' if args.cohort is not None else 'None'
    date_ref = 'date_s' if args.date is not None else 'None'
    insert_size_ref = 'insertsize_df' if args.insert_size is not None else 'None'
    tpm_ref = 'tpm_df' if args.tpm is not None else 'None'

    nb.add_code_cell([
        "thresholds = {'Exonic Rate': 0.7}",
        'rnaseqc.report.plot_qc_figures(metrics_df, cohort_s={}, cohort_colors=None, date_s={},'.format(
            cohort_ref, date_ref),
        ' show_legend=True, ms=12, alpha=1, highlight_ids=None,',
        ' thresholds=thresholds, insertsize_df={}, tpm_df={})'.format(
            insert_size_ref, tpm_ref),
    ])

    nb.add_code_cell('')
    nb.write(args.output)
def locate_rnaseqc():
    """
    Search PATH for executable, then try up two directories
    (where compiled executable would exist in the git repo)

    Returns:
        Path of the first candidate accepted by ``test_rnaseqc``.  If none is
        found a message is printed to stderr and the bare name ``'rnaseqc'``
        is returned so the caller can still attempt to run it.
    """
    # Fall back to the platform default search path when PATH is not set
    # instead of failing with KeyError.
    search_path = os.environ.get('PATH', os.defpath)
    for directory in search_path.split(os.pathsep):
        exe = os.path.join(directory, 'rnaseqc')
        if test_rnaseqc(exe):
            return exe
    exe = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'rnaseqc')
    if test_rnaseqc(exe):
        return exe
    print("Unable to find rnaseqc executable", file=sys.stderr)
    return 'rnaseqc' # Just try it and see what happens, I guess


def test_rnaseqc(path):
    """Return True if *path* is an executable file that reports itself as RNA-SeQC 2.

    The candidate is run with ``--version`` and its standard output must
    start with ``RNASeQC 2``.  A candidate that cannot be launched at all
    (for example a file with the execute bit set that is not a program) is
    rejected rather than aborting the search.
    """
    if not (os.path.isfile(path) and os.access(path, os.X_OK)):
        return False
    try:
        probe = subprocess.run([path, '--version'], stdout=subprocess.PIPE)
    except OSError:
        return False
    return probe.stdout.startswith(b'RNASeQC 2')


if __name__ == '__main__':
    import shlex  # only needed to echo the command in copy-pasteable form

    parser = argparse.ArgumentParser(description='Wrapper for RNA-SeQC 2')
    parser.add_argument('genes_gtf', type=str, help='Gene annotation GTF')
    parser.add_argument('bam_file', type=str, help='BAM file')
    parser.add_argument('prefix', type=str, default='Reads', help='Prefix for output files; usually sample_id')
    parser.add_argument('-o', '--output_dir', default=os.getcwd(), help='Output directory')
    parser.add_argument('-q', '--mapping-quality', default=None, type=int, help="Lower bound on read quality for reads used in coverage metrics")
    parser.add_argument('-m', '--mismatch-threshold', default=None, type=int, help="Maximum allowed mismatches in a read while still used for coverage metrics")
    parser.add_argument('-c', '--coverage', action='store_true', help="Include raw coverage metrics in a separate output table. By default, only summary statistics are included in metrics")
    parser.add_argument('--stranded', default=None, choices=['rf', 'fr'], help='Strandedness for stranded libraries')
    parser.add_argument('--bed', default=None, help='BED file with intervals for estimating insert size distribution')
    args = parser.parse_args()

    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running RNA-SeQC', flush=True)

    # Build the command as an argument list so that paths containing spaces
    # or shell metacharacters are passed through unchanged.
    cmd = [
        locate_rnaseqc(), args.genes_gtf, args.bam_file, args.output_dir,
        '-s', args.prefix,
        '-vv',
    ]
    if args.stranded is not None:
        cmd += ['--stranded', args.stranded]
    if args.bed is not None:
        cmd += ['--bed', args.bed]
    if args.mapping_quality is not None:
        cmd += ['--mapping-quality', str(args.mapping_quality)]
    if args.mismatch_threshold is not None:
        cmd += ['--base-mismatch', str(args.mismatch_threshold)]
    if args.coverage:
        cmd.append('--coverage')
    print(' * command: "{}"'.format(' '.join(shlex.quote(part) for part in cmd)), flush=True)
    subprocess.check_call(cmd)

    # gzip GCTs
    # The outputs are written into output_dir, so they have to be addressed
    # there; a bare prefix only works when output_dir is the current directory.
    output_base = os.path.join(args.output_dir, args.prefix)
    gct_suffixes = ('exon_reads.gct', 'gene_tpm.gct', 'gene_reads.gct', 'gene_fragments.gct')
    subprocess.check_call(['gzip'] + ['{}.{}'.format(output_base, suffix) for suffix in gct_suffixes])
    if args.coverage:
        subprocess.check_call(['gzip', '{}.coverage.tsv'.format(output_base)])

    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished RNA-SeQC', flush=True)
-------------------------------------------------------------------------------- /.github/workflows/Deployment.yml: -------------------------------------------------------------------------------- 1 | name: Deployment 2 | on: 3 | release: 4 | types: 5 | - published 6 | 7 | jobs: 8 | build-macos: 9 | 10 | runs-on: 11 | - macos-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | with: 16 | submodules: recursive 17 | - name: Brew Update 18 | run: brew update >/dev/null 19 | - name: Reinstall 20 | run: brew reinstall xz curl 21 | continue-on-error: true 22 | - name: Install Deps 23 | run: brew install boost zlib curl samtools bzip2 xz && brew link --overwrite python@3.9 24 | - name: Pip install 25 | run: sudo python3 -m pip install --user --upgrade pip setuptools && sudo python3 -m pip install --user numpy && sudo python3 -m pip install --user -e ./python 26 | - name: Zip Source 27 | run: > 28 | cd .. && tar --strip-components=4 -czf rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz $GITHUB_WORKSPACE && 29 | cd $GITHUB_WORKSPACE && mv ../rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz . 
30 | - name: make 31 | run: > 32 | export ZLIB_PATH=$(ls /usr/local/Cellar/zlib/*/lib/libz.a) 33 | LZMA_PATH=$(ls /usr/local/Cellar/xz/*/lib/liblzma.a) && 34 | make -f test_data/Makefile.osx && 35 | gzip -c rnaseqc > rnaseqc.${{ github.event.release.tag_name }}.macos.gz && ls -l 36 | - name: Upload Executable 37 | uses: actions/upload-release-asset@v1 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }} 40 | with: 41 | upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.macos.gz 42 | asset_path: rnaseqc.${{ github.event.release.tag_name }}.macos.gz 43 | asset_name: rnaseqc.${{ github.event.release.tag_name }}.macos.gz 44 | asset_content_type: application/gzip 45 | - name: Upload Zip 46 | uses: actions/upload-release-asset@v1 47 | env: 48 | GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }} 49 | with: 50 | upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz 51 | asset_path: rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz 52 | asset_name: rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz 53 | asset_content_type: application/gzip 54 | 55 | build-linux: 56 | 57 | runs-on: 58 | - ubuntu-latest 59 | 60 | steps: 61 | - uses: actions/checkout@v2 62 | with: 63 | submodules: recursive 64 | - name: Install deps 65 | run: > 66 | sudo apt-get update && sudo apt-get install -y cmake python3 python3-dev 67 | libboost-filesystem-dev libboost-regex-dev libboost-system-dev libbz2-dev 68 | liblzma-dev libpthread-stubs0-dev wget zlib1g-dev g++ && 69 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && sudo 70 | python3 get-pip.py && python3 -m pip install --upgrade pip && 71 | python3 -m pip install numpy && python3 -m pip install -e ./python && 72 | python3 -m pip install --force-reinstall matplotlib 73 | - name: make 74 | 
run: make -f test_data/Makefile.linux && gzip -c rnaseqc > rnaseqc.${{ github.event.release.tag_name }}.linux.gz 75 | - name: Upload Executable 76 | uses: actions/upload-release-asset@v1 77 | env: 78 | GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }} 79 | with: 80 | upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.linux.gz 81 | asset_path: rnaseqc.${{ github.event.release.tag_name }}.linux.gz 82 | asset_name: rnaseqc.${{ github.event.release.tag_name }}.linux.gz 83 | asset_content_type: application/gzip 84 | -------------------------------------------------------------------------------- /src/BamReader.h: -------------------------------------------------------------------------------- 1 | // 2 | // BamReader.hpp 3 | // RNA-SeQC 4 | // 5 | // Created by Aaron Graubert on 10/3/18. 6 | // Copyright © 2018 Aaron Graubert. All rights reserved. 7 | // 8 | 9 | #ifndef BamReader_h 10 | #define BamReader_h 11 | 12 | #include "Fasta.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include // I really don't like using unofficial APIs, but not much choice here. 
21 | 22 | namespace rnaseqc { 23 | 24 | struct referenceHTSMismatch : public std::exception { 25 | std::string error; 26 | referenceHTSMismatch(std::string msg) : error(msg) {}; 27 | }; 28 | 29 | 30 | class SynchronizedReader { 31 | std::mutex mtx; 32 | protected: 33 | unsigned long read_count; 34 | public: 35 | SynchronizedReader() : mtx(), read_count() { 36 | 37 | } 38 | 39 | void lock() 40 | { 41 | this->mtx.lock(); 42 | } 43 | 44 | void unlock() 45 | { 46 | this->mtx.unlock(); 47 | } 48 | 49 | unsigned long get_count() const 50 | { 51 | return this->read_count; 52 | } 53 | }; 54 | 55 | class SeqlibReader : public SynchronizedReader { 56 | SeqLib::BamReader bam; 57 | std::string reference_path; 58 | std::set valid_chroms; 59 | bool user_cram_reference; 60 | public: 61 | 62 | SeqlibReader() : reference_path(), valid_chroms(), user_cram_reference(false) {} 63 | 64 | bool next(SeqLib::BamRecord&); 65 | 66 | const SeqLib::BamHeader getHeader() const { 67 | return this->bam.Header(); 68 | } 69 | 70 | bool open(std::string filepath) { 71 | if (this->reference_path.length()) { 72 | auto htsfile = hts_open(filepath.c_str(), "r"); 73 | hts_set_fai_filename(htsfile, this->reference_path.c_str()); 74 | if (htsfile->format.format == htsExactFormat::cram) { 75 | this->user_cram_reference = true; 76 | // Cram handling is very dumb. 
All of this nonsense is just because htslib is incredibly opaque about reference handling 77 | // Even with a user-provided reference, htslib only uses it if the MD5 matches 78 | // So here we load up the file, check if it's a cram, then get a list of chromosomes that htslib decides to use 79 | cram_fd *cram = static_cast(htsfile->fp.cram); 80 | if (cram->refs && cram->refs->nref > 0) 81 | for (unsigned int i = 0; i < cram->refs->nref; ++i) 82 | if (this->reference_path == std::string(cram->refs->ref_id[i]->fn)) 83 | this->valid_chroms.insert( 84 | chromosomeMap(cram->refs->ref_id[i]->name) 85 | ); 86 | this->bam.SetCramReference(this->reference_path); // Consider moving out of if statement, if there's any meaningful use to having a reference set on a non-cram 87 | } 88 | hts_close(htsfile); 89 | } 90 | this->bam.Open(filepath); 91 | return this->bam.IsOpen(); 92 | } 93 | 94 | void addReference(std::string filepath) { 95 | this->reference_path = filepath; 96 | } 97 | 98 | inline bool validateChromosome(const chrom c) { 99 | // For crams, we only validate chromosomes which matched our reference. Otherwise yes! 100 | return this->user_cram_reference ? 
def _remap_gene(gtf, gene_id, features, count_field):
    """Re-sort and renumber the exon rows of a single gene, in place.

    gtf:         annotation object exposing ``get_gene``
    gene_id:     gene prefix shared by every row in ``features``
    features:    list of row dicts (one per exon) read from the GCT
    count_field: name of the column holding the counts (last GCT column)

    Returns the same ``features`` list after it has been sorted and renamed.
    """
    ref = gtf.get_gene(gene_id)
    # get_gene's return type is not visible from here; the original code
    # tolerated both a single gene object and a non-empty sequence of genes,
    # so this stays best-effort (but no longer swallows KeyboardInterrupt).
    try:
        if len(ref):
            ref = ref[0]
    except Exception:
        pass
    exons = {exon.id: exon for transcript in ref.transcripts for exon in transcript.exons}
    raw_size = len(exons)
    # Alias purely numeric exon ids as "<gene>_<n>" so GCT row names resolve.
    for exon_id in list(exons):
        try:
            if exon_id.isdigit() and int(exon_id) <= raw_size:
                exons[gene_id + '_' + exon_id] = exons[exon_id]
        except (AttributeError, TypeError, ValueError):
            # non-string ids or digits int() cannot parse: nothing to alias
            pass
    # Single-base exons sort last; the rest by genomic position.
    features.sort(
        key=lambda feat: (
            1 if exons[feat['Name']].length == 1 else 0,
            exons[feat['Name']].start_pos,
            exons[feat['Name']].end_pos
        )
    )
    for i, feat in enumerate(features):
        # Look the exon up BEFORE renaming the row.
        if exons[feat['Name']].length == 1:
            feat[count_field] = 0
        # Gene ids may themselves contain '_', so only strip the last field.
        prefix = feat['Name'].rpartition('_')[0]
        # NOTE(review): the source this was recovered from had lost its
        # indentation; every row is renumbered by sorted position here
        # (the reading under which the sort matters) -- confirm.
        feat['Name'] = prefix + '_' + str(i)
    return features


def run(args):
    """Rewrite an RNA-SeQC 2 exon GCT in place with remapped exon rows.

    args.gct: open text handle on the exon-reads GCT (closed by this function)
    args.gtf: handle whose ``.name`` is the reference GTF path

    Rows are grouped by gene (row name minus its last ``_`` field); each
    group is re-sorted and renumbered by ``_remap_gene``. The original file
    is copied to ``<name>.bak`` before being overwritten.
    """
    print("Parsing GTF")
    gtf = Annotation(args.gtf.name)
    print("Parsing GCT")
    # Row count only feeds the progress bar. Counting in Python avoids
    # passing an unquoted path through a shell.
    with open(args.gct.name, 'rb') as handle:
        numRows = sum(1 for _ in handle) - 3
    header = ''.join([next(args.gct), next(args.gct)])
    reader = csv.DictReader(args.gct, delimiter='\t')
    count_field = reader.fieldnames[-1]
    with tempfile.NamedTemporaryFile('w') as w:
        w.write(header)
        writer = csv.DictWriter(w, reader.fieldnames, delimiter='\t', lineterminator='\n')
        writer.writeheader()
        current = None
        features = []
        for line in tqdm(reader, total=numRows):
            gene = '_'.join(line['Name'].split('_')[:-1])
            if gene != current:
                if current is not None:
                    writer.writerows(_remap_gene(gtf, current, features, count_field))
                current = gene
                features = []
            features.append(dict(line))
        # Flush the final gene through the same code path as all the others.
        if features:
            writer.writerows(_remap_gene(gtf, current, features, count_field))
        print("Cleaning up")
        w.flush()
        args.gct.close()
        shutil.copyfile(args.gct.name, args.gct.name+'.bak')
        # Copy while the temp file still exists (it is removed on exit).
        shutil.copyfile(w.name, args.gct.name)
class Notebook(object):
    """
    Wrapper to nbformat Notebook

    Holds an nbformat v4 notebook in ``self.nb`` and a running execution
    counter in ``self.exec_count`` that is stamped on each code cell.
    """
    def __init__(self, header=None):
        """Create an empty notebook; if ``header`` is given, add a title cell."""
        self.nb = nbf.v4.new_notebook()
        if header is not None:
            self.add_markdown_cell(header, '---', 'Created by the nb_encode api')
        self.exec_count = 1

    def add_markdown_cell(self, *lines):
        """Append a markdown cell whose source is ``lines``, one per row.

        Trailing whitespace is stripped from every line and no newline is
        left after the last one. Calling with no lines adds an empty cell
        (this used to raise IndexError).
        """
        lines = [line.rstrip()+'\n' for line in lines]
        if lines:
            lines[-1] = lines[-1][:-1]
        self.nb['cells'].append(nbf.v4.new_markdown_cell(lines))

    def add_code_cell(self, source, *outputs, **kwargs):
        """Append a code cell and advance the execution counter.

        source:  code as a string, or a list of lines joined with newlines
        outputs: notebook output nodes are used as-is; any other non-None
                 object is wrapped with ``encode_output``; None is skipped
        kwargs:  forwarded to ``nbformat.v4.new_code_cell``
        """
        if isinstance(source, list):
            source = '\n'.join(line.rstrip() for line in source)
        self.nb['cells'].append(nbf.v4.new_code_cell(
            source,
            execution_count=self.exec_count,
            outputs=[
                encode_output(output, self.exec_count)
                if not isinstance(output, nbf.notebooknode.NotebookNode)
                else output
                for output in outputs
                if output is not None
            ],
            **kwargs
        ))
        self.exec_count += 1

    def write(self, dest):
        """Write the notebook to ``dest``.

        dest: a path string, or a file-like object. For objects with a
              ``name`` attribute the notebook is written to that path (as
              before); objects without one are written to directly.
        """
        if isinstance(dest, str):
            # Always UTF-8, whatever the platform default encoding is.
            with open(dest, 'w', encoding='utf-8') as w:
                nbf.write(self.nb, w)
        else:
            nbf.write(self.nb, getattr(dest, 'name', dest))
def intersect_overlap(intervals):
    """
    intervals: iterable of tuples or 2-element lists (inclusive start, end)

    breaks intersections into separate intervals
    e.g.: [0,6],[2,8] ->[0,1],[2,6],[7,8]

    The input is not modified. Returns a list: an interval that overlaps
    nothing is returned as the original object, while each overlapping
    cluster is replaced by 2-element lists of numpy integers. An empty
    input gives an empty list.
    """
    def _split(bounds):
        # One cluster of mutually chained intervals -> disjoint pieces.
        if len(bounds) > 1:
            # Every start, and every position just past an end, is a cut point.
            p = np.unique([b[0] for b in bounds] + [b[1] + 1 for b in bounds])
            return [[i, j - 1] for i, j in zip(p[:-1], p[1:])]
        return [bounds[0]]

    ordered = sorted(intervals, key=lambda x: (x[0], x[1]))
    if not ordered:
        return []

    intersected = []
    bounds = [ordered[0]]
    union_end = ordered[0][1]  # right edge of the current cluster
    for interval in ordered[1:]:
        if interval[0] <= union_end:  # overlap w/ current cluster
            union_end = max(union_end, interval[1])
            bounds.append(interval)
        else:
            intersected.extend(_split(bounds))
            # start a new cluster
            bounds = [interval]
            union_end = interval[1]
    # flush the last cluster
    intersected.extend(_split(bounds))

    return intersected
if __name__=='__main__':

    # Command-line entry point: load the annotation, then write the BED files.
    parser = argparse.ArgumentParser(description='Parse long exons/UTRs with high mappability for estimating insert size distribution.')
    # The three required positionals share the same shape, so add them in a loop.
    positional_help = (
        ('gtf_path', 'Reference annotation in GTF format.'),
        ('mappability_bigwig', 'Mappability track in bigWig format.'),
        ('prefix', 'Prefix for output file names.'),
    )
    for arg_name, arg_help in positional_help:
        parser.add_argument(arg_name, help=arg_help)
    parser.add_argument(
        '--min-length', type=np.int32, default=1000,
        help='Minimum exon/UTR length for computing insert sizes. Default: 1000bp',
    )
    parser.add_argument(
        '--min-mappability', type=np.float64, default=0.95,
        help='Minimum mappability for retained intervals. Default: 0.95',
    )
    parser.add_argument('--output-dir', default='.', help='Output directory.')
    args = parser.parse_args()

    annot = annotation.Annotation(args.gtf_path, verbose=True)
    parse_intervals(
        annot,
        args.mappability_bigwig,
        args.output_dir,
        args.prefix,
        min_length=args.min_length,
        min_mappability=args.min_mappability,
    )
113 | 181 402 114 | 182 399 115 | 183 389 116 | 184 411 117 | 185 402 118 | 186 395 119 | 187 374 120 | 188 371 121 | 189 364 122 | 190 345 123 | 191 374 124 | 192 333 125 | 193 338 126 | 194 319 127 | 195 337 128 | 196 307 129 | 197 374 130 | 198 325 131 | 199 341 132 | 200 293 133 | 201 306 134 | 202 315 135 | 203 290 136 | 204 329 137 | 205 287 138 | 206 291 139 | 207 329 140 | 208 327 141 | 209 303 142 | 210 288 143 | 211 276 144 | 212 278 145 | 213 261 146 | 214 300 147 | 215 254 148 | 216 260 149 | 217 286 150 | 218 267 151 | 219 263 152 | 220 236 153 | 221 242 154 | 222 244 155 | 223 231 156 | 224 242 157 | 225 245 158 | 226 225 159 | 227 264 160 | 228 228 161 | 229 236 162 | 230 226 163 | 231 234 164 | 232 227 165 | 233 219 166 | 234 206 167 | 235 197 168 | 236 229 169 | 237 191 170 | 238 197 171 | 239 205 172 | 240 190 173 | 241 180 174 | 242 190 175 | 243 177 176 | 244 180 177 | 245 181 178 | 246 194 179 | 247 179 180 | 248 188 181 | 249 175 182 | 250 174 183 | 251 161 184 | 252 167 185 | 253 169 186 | 254 171 187 | 255 155 188 | 256 143 189 | 257 181 190 | 258 169 191 | 259 171 192 | 260 166 193 | 261 149 194 | 262 144 195 | 263 139 196 | 264 144 197 | 265 140 198 | 266 153 199 | 267 160 200 | 268 144 201 | 269 155 202 | 270 130 203 | 271 138 204 | 272 122 205 | 273 135 206 | 274 112 207 | 275 143 208 | 276 131 209 | 277 105 210 | 278 113 211 | 279 122 212 | 280 118 213 | 281 133 214 | 282 121 215 | 283 114 216 | 284 108 217 | 285 88 218 | 286 99 219 | 287 95 220 | 288 121 221 | 289 93 222 | 290 112 223 | 291 92 224 | 292 88 225 | 293 115 226 | 294 110 227 | 295 87 228 | 296 84 229 | 297 73 230 | 298 100 231 | 299 86 232 | 300 100 233 | 301 102 234 | 302 81 235 | 303 98 236 | 304 75 237 | 305 88 238 | 306 73 239 | 307 80 240 | 308 69 241 | 309 65 242 | 310 70 243 | 311 79 244 | 312 64 245 | 313 62 246 | 314 71 247 | 315 82 248 | 316 73 249 | 317 62 250 | 318 66 251 | 319 62 252 | 320 70 253 | 321 66 254 | 322 62 255 | 323 74 256 | 324 59 257 | 325 48 258 
| 326 53 259 | 327 58 260 | 328 50 261 | 329 52 262 | 330 56 263 | 331 50 264 | 332 52 265 | 333 63 266 | 334 50 267 | 335 50 268 | 336 38 269 | 337 49 270 | 338 38 271 | 339 46 272 | 340 45 273 | 341 46 274 | 342 43 275 | 343 58 276 | 344 45 277 | 345 50 278 | 346 43 279 | 347 54 280 | 348 45 281 | 349 40 282 | 350 42 283 | 351 47 284 | 352 32 285 | 353 39 286 | 354 32 287 | 355 33 288 | 356 30 289 | 357 34 290 | 358 32 291 | 359 22 292 | 360 40 293 | 361 30 294 | 362 42 295 | 363 25 296 | 364 38 297 | 365 29 298 | 366 36 299 | 367 39 300 | 368 24 301 | 369 29 302 | 370 24 303 | 371 21 304 | 372 31 305 | 373 21 306 | 374 22 307 | 375 25 308 | 376 19 309 | 377 21 310 | 378 20 311 | 379 21 312 | 380 23 313 | 381 17 314 | 382 25 315 | 383 22 316 | 384 20 317 | 385 26 318 | 386 17 319 | 387 21 320 | 388 23 321 | 389 18 322 | 390 22 323 | 391 13 324 | 392 19 325 | 393 23 326 | 394 29 327 | 395 17 328 | 396 25 329 | 397 20 330 | 398 23 331 | 399 16 332 | 400 20 333 | 401 7 334 | 402 12 335 | 403 11 336 | 404 12 337 | 405 15 338 | 406 17 339 | 407 13 340 | 408 13 341 | 409 21 342 | 410 9 343 | 411 13 344 | 412 8 345 | 413 11 346 | 414 10 347 | 415 16 348 | 416 10 349 | 417 7 350 | 418 12 351 | 419 12 352 | 420 13 353 | 421 8 354 | 422 9 355 | 423 11 356 | 424 10 357 | 425 10 358 | 426 18 359 | 427 11 360 | 428 9 361 | 429 9 362 | 430 11 363 | 431 10 364 | 432 8 365 | 433 9 366 | 434 7 367 | 435 14 368 | 436 6 369 | 437 9 370 | 438 11 371 | 439 6 372 | 440 9 373 | 441 6 374 | 442 5 375 | 443 6 376 | 444 6 377 | 445 7 378 | 446 8 379 | 447 8 380 | 448 7 381 | 449 7 382 | 450 7 383 | 451 6 384 | 452 8 385 | 453 3 386 | 454 7 387 | 455 6 388 | 456 7 389 | 457 6 390 | 458 8 391 | 459 7 392 | 460 5 393 | 461 1 394 | 462 4 395 | 463 7 396 | 464 5 397 | 465 4 398 | 466 1 399 | 467 4 400 | 468 7 401 | 469 3 402 | 470 2 403 | 471 4 404 | 473 5 405 | 474 3 406 | 475 4 407 | 476 5 408 | 477 5 409 | 478 2 410 | 479 3 411 | 480 8 412 | 481 1 413 | 483 1 414 | 484 3 415 | 485 4 416 | 
486 1 417 | 487 3 418 | 488 4 419 | 489 4 420 | 490 3 421 | 491 3 422 | 492 3 423 | 493 2 424 | 494 2 425 | 495 7 426 | 497 2 427 | 498 4 428 | 499 1 429 | 500 2 430 | 501 2 431 | 502 2 432 | 503 3 433 | 504 2 434 | 505 1 435 | 506 2 436 | 507 1 437 | 508 1 438 | 509 1 439 | 510 4 440 | 512 1 441 | 513 1 442 | 515 1 443 | 516 1 444 | 517 3 445 | 518 1 446 | 520 2 447 | 521 1 448 | 522 1 449 | 523 2 450 | 524 1 451 | 525 2 452 | 527 2 453 | 528 2 454 | 529 1 455 | 530 2 456 | 531 2 457 | 532 1 458 | 533 2 459 | 534 1 460 | 535 2 461 | 536 1 462 | 537 1 463 | 538 2 464 | 540 1 465 | 541 1 466 | 542 1 467 | 543 1 468 | 545 1 469 | 546 1 470 | 548 1 471 | 550 1 472 | 552 1 473 | 553 2 474 | 555 2 475 | 556 1 476 | 560 1 477 | 561 1 478 | 563 2 479 | 564 1 480 | 571 1 481 | 572 2 482 | 581 1 483 | 584 1 484 | 585 1 485 | 587 2 486 | 588 1 487 | 590 1 488 | 591 1 489 | 594 1 490 | 595 2 491 | 597 1 492 | 603 1 493 | 607 1 494 | 612 1 495 | 617 1 496 | 618 1 497 | 622 1 498 | 627 1 499 | 628 1 500 | 631 1 501 | 632 1 502 | 638 1 503 | 643 1 504 | 656 1 505 | 667 1 506 | 669 1 507 | 683 1 508 | 708 1 509 | 743 1 510 | 747 1 511 | 846 1 512 | 877 1 513 | 921 1 514 | 941 1 515 | 943 1 516 | 1065 1 517 | 1396 1 518 | 1559 1 519 | 1875 1 520 | 2074 1 521 | -------------------------------------------------------------------------------- /test_data/downsampled.output/downsampled.bam.fragmentSizes.txt: -------------------------------------------------------------------------------- 1 | Fragment Size Count 2 | 48 1 3 | 49 1 4 | 53 1 5 | 60 1 6 | 68 1 7 | 69 1 8 | 76 1 9 | 77 37 10 | 78 37 11 | 79 39 12 | 80 48 13 | 81 56 14 | 82 48 15 | 83 66 16 | 84 61 17 | 85 56 18 | 86 64 19 | 87 65 20 | 88 82 21 | 89 85 22 | 90 117 23 | 91 127 24 | 92 126 25 | 93 148 26 | 94 143 27 | 95 165 28 | 96 152 29 | 97 166 30 | 98 170 31 | 99 197 32 | 100 211 33 | 101 224 34 | 102 311 35 | 103 320 36 | 104 350 37 | 105 318 38 | 106 343 39 | 107 381 40 | 108 363 41 | 109 392 42 | 110 419 43 | 111 474 44 
| 112 464 45 | 113 531 46 | 114 566 47 | 115 574 48 | 116 522 49 | 117 526 50 | 118 521 51 | 119 552 52 | 120 590 53 | 121 663 54 | 122 633 55 | 123 649 56 | 124 696 57 | 125 711 58 | 126 676 59 | 127 611 60 | 128 640 61 | 129 642 62 | 130 620 63 | 131 644 64 | 132 682 65 | 133 704 66 | 134 721 67 | 135 685 68 | 136 672 69 | 137 606 70 | 138 618 71 | 139 590 72 | 140 623 73 | 141 699 74 | 142 670 75 | 143 711 76 | 144 678 77 | 145 648 78 | 146 683 79 | 147 581 80 | 148 579 81 | 149 578 82 | 150 546 83 | 151 572 84 | 152 551 85 | 153 571 86 | 154 589 87 | 155 573 88 | 156 594 89 | 157 542 90 | 158 527 91 | 159 507 92 | 160 504 93 | 161 511 94 | 162 470 95 | 163 560 96 | 164 478 97 | 165 477 98 | 166 558 99 | 167 481 100 | 168 514 101 | 169 461 102 | 170 469 103 | 171 476 104 | 172 456 105 | 173 439 106 | 174 442 107 | 175 453 108 | 176 487 109 | 177 422 110 | 178 390 111 | 179 422 112 | 180 405 113 | 181 402 114 | 182 399 115 | 183 389 116 | 184 411 117 | 185 402 118 | 186 395 119 | 187 374 120 | 188 371 121 | 189 364 122 | 190 345 123 | 191 374 124 | 192 333 125 | 193 338 126 | 194 319 127 | 195 337 128 | 196 307 129 | 197 374 130 | 198 325 131 | 199 341 132 | 200 293 133 | 201 306 134 | 202 315 135 | 203 290 136 | 204 329 137 | 205 287 138 | 206 291 139 | 207 329 140 | 208 327 141 | 209 303 142 | 210 288 143 | 211 276 144 | 212 278 145 | 213 261 146 | 214 300 147 | 215 254 148 | 216 260 149 | 217 286 150 | 218 267 151 | 219 263 152 | 220 236 153 | 221 242 154 | 222 244 155 | 223 231 156 | 224 242 157 | 225 245 158 | 226 225 159 | 227 264 160 | 228 228 161 | 229 236 162 | 230 226 163 | 231 234 164 | 232 227 165 | 233 219 166 | 234 206 167 | 235 197 168 | 236 229 169 | 237 191 170 | 238 197 171 | 239 205 172 | 240 190 173 | 241 180 174 | 242 190 175 | 243 177 176 | 244 180 177 | 245 181 178 | 246 194 179 | 247 179 180 | 248 188 181 | 249 175 182 | 250 174 183 | 251 161 184 | 252 167 185 | 253 169 186 | 254 171 187 | 255 155 188 | 256 143 189 | 257 181 190 | 258 169 
191 | 259 171 192 | 260 166 193 | 261 149 194 | 262 144 195 | 263 139 196 | 264 144 197 | 265 140 198 | 266 153 199 | 267 160 200 | 268 144 201 | 269 155 202 | 270 130 203 | 271 138 204 | 272 122 205 | 273 135 206 | 274 112 207 | 275 143 208 | 276 131 209 | 277 105 210 | 278 113 211 | 279 122 212 | 280 118 213 | 281 133 214 | 282 121 215 | 283 114 216 | 284 108 217 | 285 88 218 | 286 99 219 | 287 95 220 | 288 121 221 | 289 93 222 | 290 112 223 | 291 92 224 | 292 88 225 | 293 115 226 | 294 110 227 | 295 87 228 | 296 84 229 | 297 73 230 | 298 100 231 | 299 86 232 | 300 100 233 | 301 102 234 | 302 81 235 | 303 98 236 | 304 75 237 | 305 88 238 | 306 73 239 | 307 80 240 | 308 69 241 | 309 65 242 | 310 70 243 | 311 79 244 | 312 64 245 | 313 62 246 | 314 71 247 | 315 82 248 | 316 73 249 | 317 62 250 | 318 66 251 | 319 62 252 | 320 70 253 | 321 66 254 | 322 62 255 | 323 74 256 | 324 59 257 | 325 48 258 | 326 53 259 | 327 58 260 | 328 50 261 | 329 52 262 | 330 56 263 | 331 50 264 | 332 52 265 | 333 63 266 | 334 50 267 | 335 50 268 | 336 38 269 | 337 49 270 | 338 38 271 | 339 46 272 | 340 45 273 | 341 46 274 | 342 43 275 | 343 58 276 | 344 45 277 | 345 50 278 | 346 43 279 | 347 54 280 | 348 45 281 | 349 40 282 | 350 42 283 | 351 47 284 | 352 32 285 | 353 39 286 | 354 32 287 | 355 33 288 | 356 30 289 | 357 34 290 | 358 32 291 | 359 22 292 | 360 40 293 | 361 30 294 | 362 42 295 | 363 25 296 | 364 38 297 | 365 29 298 | 366 36 299 | 367 39 300 | 368 24 301 | 369 29 302 | 370 24 303 | 371 21 304 | 372 31 305 | 373 21 306 | 374 22 307 | 375 25 308 | 376 19 309 | 377 21 310 | 378 20 311 | 379 21 312 | 380 23 313 | 381 17 314 | 382 25 315 | 383 22 316 | 384 20 317 | 385 26 318 | 386 17 319 | 387 21 320 | 388 23 321 | 389 18 322 | 390 22 323 | 391 13 324 | 392 19 325 | 393 23 326 | 394 29 327 | 395 17 328 | 396 25 329 | 397 20 330 | 398 23 331 | 399 16 332 | 400 20 333 | 401 7 334 | 402 12 335 | 403 11 336 | 404 12 337 | 405 15 338 | 406 17 339 | 407 13 340 | 408 13 341 | 409 21 342 
| 410 9 343 | 411 13 344 | 412 8 345 | 413 11 346 | 414 10 347 | 415 16 348 | 416 10 349 | 417 7 350 | 418 12 351 | 419 12 352 | 420 13 353 | 421 8 354 | 422 9 355 | 423 11 356 | 424 10 357 | 425 10 358 | 426 18 359 | 427 11 360 | 428 9 361 | 429 9 362 | 430 11 363 | 431 10 364 | 432 8 365 | 433 9 366 | 434 7 367 | 435 14 368 | 436 6 369 | 437 9 370 | 438 11 371 | 439 6 372 | 440 9 373 | 441 6 374 | 442 5 375 | 443 6 376 | 444 6 377 | 445 7 378 | 446 8 379 | 447 8 380 | 448 7 381 | 449 7 382 | 450 7 383 | 451 6 384 | 452 8 385 | 453 3 386 | 454 7 387 | 455 6 388 | 456 7 389 | 457 6 390 | 458 8 391 | 459 7 392 | 460 5 393 | 461 1 394 | 462 4 395 | 463 7 396 | 464 5 397 | 465 4 398 | 466 1 399 | 467 4 400 | 468 7 401 | 469 3 402 | 470 2 403 | 471 4 404 | 473 5 405 | 474 3 406 | 475 4 407 | 476 5 408 | 477 5 409 | 478 2 410 | 479 3 411 | 480 8 412 | 481 1 413 | 483 1 414 | 484 3 415 | 485 4 416 | 486 1 417 | 487 3 418 | 488 4 419 | 489 4 420 | 490 3 421 | 491 3 422 | 492 3 423 | 493 2 424 | 494 2 425 | 495 7 426 | 497 2 427 | 498 4 428 | 499 1 429 | 500 2 430 | 501 2 431 | 502 2 432 | 503 3 433 | 504 2 434 | 505 1 435 | 506 2 436 | 507 1 437 | 508 1 438 | 509 1 439 | 510 4 440 | 512 1 441 | 513 1 442 | 515 1 443 | 516 1 444 | 517 3 445 | 518 1 446 | 520 2 447 | 521 1 448 | 522 1 449 | 523 2 450 | 524 1 451 | 525 2 452 | 527 2 453 | 528 2 454 | 529 1 455 | 530 2 456 | 531 2 457 | 532 1 458 | 533 2 459 | 534 1 460 | 535 2 461 | 536 1 462 | 537 1 463 | 538 2 464 | 540 1 465 | 541 1 466 | 542 1 467 | 543 1 468 | 545 1 469 | 546 1 470 | 548 1 471 | 550 1 472 | 552 1 473 | 553 2 474 | 555 2 475 | 556 1 476 | 560 1 477 | 561 1 478 | 563 2 479 | 564 1 480 | 571 1 481 | 572 2 482 | 581 1 483 | 584 1 484 | 585 1 485 | 587 2 486 | 588 1 487 | 590 1 488 | 591 1 489 | 594 1 490 | 595 2 491 | 597 1 492 | 603 1 493 | 607 1 494 | 612 1 495 | 617 1 496 | 618 1 497 | 622 1 498 | 627 1 499 | 628 1 500 | 631 1 501 | 632 1 502 | 638 1 503 | 643 1 504 | 656 1 505 | 667 1 506 | 669 1 507 | 
def combine_gcts(path_dict, verbose=True):
    """Merge per-sample GCT files into one table.

    path_dict: mapping of sample id -> path of that sample's GCT file
    verbose:   print a one-line progress counter while loading

    Returns a DataFrame indexed by 'Name' with a 'Description' column
    followed by one column per sample, in sorted sample-id order. The first
    sample decides the value dtype: float64 is stored as float32, int64 as
    int32, anything else is kept.
    """
    ordered_ids = sorted(path_dict)
    first_id = ordered_ids[0]

    # The first file supplies the row index, the descriptions and the dtype.
    first = pd.read_csv(
        path_dict[first_id], sep='\t', skiprows=3, header=None,
        index_col=0, names=['Name', 'Description', first_id],
    )
    found = first[first_id].dtype
    if found == np.float64:
        dtype = np.float32
    elif found == np.int64:
        dtype = np.int32
    else:
        dtype = found.type

    # Allocate the whole table up front, then fill it column by column.
    all_columns = ['Description'] + list(ordered_ids)
    gct_df = pd.DataFrame(0, index=first.index, columns=all_columns, dtype=dtype)
    gct_df['Description'] = first['Description']
    gct_df[first_id] = first[first_id].astype(dtype)

    total = len(path_dict)
    position = 1
    for current_id in ordered_ids[1:]:
        position += 1
        if verbose:
            print(f'\r * loading GCT {position}/{total}', end='', flush=True)
        # Only the name and value columns are needed from the other files.
        sample = pd.read_csv(
            path_dict[current_id], sep='\t', skiprows=3, header=None,
            usecols=[0, 2], index_col=0, names=['Name', current_id],
            dtype={'Name': str, current_id: dtype},
        )
        gct_df[current_id] = sample[current_id]
    if verbose:
        print()

    return gct_df
def combine_distributions(path_dict):
    """Aggregate single-sample insert sizes distributions.

    path_dict: mapping of sample id -> path of a tab-separated file whose
               first column is the index and whose remaining column holds
               the values

    Returns a DataFrame with one int32 column per sample, in sorted
    sample-id order; index values absent from a sample are filled with 0.
    """
    # One Series per sample, named after the sample id.
    per_sample = [
        pd.read_csv(path_dict[sample_id], sep='\t', index_col=0)
        .squeeze('columns')
        .rename(sample_id)
        for sample_id in sorted(path_dict)
    ]
    distr_df = pd.concat(per_sample, axis=1).fillna(0).astype(np.int32)
    return distr_df
gene_tpm_gcts = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gene_tpm.gct*'), recursive=True)} 93 | exon_reads_gcts = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*exon_reads.gct*'), recursive=True)} 94 | metrics_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*metrics.tsv*'), recursive=True)} 95 | insertsize_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*fragmentSizes.txt*'), recursive=True)} 96 | gc_content_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gc_content.tsv*'), recursive=True)} 97 | # coverage files don't get aggregated 98 | # coverage_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results, '*coverage.tsv*'))} 99 | 100 | if len(metrics_files) > 0: 101 | print('Aggregating metrics') 102 | metrics_df = combine_metrics(metrics_files) 103 | metrics_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.metrics.txt.gz'), sep='\t') 104 | 105 | if len(insertsize_files) > 0: 106 | print('Aggregating insert size distributions') 107 | insertsize_df = combine_distributions(insertsize_files) 108 | insertsize_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.insert_size_hists.txt.gz'), sep='\t') 109 | 110 | if len(gc_content_files) > 0: 111 | print('Aggregating GC content distributions') 112 | gc_df = combine_distributions(gc_content_files) 113 | gc_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.gc_content_hists.txt.gz'), sep='\t') 114 | 115 | if len(gene_reads_gcts) > 0: 116 | print('Aggregating read count GCTs') 117 | gct_df = combine_gcts(gene_reads_gcts) 118 | if args.parquet: 119 | gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_reads.parquet')) 120 | else: 121 | write_gct(gct_df, os.path.join(args.output_dir, 
f'{args.prefix}.gene_reads.gct.gz')) 122 | 123 | if len(gene_fragm_gcts) > 0: 124 | print('Aggregating fragment count GCTs') 125 | gct_df = combine_gcts(gene_fragm_gcts) 126 | if args.parquet: 127 | gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_fragments.parquet')) 128 | else: 129 | write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.gene_fragments.gct.gz')) 130 | 131 | if len(gene_tpm_gcts) > 0: 132 | print('Aggregating TPM GCTs') 133 | gct_df = combine_gcts(gene_tpm_gcts) 134 | if args.parquet: 135 | gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_tpm.parquet')) 136 | else: 137 | write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.gene_tpm.gct.gz')) 138 | 139 | if len(exon_reads_gcts) > 0: 140 | print('Aggregating exon read count GCTs') 141 | gct_df = combine_gcts(exon_reads_gcts) 142 | if args.parquet: 143 | gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.exon_reads.parquet')) 144 | else: 145 | write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.exon_reads.gct.gz')) 146 | -------------------------------------------------------------------------------- /Metrics.md: -------------------------------------------------------------------------------- 1 | # RNA-SeQC Output Metrics 2 | 3 | This file provides a description for each of the output metrics in the `metrics.tsv` file. A description of other output files can be found at the bottom 4 | 5 | ## Output Metrics 6 | * Mapping Rate: The proportion of all reads in the Bam which were Mapped, and not Secondary Alignments or Platform/Vendor QC Failing reads ("Mapped Reads"). 7 | In legacy mode, this is the proportion of all reads which were Mapped out 8 | of all reads which were not Secondary Alignments or Platform/Vendor QC Failing reads. 
9 | * Unique Rate of Mapped: This is the proportion of reads which **were not** marked as PCR/Optical Duplicates out of all "Mapped Reads" (as defined above; excludes Secondary and Vendor QC Failed reads). 10 | * Duplicate Rate of Mapped: This is the proportion of all reads which **were** marked as PCR/Optical Duplicates out of all "Mapped Reads" (as defined above; excludes Secondary and Vendor QC Failed reads). This is complementary to the "Unique Rate of Mapped". 11 | * Duplicate Rate of Mapped, excluding Globins: This is similar to the "Duplicate Rate of Mapped" except that it only includes reads which **did not** align to _HBA1_, _HBA2_, _HBB_, or _HBD_. 12 | * Base Mismatch: The total number of mismatched bases (as determined by the "NM" tag) of all "Mapped Reads" (as defined above) divided by the total aligned length of all "Mapped Reads". 13 | * End 1 & 2 Mapping Rate: The proportion of Paired reads which were marked as First or Second in the pair, respectively, out of all "Mapped Reads" (above). 14 | * End 1 & 2 Mismatch Rate: The proportion of mismatched bases (as determined by the "NM" tag) belonging to First or Second mates, divided by the total aligned length of all "Mapped" (above) First or Second mates, respectively. 15 | * Expression Profiling Efficiency: The proportion of "Exonic Reads" (see "Exonic Rate", below) out of all reads which were not Secondary Alignments or 16 | Platform/Vendor QC Failing reads. 17 | * High Quality Rate: The proportion of **properly paired** reads with less than 6 mismatched bases and a perfect mapping quality out of all "Mapped Reads" (above). 18 | * Exonic Rate: The proportion of "Mapped Reads" (above) for which all aligned segments unambiguously aligned to exons of the same gene. 19 | * Intronic Rate: The proportion of "Mapped Reads" (above) for which all aligned segments unambiguously aligned to the same gene, but none of which _intersected_ any exons of the gene. 
20 | * Intergenic Rate: The proportion of "Mapped Reads" (above) for which none of the aligned segments _intersected_ any genes. 21 | * Intragenic Rate: The sum of "Exonic" and "Intronic" rates (see "Exonic Rate" and "Intronic Rate" above) 22 | * Ambiguous Alignment Rate: The proportion of "Mapped Reads" (above) where the aligned segments unambiguously aligned to exons of more than one gene. 23 | * High Quality Exonic, Intronic, Intergenic, Intragenic, and Ambiguous Alignment Rates: The proportion of "Exonic Reads", "Intronic Reads", "Intragenic Reads", "Intergenic Reads", and "Ambiguous Reads" (see rates above) out of "High Quality Reads" only (as defined in "High Quality Rate", above) 24 | * Discard Rate: The proportion of "Mapped Reads" (above) which were discarded and not checked against the reference annotation. In most cases this should be 0, however, this will include reads which were discarded by additional command line flags (such as `--exclude-chimeric` or `--tag`) or extra legacy mode filters. "Exonic Rate", "Intronic Rate", "Intergenic Rate", "Ambiguous Alignment Rate" and "Discard Rate" will sum to 1. 25 | * rRNA Rate: The proportion of "Mapped Reads" (above) which at least partially intersected with an annotated rRNA gene. This is **not** complementary to any other rates. 26 | * End 1 & 2 Sense Rate: The proportion of First or Second Mate reads which intersected with a Sense Strand feature out of all First or Second 27 | Mate reads which intersected with any features, respectively. 28 | * Avg. Splits per Read: The average number of gaps or deletions present in "Mapped Reads" (above). This is generally not an important metric, but may indicate aligner errors if the value is too high. 29 | * Raw Counts: All raw counts used in any metrics are reported here in the file 30 | * Read Length: The longest aligned length observed in any read 31 | * Genes Detected: The number of genes which had at least 5 unambiguous reads.
The detection threshold can be changed with `--detection-threshold` 32 | * Estimated Library Complexity: An estimation of the number of unique cDNA fragments present in the library. This computation follows the same formula as Picard EstimateLibraryComplexity 33 | * 3' Bias statistics (Mean, Median, Std Deviation, Median Absolute Deviation, 25th percentile, 75th percentile): These aggregate statistics are based on the total coverage in 100 bp windows on both the 3' and 5' ends of a gene. The windows are both offset 150 bases into the gene. This computation is only performed on genes at least 600bp long and with at least 5 unambiguous reads. These thresholds can be changed with `--offset`, `--window-size`, `--gene-length`, and `--detection-threshold`. A gene with even coverage in both its 3' and 5' windows would have a bias of 0.5; bias near 1 or 0 may indicate degradation 34 | * Fragment Length Statistics (Mean, Median, Std Deviation, and Median Absolute Deviation): These aggregate statistics are based on the insert sizes observed in "High Quality" (above) read pairs. These metrics are only present if a Bed file was provided with the `--bed` option. Only the first 1,000,000 "High Quality" pairs where both mates map to the same Bed interval are used. 35 | * Median of Transcript Coverage statistics (Mean, Std Deviation, Coefficient of Variation): These statistics are the median of a given aggregate statistic of transcript coverage (for example, the median of mean transcript coverage). Transcript coverage is computed by dropping the first and last 500bp of each gene and measuring the **"High Quality"** (above) coverage over the remainder of the gene. 36 | * Median Exon CV: The median coefficient of variation of exon coverage. Exon coverage is computed by dropping the first and last 500bp of each gene and measuring the **"High Quality"** (above) coverage over the remainder of the exons. This is considered a good metric for sample quality.
A lower value indicates more consistent coverage over exons. 37 | * Exon CV MAD: The Median Absolute Deviation over all Exon CVs 38 | 39 | **Note**: When running in `--unpaired` mode, single-ended bams will report `nan` for all End 1 and End 2 metrics 40 | 41 | ### Fragment Sizes File 42 | 43 | This file contains the raw counts of the observed insert sizes of the sample. Fragment sizes are only measured if a Bed file is provided with the `--bed` option. This file is stored as a histogram, with the first column recording a given observed size, and the second column recording the number of occurrences of that particular size. 44 | 45 | ### Coverage File 46 | 47 | This file contains coverage data for all genes. Coverage computations are always performed, but this file of per-gene coverage data is not produced unless 48 | the `--coverage` flag is provided. The first column contains the gene ID as given by the input annotation. The next three columns contain the mean, standard deviation, and coefficient of variation of coverage for each gene, respectively. The first and last 500bp of each gene are dropped and not considered when computing coverage. A value of 0 or `nan` may indicate that the gene's coding length was less than 1kb or that the gene had 0 coverage 49 | over its exons. 50 | 51 | ## Migrating between old and new columns 52 | 53 | For users of the legacy tool, several metrics have been renamed, removed, or changed.
54 | Below is a table of previous metrics and how to access them using the new metric names: 55 | 56 | Old Metric | New Metric | Notes 57 | -|-|- 58 | Base Mismatch Rate | Base Mismatch | 59 | Duplication Rate of Mapped | Duplicate Rate of Mapped | 60 | End 1/2 % Sense | End 1/2 Sense Rate | 61 | Estimated Library Size | Estimated Library Complexity | 62 | Failed Vendor QC Check | Failed Vendor QC | 63 | Fragment Length Mean | Average Fragment Length | The fragment length metrics have changed significantly 64 | Fragment Length StdDev | Fragment Length Std 65 | Intragenic Rate | Intragenic Rate | Some reads previously classified as `Intragenic` are now classified as `Ambiguous Alignments`. The equivalent of the old `Intragenic Rate` can be computed by summing `Intragenic Rate` + `Ambiguous Alignment Rate` 66 | Mapped | Mapped Reads | 67 | Mapped Unique | Mapped Unique Reads | 68 | Total Purity Filtered Reads Sequenced | Unique Mapping, Vendor QC Passed Reads | This counts reads without the Secondary or QC Fail flags set. For a true count of total alignments use `Total Reads` 69 | -------------------------------------------------------------------------------- /src/Fasta.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Fasta.cpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 5/23/18. 6 | // Copyright © 2018 Aaron Graubert. All rights reserved.
7 | // 8 | 9 | #include "Fasta.h" 10 | #include 11 | #include 12 | #include 13 | 14 | namespace rnaseqc { 15 | std::map chromosomes; 16 | 17 | chrom chromosomeMap(std::string chr) 18 | { 19 | auto entry = chromosomes.find(chr); 20 | if (entry == chromosomes.end()) 21 | { 22 | chromosomes[chr] = chromosomes.size() + 1u; 23 | } 24 | return chromosomes[chr]; 25 | } 26 | 27 | //Given an internal chromosome ID, get the name it corresponds to 28 | std::string getChromosomeName(chrom idx) 29 | { 30 | for (auto entry = chromosomes.begin(); entry != chromosomes.end(); ++entry) if (entry->second == idx) return entry->first; 31 | throw invalidContigException("Invalid chromosome index"); 32 | } 33 | 34 | //Get reverse complement of a sequence 35 | void complement(std::string &sequence) 36 | { 37 | std::string tmp = sequence; 38 | auto src = sequence.rbegin(); 39 | for(unsigned int i = 0; src != sequence.rend() && i < tmp.length(); ++src, ++i) 40 | { 41 | switch(*src) 42 | { 43 | case 'A': 44 | case 'a': 45 | tmp[i] = 'T'; 46 | break; 47 | case 'T': 48 | case 't': 49 | tmp[i] = 'A'; 50 | break; 51 | case 'C': 52 | case 'c': 53 | tmp[i] = 'G'; 54 | break; 55 | case 'G': 56 | case 'g': 57 | tmp[i] = 'C'; 58 | break; 59 | default: 60 | tmp[i] = *src; 61 | } 62 | } 63 | sequence.swap(tmp); 64 | } 65 | 66 | //Count GC content in a sequence 67 | double gc(std::string &sequence) 68 | { 69 | if (sequence.length() == 0) return -1; 70 | double content = 0.0, size = static_cast(sequence.length()); 71 | for (auto base = sequence.begin(); base != sequence.end(); ++base) 72 | if (*base == 'G' || *base == 'g' || *base == 'C' || *base == 'c') content += 1.0/size; 73 | return content; 74 | } 75 | 76 | // Open a fasta file 77 | void Fasta::open(std::string &filename) 78 | { 79 | this->_open = true; 80 | this->reader.open(filename); 81 | if (!this->reader.is_open()) 82 | { 83 | throw fileException("Unable to open reference fasta: " +filename); 84 | } 85 | std::string index_path = filename + 
".fai"; 86 | // Check if the index exists at filepath.fai 87 | if (boost::filesystem::exists(boost::filesystem::path(filename).replace_extension(".fai"))) 88 | index_path = boost::filesystem::path(filename).replace_extension(".fai").string(); 89 | // otherwise fail if the index doesn't exist at filepath.fasta.fai 90 | else if (!boost::filesystem::exists(index_path)) throw fileException("Unable to locate fasta index: " + filename); 91 | // Import chromosome names from index 92 | std::vector contigs = bioio::read_fasta_index_contig_names(index_path); 93 | for (auto contig = contigs.begin(); contig != contigs.end(); ++contig) chromosomeMap(*contig); 94 | // Then allow bioio to parse the index 95 | bioio::FastaIndex tmp_index = bioio::read_fasta_index(index_path); 96 | for (auto entry = tmp_index.begin(); entry != tmp_index.end(); ++entry) this->contigIndex[chromosomeMap(entry->first)] = entry->second; 97 | if (!this->contigIndex.size()) throw fileException("No contigs found in fasta index: " + index_path); 98 | } 99 | 100 | //Get a forward strand sequence {contig}:{start}-{end} 101 | std::string Fasta::getSeq(chrom contig, coord start, coord end) 102 | { 103 | return this->getSeq(contig, start, end, Strand::Forward); 104 | } 105 | 106 | bool Fasta::isOpen() const { 107 | return this->_open; 108 | } 109 | 110 | //Get a sequence {contig}:{start}-{end}, and optionally return its reverse complement 111 | std::string Fasta::getSeq(chrom contig, coord start, coord end, Strand strand) 112 | { 113 | //NOTE: Coordinates must be 0-based, end-exclusive. 
114 | if (!this->isOpen()) return ""; 115 | std::string output; 116 | // Determine the coordinate for the start of the page which contains the start of this sequence 117 | coord pageOffset = (floor(start / PAGE_SIZE) * PAGE_SIZE); 118 | for (coord i = pageOffset; i < end; i+=PAGE_SIZE) //Iterate over pages until we have all the pages required 119 | { 120 | this->calls++; // Increment number of pages that were requested 121 | indexType page = this->pageForCoord(contig, i); // Get page index corresponding to this coordinate 122 | if (!this->pageCache.count(page)) // If that page isn't cached 123 | { 124 | this->misses++; // Increment number of pages that were actually read 125 | this->pageCache[page] = this->readSeq(contig, i); // Read page from fasta 126 | } 127 | this->updateLRU(page); // Update the cache state 128 | output += this->pageCache[page]; // Append this page to the output 129 | } 130 | if (start-pageOffset >= output.size()) 131 | { 132 | std::cerr << "Unable to fetch sequence" << std::endl; 133 | std::cerr << "Target region (GTF+1):\t" << getChromosomeName(contig) << ":" << start+1 << "-" << end << std::endl; 134 | std::cerr << "# pages fetched:\t" << output.length() / PAGE_SIZE << std::endl; 135 | std::cerr << "This contig page indices:\t[" << this->pageForContig(contig) << ", " << this->pageForContig(contig+1) << ")" << std::endl; 136 | std::cerr << "Sequence page indices:\t[" << this->pageForCoord(contig, start) << ", " << this->pageForCoord(contig, end) << "]" << std::endl; 137 | } 138 | // Extract desired sequence from the output (which is complete pages) 139 | output = output.substr(start-pageOffset, end-start); 140 | if (strand == Strand::Reverse) complement(output); 141 | return output; 142 | } 143 | 144 | // Bump page to top of the LRU, dropping older pages as necessary 145 | void Fasta::updateLRU(indexType page) 146 | { 147 | this->lru.remove(page); 148 | while (this->lru.size() >= CACHE_SIZE) 149 | { 150 | 
this->pageCache.erase(this->lru.front()); 151 | this->lru.pop_front(); 152 | } 153 | this->lru.push_back(page); 154 | } 155 | 156 | // Get page idx for position 0 of a contig 157 | indexType Fasta::pageForContig(chrom contig) 158 | { 159 | static std::unordered_map pageIndex; // Function caches the lookup table to save time 160 | if (!pageIndex.size()) pageIndex[0] = 0; 161 | if (pageIndex.count(contig)) return pageIndex[contig]; 162 | if (!this->contigIndex.count(contig)) throw invalidContigException("No such contig: " + getChromosomeName(contig)); 163 | chrom firstContig = contig; 164 | for (; firstContig > 0; --firstContig) if (pageIndex.count(firstContig)) break; 165 | indexType idx = pageIndex[firstContig]; 166 | for (chrom i = firstContig; i < contig; ++i) 167 | { 168 | idx += ceil(static_cast(this->contigIndex[i].length)/PAGE_SIZE); 169 | pageIndex[i+1] = idx; 170 | } 171 | return idx; 172 | } 173 | 174 | // Read a full page starting at this position 175 | std::string Fasta::readSeq(chrom contig, coord pos) 176 | { 177 | if (!this->contigIndex.count(contig)) throw invalidContigException("No such contig: " + getChromosomeName(contig)); 178 | return (bioio::read_fasta_contig(this->reader, this->contigIndex[contig], pos, PAGE_SIZE)); 179 | } 180 | 181 | indexType Fasta::pageForCoord(chrom contig, coord pos) 182 | { 183 | return this->pageForContig(contig) + floor(static_cast(pos)/PAGE_SIZE); 184 | } 185 | 186 | Fasta::~Fasta() 187 | { 188 | this->reader.close(); 189 | this->pageCache.clear(); 190 | if (this->misses) std::cerr << this->misses << " cache misses out of " << this->calls << " requests" << std::endl; 191 | } 192 | 193 | bool Fasta::hasContig(chrom contig) const { 194 | return this->contigIndex.count(contig); 195 | } 196 | } 197 | 198 | -------------------------------------------------------------------------------- /src/Metrics.h: -------------------------------------------------------------------------------- 1 | // 2 | // Metrics.hpp 3 | // 
IntervalTree 4 | // 5 | // Created by Aaron Graubert on 7/5/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 7 | // 8 | 9 | #ifndef Metrics_h 10 | #define Metrics_h 11 | 12 | #include "GTF.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace rnaseqc { 25 | class Metrics; 26 | } 27 | 28 | std::ofstream& operator<<(std::ofstream&, rnaseqc::Metrics&); 29 | 30 | namespace rnaseqc { 31 | class Metrics { 32 | // For storing arbitrary counters 33 | std::map counter; 34 | public: 35 | Metrics() : counter(){}; 36 | void increment(std::string); 37 | void increment(std::string, int); 38 | unsigned long get(std::string); 39 | double frac(std::string, std::string); 40 | friend std::ofstream& ::operator<<(std::ofstream&, Metrics&); 41 | }; 42 | 43 | class Collector { 44 | // For temporarily holding coverage on a read before we're ready to commit that coverage to a gene 45 | std::map > > data; 46 | std::map *target; 47 | bool dirty; 48 | double total; 49 | public: 50 | Collector(std::map *dataTarget) : data(), target(dataTarget), dirty(false), total(0.0) 51 | { 52 | 53 | } 54 | void add(const std::string&, const std::string&, const double); 55 | void collect(const std::string&); 56 | void collectSingle(const std::string&); //for legacy exon detection 57 | bool queryGene(const std::string&); 58 | bool isDirty(); 59 | double sum(); 60 | }; 61 | 62 | struct CoverageEntry { 63 | // Represents a single segment of aligned read bases for base-coverage computation 64 | coord offset; 65 | unsigned int length; 66 | std::string feature_id; 67 | }; 68 | 69 | class BiasCounter { 70 | // For counting 3'/5' bias coverage 71 | const int offset; 72 | const int windowSize; 73 | const unsigned long geneLength; 74 | const unsigned int detectionThreshold; 75 | unsigned int countedGenes; 76 | std::map fiveEnd; 77 | std::map threeEnd; 78 | public: 79 | BiasCounter(int offset, 
int windowSize, unsigned long geneLength, unsigned int detectionThreshold) : offset(offset), windowSize(windowSize), geneLength(geneLength), detectionThreshold(detectionThreshold), countedGenes(0), fiveEnd(), threeEnd() 80 | { 81 | 82 | } 83 | 84 | void computeBias(const Feature&, std::vector&); 85 | unsigned int countGenes() const; 86 | double getBias(const std::string&); 87 | const unsigned int getThreshold() const { 88 | return this->detectionThreshold; 89 | } 90 | }; 91 | 92 | struct ExonCoverage { 93 | double cv; 94 | double gc; 95 | }; 96 | 97 | class BaseCoverage { 98 | // For computing per-base coverage of genes 99 | Fasta& fastaReader; 100 | std::map > cache; //GID -> Entry tmp cache as exon hits are recorded 101 | std::map > coverage; //EID -> Coverage vector for exons still in window 102 | std::map exonCoverage; 103 | std::ofstream writer; 104 | const unsigned int mask_size; 105 | std::list geneMeans, geneStds, geneCVs; 106 | BiasCounter &bias; 107 | std::unordered_set seen; 108 | BaseCoverage(const BaseCoverage&) = delete; //No! 109 | public: 110 | BaseCoverage(Fasta& fasta, const std::string &filename, const unsigned int mask, bool openFile, BiasCounter &biasCounter) : fastaReader(fasta), coverage(), exonCoverage(), cache(), writer(openFile ? 
filename : "/dev/null"), mask_size(mask), geneMeans(), geneStds(), geneCVs(), bias(biasCounter), seen() 111 | { 112 | if ((!this->writer.is_open()) && openFile) throw std::runtime_error("Unable to open BaseCoverage output file"); 113 | this->writer << "gene_id\tcoverage_mean\tcoverage_std\tcoverage_CV" << std::endl; 114 | } 115 | 116 | void add(const Feature&, const coord, const coord); //Adds to the cache 117 | void commit(const std::string&); //moves one gene out of the cache and adds hits to exon coverage vector 118 | void reset(); //Empties the cache 119 | // void clearCoverage(); //empties out data that won't be used 120 | void compute(const Feature&); //Computes the per-base coverage for all transcripts in the gene 121 | void close(); //Flush and close the ofstream 122 | BiasCounter& getBiasCounter() const { 123 | return this->bias; 124 | } 125 | const std::map& getExonCoverage() const { 126 | return this->exonCoverage; 127 | } 128 | std::list& getGeneMeans() { 129 | return this->geneMeans; 130 | } 131 | std::list& getGeneStds() { 132 | return this->geneStds; 133 | } 134 | std::list& getGeneCVs() { 135 | return this->geneCVs; 136 | } 137 | }; 138 | 139 | template void sortContainer(T &data) { 140 | std::sort(data.begin(), data.end()); 141 | } 142 | 143 | template void sortContainer(std::list &data) { 144 | data.sort(); 145 | } 146 | 147 | template double computeMedian(unsigned long size, T &&iterator) 148 | { 149 | if (size <= 0) // Couldn't decide if it would make sense to just report a median of 0. 
This seemed safer 150 | throw std::range_error("Cannot compute median of an empty list"); 151 | else if (size == 1) 152 | return *iterator; 153 | for (unsigned long midpoint = (size - 1) / 2; midpoint > 0; --midpoint) ++iterator; 154 | if (size % 2) 155 | { 156 | double value = static_cast(*(iterator++)); 157 | return (value + static_cast(*iterator)) / 2.0; 158 | } 159 | return static_cast(*iterator); 160 | } 161 | 162 | typedef std::tuple statsTuple; 163 | 164 | enum StatIdx {avg = 0, med = 1, std = 2, mad = 3, skew = 1, kurt = 3}; 165 | 166 | template 167 | statsTuple getStatistics(T &data) { 168 | if (data.size()) { 169 | double avg = 0.0, std = 0.0; 170 | std::vector deviations; 171 | sortContainer(data); 172 | const double size = data.size(); 173 | double median = computeMedian(size, data.begin()); 174 | for (auto element = data.begin(); element != data.end(); ++element) { 175 | avg += static_cast(*element) / size; 176 | deviations.push_back(fabs(static_cast(*element) - median)); 177 | } 178 | sortContainer(deviations); 179 | double medDev = computeMedian(deviations.size(), deviations.begin()) * 1.4826; 180 | for (auto element = data.begin(); element != data.end(); ++element) 181 | std += pow(static_cast(*element) - avg, 2.0) / size; 182 | std = pow(std, 0.5); 183 | return statsTuple(avg, median, std, medDev); 184 | } 185 | return statsTuple(NAN, NAN, NAN, NAN); 186 | } 187 | 188 | template 189 | statsTuple getAdvancedStatistics(T &data) { 190 | if (!data.size()) return statsTuple(NAN, NAN, NAN, NAN); 191 | double avg = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0, count = 0.0; 192 | for (auto element = data.begin(); element != data.end(); ++element) { 193 | double prev_count = count++; 194 | double delta = static_cast(*element) - avg; 195 | double delta_n = delta / count; 196 | double delta_n2 = delta_n * delta_n; 197 | double t = delta * delta_n * prev_count; 198 | avg += delta_n; 199 | m4 += t * delta_n2 * (count*count - 3*count + 3) + 6*delta_n2*m2 - 4*delta_n*m3; 
200 | m3 += t * delta_n * (count - 2) - 3 * delta_n * m2; 201 | m2 += t; 202 | 203 | } 204 | double std = pow(m2/count, 0.5); 205 | return statsTuple(avg, m3 / count / pow(std, 3.0), std, (count * m4) / (m2 * m2) - 3); 206 | } 207 | 208 | extern std::map uniqueGeneCounts, geneCounts, exonCounts, geneFragmentCounts; //counters for read coverage of genes and exons 209 | extern std::map > fragmentTracker; // tracks fragments encountered by each gene 210 | } 211 | 212 | #endif /* Metrics_h */ 213 | -------------------------------------------------------------------------------- /src/GTF.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // GTF.cpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 6/28/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 7 | // 8 | 9 | #include "GTF.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using std::ifstream; 16 | using std::string; 17 | using std::map; 18 | 19 | namespace rnaseqc { 20 | const string EXON_NAME = "exon"; 21 | const boost::regex ribosomalPattern("rRNA"); //For recognizing features which are rRNAs 22 | map geneNames, geneSeqs; 23 | map geneLengths, geneCodingLengths; 24 | map exonLengths; 25 | std::map> exonsForGene; 26 | std::vector geneList, exonList; 27 | map exon_names; 28 | 29 | 30 | ifstream& operator>>(ifstream &in, Feature &out) 31 | { 32 | static std::unordered_set geneIds, exonIds; 33 | try{ 34 | string line; 35 | while(getline(in, line)) 36 | { 37 | if(line[0] == '#') continue; //not a feature line 38 | std::istringstream tokenizer(line); 39 | //get chr# 40 | string buffer; 41 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse chromosome. Invalid GTF line: " + line); 42 | out.chromosome = chromosomeMap(buffer); 43 | //get track name 44 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse track. 
Invalid GTF line: " + line); 45 | //get feature type 46 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse feature type. Invalid GTF line: " + line); 47 | if (buffer == "exon") out.type = FeatureType::Exon; 48 | else if (buffer == "gene") out.type = FeatureType::Gene; 49 | else if (buffer == "transcript") out.type = FeatureType::Transcript; 50 | else out.type = FeatureType::Other; 51 | //get start pos 52 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse start. Invalid GTF line: " + line); 53 | out.start = std::stoull(buffer); 54 | //get stop pos 55 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse end. Invalid GTF line: " + line); 56 | out.end = std::stoull(buffer); 57 | //get score 58 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse score. Invalid GTF line: " + line); 59 | //get strand 60 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse strand. Invalid GTF line: " + line); 61 | switch(buffer[0]) 62 | { 63 | case '+': 64 | out.strand = Strand::Forward; 65 | break; 66 | case '-': 67 | out.strand = Strand::Reverse; 68 | break; 69 | default: 70 | out.strand = Strand::Unknown; 71 | } 72 | //get frame 73 | if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse frame. Invalid GTF line: " + line); 74 | //get attributes 75 | if(!getline(tokenizer, buffer)) throw gtfException("Unable to parse attributes. 
Invalid GTF line: " + line); 76 | std::map attributes; 77 | parseAttributes(buffer, attributes); 78 | if ( out.end < out.start) 79 | std::cerr << "Bad feature range:" << out.start << " - " << out.end << std::endl; 80 | if (out.type == FeatureType::Gene && attributes.find("gene_id") != attributes.end()) 81 | { 82 | //Parse gene attributes 83 | out.feature_id = attributes["gene_id"]; 84 | if (geneIds.count(out.feature_id)) throw gtfException(std::string("Detected non-unique Gene ID: "+out.feature_id)); 85 | geneIds.insert(out.feature_id); 86 | geneLengths[out.feature_id] = out.end - out.start + 1; 87 | geneList.push_back(attributes["gene_id"]); 88 | } 89 | if (out.type == FeatureType::Transcript && attributes.find("transcript_id") != attributes.end()) out.feature_id = attributes["transcript_id"]; 90 | if (attributes.find("gene_id") != attributes.end()) out.gene_id = attributes["gene_id"]; 91 | if (out.type == FeatureType::Exon) 92 | { 93 | //Parse exon attributes 94 | if (attributes.find("exon_id") != attributes.end()) 95 | { 96 | out.feature_id = attributes["exon_id"]; 97 | } 98 | else if (attributes.find("gene_id") != attributes.end()) 99 | { 100 | out.feature_id = attributes["gene_id"] + "_" + std::to_string(++exon_names[attributes["gene_id"]]); 101 | std::cerr << "Unnamed exon: Gene: " << attributes["gene_id"] << " Position: [" << out.start << ", " << out.end << "] Inferred Exon Name: " << out.feature_id << std::endl; 102 | } 103 | else throw gtfException(std::string("Exon missing exon_id and gene_id fields: " + line)); 104 | if (exonIds.count(out.feature_id)) throw gtfException(std::string("Detected non-unique Exon ID: "+out.feature_id)); 105 | exonIds.insert(out.feature_id); 106 | exonList.push_back(out.feature_id); 107 | geneCodingLengths[out.gene_id] += 1 + (out.end - out.start); 108 | exonLengths[out.feature_id] = {out.chromosome, out.start, 1 + (out.end - out.start)}; 109 | } 110 | if (attributes.find("transcript_type") != attributes.end()) 
out.transcript_type = attributes["transcript_type"]; 111 | if (attributes.find("gene_name") != attributes.end()) geneNames[out.feature_id] = attributes["gene_name"]; 112 | else if (attributes.find("gene_id") != attributes.end()) geneNames[out.feature_id] = attributes["gene_id"]; 113 | out.ribosomal = boost::regex_search(out.transcript_type, ribosomalPattern); 114 | break; 115 | } 116 | 117 | } 118 | catch(gtfException &e) 119 | { 120 | throw e; 121 | } 122 | catch(std::invalid_argument &e) 123 | { 124 | throw gtfException(std::string("GTF is in an invalid format: ") + e.what()); 125 | } 126 | catch(std::exception &e) 127 | { 128 | throw gtfException(std::string("Uncountered an unknown error while parsing GTF: ")+e.what()); 129 | } 130 | return in; 131 | } 132 | 133 | std::map& parseAttributes(std::string &intake, std::map &attributes) 134 | { 135 | std::istringstream tokenizer(intake); 136 | string buffer; 137 | while (getline(tokenizer, buffer, ';')) 138 | { 139 | std::istringstream splitter(buffer); 140 | string current; 141 | getline(splitter, current, '"'); 142 | string key = current.substr(0, current.length()-1); 143 | while (key[0] == ' ' or key[0] == '\t') key = key.substr(1); 144 | getline(splitter, current, '"'); 145 | attributes[key] = current; 146 | } 147 | return attributes; 148 | } 149 | 150 | bool operator==(const Feature &a, const Feature &b) 151 | { 152 | if (a.start != b.start) return false; 153 | if (a.end != b.end) return false; 154 | if (a.chromosome != b.chromosome) return false; 155 | if (a.strand != b.strand) return false; 156 | if (a.type != b.type) return false; 157 | if (a.feature_id != b.feature_id) return false; 158 | return a.transcript_type == b.transcript_type; 159 | } 160 | 161 | bool compIntervalStart(const Feature &a, const Feature &b) 162 | { 163 | return a.start < b.start; 164 | } 165 | 166 | bool compIntervalEnd(const Feature &a, const Feature &b) 167 | { 168 | return a.end < b.end; 169 | } 170 | 171 | bool intersectPoint(const 
Feature &a, const coord x) 172 | { 173 | return (x >= a.start) && (x <= a.end); 174 | } 175 | 176 | bool intersectInterval(const Feature &a, const Feature &b) 177 | { 178 | return intersectPoint(a, b.start) || intersectPoint(a, b.end) || intersectPoint(b, a.start); 179 | } 180 | 181 | int partialIntersect(const Feature &target, const Feature &query) 182 | { 183 | return intersectInterval(target, query) ? ( 184 | 1+std::min(target.end, query.end-1) - std::max(target.start, query.start) 185 | ) : 0; 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /test_data/Makefile.osx: -------------------------------------------------------------------------------- 1 | #Set inclusion paths here (if boost, bamtools, or args are installed outside your path) 2 | INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/ 3 | #Set library paths here (if boost or bamtools are installed outside your path) 4 | LIBRARY_PATHS= 5 | #Set to 0 if you encounter linker errors regarding strings from the bamtools library 6 | ABI=1 7 | #Provide full paths here to .a archives for libraries which should be statically linked 8 | STATIC_LIBS=/usr/local/lib/libboost_filesystem.a /usr/local/lib/libboost_regex.a /usr/local/lib/libboost_system.a $(ZLIB_PATH) SeqLib/lib/libhts.a $(LZMA_PATH) /usr/local/opt/bzip2/lib/libbz2.a 9 | #List of remaining libraries that will be dynamically linked 10 | LIBS= 11 | 12 | CC=g++ 13 | STDLIB=-std=c++14 14 | CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3 15 | SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp 16 | SRCDIR=src 17 | OBJECTS=$(SOURCES:.cpp=.o) 18 | SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) 19 | SHELL=/bin/bash 20 | 21 | rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a 22 | $(CC) -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS) 23 | 24 | %.o: %.cpp 25 | $(CC) $(CFLAGS) -I. 
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
STATIC_LIBS=/usr/local/lib/libboost_filesystem.a /usr/local/lib/libboost_regex.a /usr/local/lib/libboost_system.a $(ZLIB_PATH) SeqLib/lib/libhts.a $(LZMA_PATH) /usr/local/opt/bzip2/lib/libbz2.a
#List of remaining libraries that will be dynamically linked
LIBS=

CC=g++
STDLIB=-std=c++14
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#bash is required: the test recipes use <(...) process substitution
SHELL=/bin/bash

#Test harness settings, shared by the canned recipes below
TEST_OUT=.test_output
TEST_DIFF=python3 test_data/approx_diff.py
TEST_ZCAT=gzcat
#Extra flags appended to every approx_diff.py comparison
TEST_DIFF_FLAGS=-t

rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)

%.o: %.cpp
	$(CC) $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<

SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install

.PHONY: clean test test-version test-single test-chr1 test-downsampled test-crams test-legacy test-expected-failures

# -f keeps "make clean" from failing when there are no object files, in which
# case the wildcard expands to nothing and plain rm would receive no operands.
clean:
	rm -f $(wildcard $(SRCDIR)/*.o)

# The rest of the makefile consists of test cases. Run "make test" to perform all tests

# $(call check_metrics,SAMPLE,EXPECTED_DIR)
# Compare $(TEST_OUT)/SAMPLE.metrics.tsv against test_data/EXPECTED_DIR/SAMPLE.metrics.tsv
define check_metrics
$(TEST_DIFF) $(TEST_OUT)/$(1).metrics.tsv test_data/$(2)/$(1).metrics.tsv -m metrics -c $(1) $(1)_ $(TEST_DIFF_FLAGS)
endef

# $(call check_tables,SAMPLE,EXPECTED_PREFIX)
# Compare the four GCT tables written for SAMPLE against test_data/EXPECTED_PREFIX.*.gct.gz
define check_tables
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_reads.gct <($(TEST_ZCAT) test_data/$(2).gene_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_tpm.gct <($(TEST_ZCAT) test_data/$(2).gene_tpm.gct.gz) -m tables -c TPM TPM_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).exon_reads.gct <($(TEST_ZCAT) test_data/$(2).exon_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_fragments.gct <($(TEST_ZCAT) test_data/$(2).gene_fragments.gct.gz) -m tables -c Fragments Fragments_ $(TEST_DIFF_FLAGS)
endef

# $(call check_coverage,SAMPLE,EXPECTED_PREFIX)
# Rewrite "-nan" as "nan" in the coverage table, then compare mean, std and CV columns
define check_coverage
sed s/-nan/nan/g $(TEST_OUT)/$(1).coverage.tsv > $(TEST_OUT)/coverage.tsv
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_mean coverage_mean_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_std coverage_std_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_CV coverage_CV_ $(TEST_DIFF_FLAGS)
endef

# $(call check_fragments,SAMPLE,EXPECTED_DIR)
define check_fragments
$(TEST_DIFF) $(TEST_OUT)/$(1).fragmentSizes.txt test_data/$(2)/$(1).fragmentSizes.txt -m fragments -c Count Count_ $(TEST_DIFF_FLAGS)
endef

test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete

test-version: rnaseqc
	[ ! -z "$(shell ./rnaseqc --version)" ]

test-single: rnaseqc
	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam $(TEST_OUT)
	$(call check_metrics,single_pair.bam,single_pair.output)
	$(call check_tables,single_pair.bam,single_pair.output/single_pair.bam)
	rm -rf $(TEST_OUT)

test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam $(TEST_OUT) --coverage
	$(call check_metrics,chr1.bam,chr1.output)
	$(call check_tables,chr1.bam,chr1.output/chr1.bam)
	$(call check_coverage,chr1.bam,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT)
	$(call check_metrics,downsampled.bam,downsampled.output)
	$(call check_tables,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,downsampled.output)
	rm -rf $(TEST_OUT)

# CRAM output is checked against the BAM-derived expected tables and coverage
test-crams: rnaseqc
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram $(TEST_OUT) --coverage --fasta test_data/chr1.fasta
	$(call check_metrics,chr1.cram,chr1.output)
	$(call check_tables,chr1.cram,chr1.output/chr1.bam)
	$(call check_coverage,chr1.cram,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT) --legacy
	$(call check_metrics,downsampled.bam,legacy.output)
	$(call check_tables,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,legacy.output)
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.gene_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	python3 python/rnaseqc/legacy_exon_remap.py $(TEST_OUT)/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.exon_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf $(TEST_OUT)

# rnaseqc is expected to exit with status 11 on this input
test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam $(TEST_OUT) 2>/dev/null; test $$? -eq 11
	rm -rf $(TEST_OUT)
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
STATIC_LIBS=
#List of remaining libraries that will be dynamically linked
LIBS= -lboost_filesystem -lboost_regex -lboost_system -lz -llzma -lbz2 -lpthread

CC=g++
STDLIB=-std=c++14
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#bash is required: the test recipes use <(...) process substitution
SHELL=/bin/bash

#Test harness settings, shared by the canned recipes below
TEST_OUT=.test_output
TEST_DIFF=python3 test_data/approx_diff.py
#"gzip -dc" is used instead of gzcat/zcat because it behaves the same on
#Linux and macOS, whereas gzcat is usually missing on Linux
TEST_ZCAT=gzip -dc
#Extra flags appended to every approx_diff.py comparison (none by default)
TEST_DIFF_FLAGS=

rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)

.PHONY: lib clean test test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures

# Bundle the project's object files into a static archive
lib: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file))
	ar -rcs rnaseqc.a $^

%.o: %.cpp
	$(CC) $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<

SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install

# -f keeps "make clean" from failing when there are no object files, in which
# case the wildcard expands to nothing and plain rm would receive no operands.
clean:
	rm -f $(wildcard $(SRCDIR)/*.o)

# The rest of the makefile consists of test cases. Run "make test" to perform all tests

# $(call check_metrics,SAMPLE,EXPECTED_DIR)
# Compare $(TEST_OUT)/SAMPLE.metrics.tsv against test_data/EXPECTED_DIR/SAMPLE.metrics.tsv
define check_metrics
$(TEST_DIFF) $(TEST_OUT)/$(1).metrics.tsv test_data/$(2)/$(1).metrics.tsv -m metrics -c $(1) $(1)_ $(TEST_DIFF_FLAGS)
endef

# $(call check_tables,SAMPLE,EXPECTED_PREFIX)
# Compare the four GCT tables written for SAMPLE against test_data/EXPECTED_PREFIX.*.gct.gz
define check_tables
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_reads.gct <($(TEST_ZCAT) test_data/$(2).gene_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_tpm.gct <($(TEST_ZCAT) test_data/$(2).gene_tpm.gct.gz) -m tables -c TPM TPM_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).exon_reads.gct <($(TEST_ZCAT) test_data/$(2).exon_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_fragments.gct <($(TEST_ZCAT) test_data/$(2).gene_fragments.gct.gz) -m tables -c Fragments Fragments_ $(TEST_DIFF_FLAGS)
endef

# $(call check_coverage,SAMPLE,EXPECTED_PREFIX)
# Rewrite "-nan" as "nan" in the coverage table, then compare mean, std and CV columns
define check_coverage
sed s/-nan/nan/g $(TEST_OUT)/$(1).coverage.tsv > $(TEST_OUT)/coverage.tsv
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_mean coverage_mean_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_std coverage_std_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_CV coverage_CV_ $(TEST_DIFF_FLAGS)
endef

# $(call check_fragments,SAMPLE,EXPECTED_DIR)
define check_fragments
$(TEST_DIFF) $(TEST_OUT)/$(1).fragmentSizes.txt test_data/$(2)/$(1).fragmentSizes.txt -m fragments -c Count Count_ $(TEST_DIFF_FLAGS)
endef

test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete

test-version: rnaseqc
	[ ! -z "$(shell ./rnaseqc --version)" ]

test-single: rnaseqc
	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam $(TEST_OUT)
	$(call check_metrics,single_pair.bam,single_pair.output)
	$(call check_tables,single_pair.bam,single_pair.output/single_pair.bam)
	rm -rf $(TEST_OUT)

test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam $(TEST_OUT) --coverage
	$(call check_metrics,chr1.bam,chr1.output)
	$(call check_tables,chr1.bam,chr1.output/chr1.bam)
	$(call check_coverage,chr1.bam,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT)
	$(call check_metrics,downsampled.bam,downsampled.output)
	$(call check_tables,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,downsampled.output)
	rm -rf $(TEST_OUT)

test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT) --legacy
	$(call check_metrics,downsampled.bam,legacy.output)
	$(call check_tables,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,legacy.output)
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.gene_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC
	python3 python/rnaseqc/legacy_exon_remap.py $(TEST_OUT)/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.exon_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf $(TEST_OUT)

# CRAM output is checked against the BAM-derived expected tables and coverage
test-crams: rnaseqc
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram $(TEST_OUT) --coverage --fasta test_data/chr1.fasta
	$(call check_metrics,chr1.cram,chr1.output)
	$(call check_tables,chr1.cram,chr1.output/chr1.bam)
	$(TEST_DIFF) $(TEST_OUT)/chr1.cram.gc_content.tsv test_data/chr1.output/chr1.cram.gc_content.tsv -m metrics -c Count Count_ $(TEST_DIFF_FLAGS)
	$(call check_coverage,chr1.cram,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

# rnaseqc is expected to exit with status 11 on this input
test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam $(TEST_OUT) 2>/dev/null; test $$? -eq 11
	rm -rf $(TEST_OUT)
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
STATIC_LIBS=SeqLib/lib/libhts.a /usr/lib/x86_64-linux-gnu/libboost_filesystem.a /usr/lib/x86_64-linux-gnu/libboost_regex.a /usr/lib/x86_64-linux-gnu/libboost_system.a /usr/lib/x86_64-linux-gnu/libz.a /usr/lib/x86_64-linux-gnu/liblzma.a /usr/lib/x86_64-linux-gnu/libbz2.a /usr/lib/gcc/x86_64-linux-gnu/9*/libstdc++.a
#List of remaining libraries that will be dynamically linked
LIBS=-lpthread

CC=g++
STDLIB=-std=c++14
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#bash is required: the test recipes use <(...) process substitution
SHELL=/bin/bash

#Test harness settings, shared by the canned recipes below
TEST_OUT=.test_output
TEST_DIFF=python3 test_data/approx_diff.py
TEST_ZCAT=zcat
#Extra flags appended to every approx_diff.py comparison (none by default)
TEST_DIFF_FLAGS=

rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -static -static-libstdc++ -static-libgcc -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)

%.o: %.cpp
	$(CC) -static -static-libstdc++ -static-libgcc $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<

SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install

.PHONY: clean test test-version test-single test-chr1 test-downsampled test-crams test-legacy test-expected-failures

# -f keeps "make clean" from failing when there are no object files, in which
# case the wildcard expands to nothing and plain rm would receive no operands.
clean:
	rm -f $(wildcard $(SRCDIR)/*.o)

# The rest of the makefile consists of test cases. Run "make test" to perform all tests

# $(call check_metrics,SAMPLE,EXPECTED_DIR)
# Compare $(TEST_OUT)/SAMPLE.metrics.tsv against test_data/EXPECTED_DIR/SAMPLE.metrics.tsv
define check_metrics
$(TEST_DIFF) $(TEST_OUT)/$(1).metrics.tsv test_data/$(2)/$(1).metrics.tsv -m metrics -c $(1) $(1)_ $(TEST_DIFF_FLAGS)
endef

# $(call check_tables,SAMPLE,EXPECTED_PREFIX)
# Compare the four GCT tables written for SAMPLE against test_data/EXPECTED_PREFIX.*.gct.gz
define check_tables
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_reads.gct <($(TEST_ZCAT) test_data/$(2).gene_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_tpm.gct <($(TEST_ZCAT) test_data/$(2).gene_tpm.gct.gz) -m tables -c TPM TPM_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).exon_reads.gct <($(TEST_ZCAT) test_data/$(2).exon_reads.gct.gz) -m tables -c Counts Counts_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/$(1).gene_fragments.gct <($(TEST_ZCAT) test_data/$(2).gene_fragments.gct.gz) -m tables -c Fragments Fragments_ $(TEST_DIFF_FLAGS)
endef

# $(call check_coverage,SAMPLE,EXPECTED_PREFIX)
# Rewrite "-nan" as "nan" in the coverage table, then compare mean, std and CV columns
define check_coverage
sed s/-nan/nan/g $(TEST_OUT)/$(1).coverage.tsv > $(TEST_OUT)/coverage.tsv
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_mean coverage_mean_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_std coverage_std_ $(TEST_DIFF_FLAGS)
$(TEST_DIFF) $(TEST_OUT)/coverage.tsv test_data/$(2).coverage.tsv -m metrics -c coverage_CV coverage_CV_ $(TEST_DIFF_FLAGS)
endef

# $(call check_fragments,SAMPLE,EXPECTED_DIR)
define check_fragments
$(TEST_DIFF) $(TEST_OUT)/$(1).fragmentSizes.txt test_data/$(2)/$(1).fragmentSizes.txt -m fragments -c Count Count_ $(TEST_DIFF_FLAGS)
endef

test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete

test-version: rnaseqc
	[ ! -z "$(shell ./rnaseqc --version)" ]

test-single: rnaseqc
	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam $(TEST_OUT)
	$(call check_metrics,single_pair.bam,single_pair.output)
	$(call check_tables,single_pair.bam,single_pair.output/single_pair.bam)
	rm -rf $(TEST_OUT)

test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam $(TEST_OUT) --coverage
	$(call check_metrics,chr1.bam,chr1.output)
	$(call check_tables,chr1.bam,chr1.output/chr1.bam)
	$(call check_coverage,chr1.bam,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT)
	$(call check_metrics,downsampled.bam,downsampled.output)
	$(call check_tables,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,downsampled.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,downsampled.output)
	rm -rf $(TEST_OUT)

# CRAM output is checked against the BAM-derived expected tables and coverage
test-crams: rnaseqc
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram $(TEST_OUT) --coverage --fasta test_data/chr1.fasta
	$(call check_metrics,chr1.cram,chr1.output)
	$(call check_tables,chr1.cram,chr1.output/chr1.bam)
	$(call check_coverage,chr1.cram,chr1.output/chr1.bam)
	rm -rf $(TEST_OUT)

test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage $(TEST_OUT) --legacy
	$(call check_metrics,downsampled.bam,legacy.output)
	$(call check_tables,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_coverage,downsampled.bam,legacy.output/downsampled.bam)
	$(call check_fragments,downsampled.bam,legacy.output)
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.gene_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC
	python3 python/rnaseqc/legacy_exon_remap.py $(TEST_OUT)/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	$(TEST_DIFF) $(TEST_OUT)/downsampled.bam.exon_reads.gct <($(TEST_ZCAT) test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf $(TEST_OUT)

# rnaseqc is expected to exit with status 11 on this input
test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam $(TEST_OUT) 2>/dev/null; test $$? -eq 11
	rm -rf $(TEST_OUT)
def _save_figure(output_dir, prefix, name, dpi):
    """Save the current matplotlib figure as '<output_dir>/<prefix>.<name>.pdf'.

    Does nothing when output_dir is None, so callers can invoke it
    unconditionally after each plot.
    """
    if output_dir is not None:
        plt.savefig(os.path.join(output_dir, f'{prefix}.{name}.pdf'), dpi=dpi)


def plot_qc_figures(metrics_df, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None,
                    insertsize_df=None, gc_content_df=None, tpm_df=None,
                    thresholds=None, lims=None, outlier_method='threshold',
                    show_legend=True, legend_cols=5, lw=4, lh=1, ms=12, alpha=1, show_xticklabels=False,
                    highlight_ids=None, prefix=None, output_dir=None, dpi=300):
    """
    Plot the QC figures for an RNA-SeQC metrics table and optionally save them as PDFs.

    metrics_df: output from RNA-SeQC (sample IDs in the index, one column per metric)
    cohort_s: mapping of sample ID to cohort/cluster/etc.; must cover every sample
              in metrics_df. Defaults to a single 'All samples' cohort.
    cohort_order: order in which cohorts are drawn; defaults to the sorted unique cohorts
    cohort_colors: mapping of cohort to color; defaults to get_cohort_colors(cohorts)
    date_s: optional mapping of sample ID to date; must cover every sample in metrics_df
    insertsize_df, gc_content_df, tpm_df: optional tables; each enables the
              corresponding figure when supplied
    thresholds: dict of metric name -> threshold, merged over the built-in defaults
    lims: dict of metric name -> y-axis limits, merged over the built-in defaults
    outlier_method, ms, alpha, show_xticklabels, highlight_ids: forwarded to metrics()
    show_legend, legend_cols, lw, lh: cohort legend toggle, column count and figure size
    prefix, output_dir, dpi: when output_dir is given, each figure is saved to
              '<output_dir>/<prefix>.<figure name>.pdf'; prefix is then required
    """
    if cohort_s is None:
        cohort_s = pd.Series('All samples', index=metrics_df.index)
    else:
        assert metrics_df.index.isin(cohort_s.index).all() and cohort_s.loc[metrics_df.index].notnull().all(), \
            'cohort_s must provide a non-null cohort for every sample in metrics_df'

    if date_s is not None:
        assert metrics_df.index.isin(date_s.index).all() and date_s.loc[metrics_df.index].notnull().all(), \
            'date_s must provide a non-null date for every sample in metrics_df'

    if output_dir is not None:
        assert prefix is not None, 'prefix is required when output_dir is specified'

    cohorts = np.unique(cohort_s.loc[metrics_df.index])
    if cohort_colors is None:
        cohort_colors = get_cohort_colors(cohorts)

    # NOTE: built before the cohort_order default below is applied, so metrics()
    # receives the caller's cohort_order unchanged (possibly None).
    metrics_args = {
        'cohort_s': cohort_s,
        'cohort_order': cohort_order,
        'cohort_colors': cohort_colors,
        'date_s': date_s,
        'show_xticklabels': show_xticklabels,
        'ms': ms,
        'alpha': alpha,
        'highlight_ids': highlight_ids,
        'aw': 6,
        'ah': 2,
    }

    metrics_list = [
        'Mapped Reads',  # Unique Mapping, Vendor QC Passed Reads that were mapped
        'Mapping Rate',
        'Duplicate Rate of Mapped',
        'Exonic Rate',
        'Intronic Rate',
        'Intergenic Rate',
        'Chimeric Alignment Rate',
        'rRNA Rate',
        # 'Mapped Unique Reads',  # Duplicate Rate of Mapped is more representative
        "Median 3' bias",
        'Median Exon CV',
        'Fragment GC Content Mean',
        'Average Fragment Length',
    ]

    # side of the threshold that is flagged: 'lt' = below, 'gt' = above
    # (interpretation is up to metrics(), which receives this value)
    threshold_dir_dict = {
        'Mapped Reads': 'lt',
        'Mapping Rate': 'lt',
        'Duplicate Rate of Mapped': 'gt',
        'Exonic Rate': 'lt',
        'Intronic Rate': 'gt',
        'Intergenic Rate': 'gt',
        'Chimeric Alignment Rate': 'gt',
        'rRNA Rate': 'gt',
        "Median 3' bias": 'gt',
        'Median Exon CV': 'gt',
        'Average Fragment Length': 'lt',
    }

    threshold_dict = {
        'Mapped Reads': 50e6,
        'Mapping Rate': 0.9,
        # 'Duplicate Rate of Mapped': 0.6,
        'Exonic Rate': 0.7,
        'Intronic Rate': 0.05,
        'Intergenic Rate': 0.1,
        'Chimeric Alignment Rate': 0.01,
        'rRNA Rate': 0.1,
    }
    if thresholds is not None:
        threshold_dict.update(thresholds)

    ylim_dict = {
        'Mapped Reads': None,
        'Mapping Rate': [0,1],
        'Duplicate Rate of Mapped': [0, 1],
        'Exonic Rate': [0, 1],
        'Intronic Rate': [0, 1],
        'Intergenic Rate': [0, 1],
        'Chimeric Alignment Rate': [0, 0.1],
        'rRNA Rate': [0, 1],
        "Median 3' bias": [0, 1],
        'Median Exon CV': None,
        'Fragment GC Content Mean': [0, 1],
        'Average Fragment Length': None,
    }

    if lims is not None:
        ylim_dict.update(lims)

    if cohort_order is None:
        cohort_order = cohorts

    # plot cohort legend
    if show_legend:
        ax = qtl.plot.setup_figure(lw, lh, xspace=[0,0], yspace=[0,0])
        # NaN coordinates draw nothing; the points exist only to create legend entries
        for c in cohort_order:
            ax.scatter(np.nan, np.nan, s=48, marker='s', color=cohort_colors[c], label=c)
        ax.scatter(np.nan, np.nan, fc='w', ec='k', lw=1, s=30, label='Outliers')
        ax.legend(loc='center left', handlelength=1, ncol=legend_cols)
        plt.axis('off')
        _save_figure(output_dir, prefix, 'legend', dpi)

    # distributions for selected/key metrics
    for metric in metrics_list:
        # skip metrics that are absent or all zero
        if metric in metrics_df and not (metrics_df[metric] == 0).all():
            if metric == 'Duplicate Rate of Mapped' and 'Duplicate Rate of Mapped, excluding Globins' in metrics_df:
                # prefer the globin-excluded rate when available, under the generic name
                metric_s = metrics_df['Duplicate Rate of Mapped, excluding Globins'].rename('Duplicate Rate of Mapped')
            else:
                metric_s = metrics_df[metric]
            metrics(metric_s, ylim=ylim_dict[metric],
                    threshold=threshold_dict.get(metric, None),
                    threshold_dir=threshold_dir_dict.get(metric, None),
                    outlier_method=outlier_method,
                    **metrics_args)
            _save_figure(output_dir, prefix,
                         metric.lower().replace("3'",'3prime').replace(' ','_'), dpi)

    # genes detected vs bias and duplication rate
    if "Median 3' bias" in metrics_df:
        c = 'Duplicate Rate of Mapped, excluding Globins' if 'Duplicate Rate of Mapped, excluding Globins' in metrics_df else 'Duplicate Rate of Mapped'
        detection_bias(metrics_df, bias_metric="Median 3' bias", c=c)
        _save_figure(output_dir, prefix, 'genes_detected_vs_median_3prime_bias', dpi)

    # mismatch rates (skipped when the column is absent or entirely null)
    if 'End 1 Mismatch Rate' in metrics_df and not metrics_df['End 1 Mismatch Rate'].isnull().all():
        # Accept thresholds keyed by the column name ('End 1 Mismatch Rate') as well
        # as the historical lower-case spelling, so neither is silently ignored.
        end1_threshold = threshold_dict.get('End 1 Mismatch Rate',
                                            threshold_dict.get('End 1 mismatch rate', None))
        end2_threshold = threshold_dict.get('End 2 Mismatch Rate',
                                            threshold_dict.get('End 2 mismatch rate', None))
        mismatch_rates(metrics_df, cohort_s=cohort_s, cohort_order=cohort_order, cohort_colors=cohort_colors,
                       end1_threshold=end1_threshold,
                       end2_threshold=end2_threshold)
        _save_figure(output_dir, prefix, 'end_mismatch_rates', dpi)

    # NOTE(review): assumed to run regardless of the mismatch-rate guard above -- confirm
    mapping_sense(metrics_df, cohort_s=cohort_s, cohort_order=cohort_order,
                  cohort_colors=cohort_colors, date_s=date_s, width=1)
    _save_figure(output_dir, prefix, 'mapping_sense', dpi)

    # insert size distributions (if supplied)
    if insertsize_df is not None:
        insert_sizes(insertsize_df, cohort_s=cohort_s, cohort_order=cohort_order,
                     cohort_colors=cohort_colors, sort_order='cohort')
        _save_figure(output_dir, prefix, 'insert_sizes', dpi)

    # GC content distributions (if supplied)
    if gc_content_df is not None:
        gc_content(gc_content_df, cohort_s=cohort_s, cohort_colors=cohort_colors,
                   cohort_order=cohort_order, sort_order='cohort')
        _save_figure(output_dir, prefix, 'gc_content', dpi)

    # cumulative expression (if supplied)
    if tpm_df is not None:
        cdf_df = calculate_expression_cdfs(tpm_df)
        # fewer than 50 columns: request 'lines' mode; otherwise 'ci' mode
        mode = 'lines' if tpm_df.shape[1] < 50 else 'ci'
        cumulative_expression(cdf_df, cohort_s=cohort_s, cohort_colors=cohort_colors, mode=mode)
        _save_figure(output_dir, prefix, 'cumulative_expression', dpi)
def _load_annotation(path, sample_ids, label):
    """Read a per-sample annotation file and check that it covers `sample_ids`.

    The file is read as a headerless TSV whose first column is used as the
    index; a single remaining column is squeezed to a Series.

    Raises
    ------
    AssertionError
        If any entry of `sample_ids` is absent from the annotation index.
    """
    annotation_s = pd.read_csv(path, sep='\t', index_col=0, header=None).squeeze('columns')
    missing = sample_ids[~sample_ids.isin(annotation_s.index)]
    if len(missing) > 0:
        # Raised explicitly (instead of a bare `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        raise AssertionError(
            f"{label} annotation '{path}' is missing {len(missing)} sample(s) "
            f"present in the metrics table, e.g. {missing[:5].tolist()}"
        )
    return annotation_s


def load_inputs(args):
    """Load the tables referenced by the parsed command-line arguments.

    Parameters
    ----------
    args : namespace
        Must expose `metrics`, `tpm`, `cohort`, `date` and `insert_size`.
        `metrics` is a path to a `.parquet` file or a TSV with sample IDs in
        the first column; the other attributes are paths or None.

    Returns
    -------
    tuple
        `(metrics_df, tpm_df, cohort_s, date_s, insertsize_df)`. Every entry
        whose path was None is returned as None.

    Raises
    ------
    AssertionError
        If the cohort or date annotation does not cover every sample in the
        metrics table.
    """
    if args.metrics.endswith('.parquet'):
        metrics_df = pd.read_parquet(args.metrics)
    else:
        metrics_df = pd.read_csv(args.metrics, sep='\t', index_col=0)

    tpm_df = None
    if args.tpm is not None:
        tpm_df = qtl.io.read_gct(args.tpm, load_description=False)

    cohort_s = None
    if args.cohort is not None:
        cohort_s = _load_annotation(args.cohort, metrics_df.index, 'Cohort')

    date_s = None
    if args.date is not None:
        date_s = _load_annotation(args.date, metrics_df.index, 'Date')

    insertsize_df = None
    if args.insert_size is not None:
        insertsize_df = pd.read_csv(args.insert_size, sep='\t', index_col=0)

    return metrics_df, tpm_df, cohort_s, date_s, insertsize_df
TSV file mapping sample IDs to dates.') 225 | parser.add_argument('--output-dir', default='.', help='If specified, figures are saved here.') 226 | parser.add_argument('--dpi', type=int, default=300, help='Figure resolution.') 227 | args = parser.parse_args() 228 | 229 | metrics_df, tpm_df, cohort_s, date_s, insertsize_df = load_inputs(args) 230 | 231 | plot_qc_figures(metrics_df, cohort_s=cohort_s, cohort_colors=None, date_s=date_s, 232 | prefix=args.prefix, output_dir=args.output_dir, dpi=args.dpi, show_legend=True, 233 | ms=12, alpha=1, show_xticklabels=False, highlight_ids=None, 234 | thresholds=None, insertsize_df=insertsize_df, tpm_df=tpm_df) 235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RNA-SeQC 2 | 3 | [![Version](https://img.shields.io/github/release/getzlab/rnaseqc.svg?label=Version)](https://github.com/getzlab/rnaseqc/releases) 4 | [![CI](https://github.com/getzlab/rnaseqc/workflows/CI/badge.svg)](https://github.com/getzlab/rnaseqc/actions) 5 | 6 | RNA-SeQC 2 is described in [A. Graubert*, F. Aguet*, A. Ravi, K.G. Ardlie, Gad Getz, "RNA-SeQC 2: efficient RNA-seq quality control and quantification for large cohorts," *Bioinformatics*, 2021](https://doi.org/10.1093/bioinformatics/btab135). 7 | 8 | ## Installing 9 | 10 | The latest stable build of RNA-SeQC is available on the [GitHub Releases](https://github.com/getzlab/rnaseqc/releases) page, and contains static binaries for Linux and OSX. 11 | 12 | RNA-SeQC is also available as a docker image: `gcr.io/broad-cga-aarong-gtex/rnaseqc:latest` which is automatically updated with any code change. 13 | Older versions of the docker image are tagged using the full commit SHA of any commit which introduced a code change. 14 | 15 | To checkout the source of RNA-SeQC run `git clone --recursive https://github.com/getzlab/rnaseqc.git`. 
16 | If you do not use the `--recursive` flag, you'll need to run `git submodule update --init --recursive` or you will be missing [SeqLib](https://github.com/walaj/SeqLib). 17 | 18 | #### Unit Tests 19 | 20 | Input data for RNA-SeQC's testing suite is not stored in the repository due to 21 | size constraints. The current test data is available [here](https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz), and must be unpacked within the `test_data/` directory. 22 | Please note that the location of the test data is subject to change. 23 | The test resources use **~1.2 GB** of space. 24 | 25 | You can download and unpack test data with: 26 | 27 | ``` 28 | cd test_data 29 | wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz 30 | tar xzf test_inputs.tar.gz 31 | ``` 32 | 33 | You can run the unit tests with `make test` 34 | 35 | ## Usage 36 | 37 | **NOTE**: This tool requires that the provided GTF be collapsed in such a way that there are no overlapping transcripts **on the same strand** and that each gene have a single transcript whose id matches the parent gene id. This is **not** a transcript-quantification method. Readcounts and coverage are made towards exons and genes only if *all* aligned segments of a read fully align to exons of a gene, but keep in mind that coverage may be counted towards multiple transcripts (and its exons) if these criteria are met. Beyond this, no attempt will be made to disambiguate which transcript a read belongs to. 
38 | You can collapse an existing GTF using the [GTEx collapse annotation script](https://github.com/broadinstitute/gtex-pipeline/tree/master/gene_model) 39 | 40 | ### Command Line Usage: 41 | 42 | `rnaseqc [OPTIONS] gtf bam output` 43 | 44 | Example: `./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .` 45 | 46 | ###### OPTIONS: 47 | -h, --help Display this message and quit 48 | 49 | --version Display the version and quit 50 | 51 | gtf The input GTF file containing features 52 | to check the bam against 53 | 54 | bam The input SAM/BAM file containing reads 55 | to process 56 | 57 | output Output directory 58 | 59 | -s[sample], --sample=[sample] The name of the current sample. Default: 60 | The bam's filename 61 | 62 | --bed=[BEDFILE] Optional input BED file containing 63 | non-overlapping exons used for fragment 64 | size calculations 65 | 66 | --fasta=[fasta] Optional input FASTA/FASTQ file 67 | containing the reference sequence used 68 | for parsing CRAM files 69 | 70 | --chimeric-distance=[DISTANCE] Set the maximum accepted distance 71 | between read mates. Mates beyond this 72 | distance will be counted as chimeric 73 | pairs. Default: 2000000 [bp] 74 | 75 | --fragment-samples=[SAMPLES] Set the number of samples to take when 76 | computing fragment sizes. Requires the 77 | --bed argument. Default: 1000000 78 | 79 | -q[QUALITY], 80 | --mapping-quality=[QUALITY] Set the lower bound on read quality for 81 | exon coverage counting. Reads below this 82 | number are excluded from coverage 83 | metrics. Default: 255 84 | 85 | --base-mismatch=[MISMATCHES] Set the maximum number of allowed 86 | mismatches between a read and the 87 | reference sequence. Reads with more than 88 | this number of mismatches are excluded 89 | from coverage metrics. Default: 6 90 | 91 | --offset=[OFFSET] Set the offset into the gene for the 3' 92 | and 5' windows in bias calculation. 
A 93 | positive value shifts the 3' and 5' 94 | windows towards each other, while a 95 | negative value shifts them apart. 96 | Default: 150 [bp] 97 | 98 | --window-size=[SIZE] Set the size of the 3' and 5' windows in 99 | bias calculation. Default: 100 [bp] 100 | 101 | --gene-length=[LENGTH] Set the minimum size of a gene for bias 102 | calculation. Genes below this size are 103 | ignored in the calculation. Default: 600 104 | [bp] 105 | 106 | --legacy Use legacy counting rules. Gene and exon 107 | counts match output of RNA-SeQC 1.1.9 108 | 109 | --stranded=[stranded] Use strand-specific metrics. Only 110 | features on the same strand of a read 111 | will be considered. Allowed values are 112 | 'RF', 'rf', 'FR', and 'fr' 113 | 114 | -v, --verbose Give some feedback about what's going 115 | on. Supply this argument twice for 116 | progress updates while parsing the bam 117 | 118 | -t[TAG...], --tag=[TAG...] Filter out reads with the specified tag. 119 | 120 | --chimeric-tag=[TAG] Reads marked with the specified tag will 121 | be labeled as Chimeric. Defaults to 'ch' 122 | for STAR 123 | 124 | --exclude-chimeric Exclude chimeric reads from the read 125 | counts 126 | 127 | -u, --unpaired Allow unpaired reads to be quantified. 128 | Required for single-end libraries 129 | 130 | --rpkm Output gene RPKM values instead of TPMs 131 | 132 | --coverage If this flag is provided, coverage 133 | statistics for each transcript will be 134 | written to a table. Otherwise, only 135 | summary coverage statistics are 136 | generated and added to the metrics table 137 | 138 | --coverage-mask=[SIZE] Sets how many bases at both ends of a 139 | transcript are masked out when computing 140 | per-base exon coverage. Default: 500bp 141 | 142 | -d[threshold], 143 | --detection-threshold=[threshold] Number of counts on a gene to consider 144 | the gene 'detected'. Additionally, genes 145 | below this limit are excluded from 3' 146 | bias computation.
Default: 5 reads 147 | 148 | "--" can be used to terminate flag options and force all following 149 | arguments to be treated as positional options 150 | 151 | ### Output files: 152 | The following output files are generated in the output directory you provide: 153 | * {sample}.metrics.tsv : A tab-delimited list of (Statistic, Value) pairs of all statistics and metrics recorded. 154 | * {sample}.exon_reads.gct : A tab-delimited GCT file with (Exon ID, Gene Name, coverage) tuples for all exons which had at least part of one read mapped. 155 | * {sample}.gene_reads.gct : A tab-delimited GCT file with (Gene ID, Gene Name, coverage) tuples for all genes which had at least one read map to at least one of its exons. This file contains the gene-level read counts used, e.g., for differential expression analyses. 156 | * {sample}.gene_tpm.gct : A tab-delimited GCT file with (Gene ID, Gene Name, TPM) tuples for all genes reported in the gene_reads.gct file, with expression values in transcript per million (TPM) units. Note: this file is renamed to .gene_rpkm.gct if the **--rpkm** flag is present. 157 | * {sample}.fragmentSizes.txt : A list of fragment sizes recorded, if a BED file was provided 158 | * {sample}.coverage.tsv : A tab-delimited list of (Gene ID, Transcript ID, Mean Coverage, Coverage Std, Coverage CV) tuples for all transcripts encountered in the GTF. 159 | 160 | #### Metrics reported: 161 | 162 | See [Metrics.md](Metrics.md) for a description of all metrics reported in the `metrics.tsv`, `coverage.tsv`, and `fragmentSizes.txt` files. 163 | 164 | ### Legacy mode differences 165 | 166 | The **--legacy** flag enables compatibility with RNASeQC 1.1.9. This ensures that exon and gene readcounts match exactly the counts which would have been produced by running that version. This also adds an extra condition to classify reads as chimeric (see "Chimeric Reads", above). Any metrics which existed in 1.1.9 will also match within Java's floating point precision. 
167 | -------------------------------------------------------------------------------- /src/Metrics.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Metrics.cpp 3 | // IntervalTree 4 | // 5 | // Created by Aaron Graubert on 7/5/17. 6 | // Copyright © 2017 Aaron Graubert. All rights reserved. 7 | // 8 | 9 | #include "Metrics.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace rnaseqc { 18 | 19 | 20 | std::map uniqueGeneCounts, geneCounts, exonCounts, geneFragmentCounts; //counters for read coverage of genes and exons 21 | 22 | std::map > fragmentTracker; // tracks fragments encountered by each gene 23 | 24 | std::tuple computeCoverage(Fasta&, std::ofstream&, const Feature&, const unsigned int, const std::map >&, std::map&, BiasCounter&); 25 | 26 | void add_range(std::vector&, coord, unsigned int); 27 | 28 | void Metrics::increment(std::string key) 29 | { 30 | this->counter[key]++; 31 | } 32 | 33 | void Metrics::increment(std::string key, int n) 34 | { 35 | this->counter[key] += n; 36 | } 37 | 38 | unsigned long Metrics::get(std::string key) 39 | { 40 | return this->counter[key]; 41 | } 42 | 43 | double Metrics::frac(std::string a, std::string b) 44 | { 45 | return static_cast(this->get(a)) / this->get(b); 46 | } 47 | 48 | // Add coverage to an exon 49 | void Collector::add(const std::string &gene_id, const std::string &exon_id, const double coverage) 50 | { 51 | if (coverage > 0) 52 | { 53 | this->data[gene_id].push_back(std::pair(exon_id, coverage)); 54 | this->dirty = true; 55 | } 56 | } 57 | 58 | //Commit all the exon coverage from this gene to the global exon coverage counter 59 | void Collector::collect(const std::string &gene_id) 60 | { 61 | for (auto entry = this->data[gene_id].begin(); entry != this->data[gene_id].end(); ++entry) 62 | { 63 | (*this->target)[entry->first] += entry->second; 64 | this->total += entry->second; 65 | } 66 | } 67 | 68 | 
//Legacy version of the above function. Ignores the actual coverage and reports a full read count 69 | void Collector::collectSingle(const std::string &gene_id) 70 | { 71 | for (auto entry = this->data[gene_id].begin(); entry != this->data[gene_id].end(); ++entry) 72 | { 73 | (*this->target)[entry->first] += 1.0; 74 | } 75 | } 76 | 77 | //Check if there is any coverage on any exon of this gene 78 | bool Collector::queryGene(const std::string &gene_id) 79 | { 80 | return static_cast(this->data[gene_id].size()); 81 | } 82 | 83 | // Check if any coverage has been reported whatsoever 84 | bool Collector::isDirty() 85 | { 86 | return this->dirty; 87 | } 88 | 89 | //Get the sum of all coverage that was committed for this read (should always be <= 1) 90 | double Collector::sum() 91 | { 92 | return this->total; 93 | } 94 | 95 | //Adds coverage from one aligned segment of a read to this exon. Coverage feeds into cache until gene leaves search window 96 | void BaseCoverage::add(const Feature &exon, const coord start, const coord end) 97 | { 98 | CoverageEntry tmp; 99 | tmp.offset = start - exon.start; 100 | tmp.length = end - start; 101 | tmp.feature_id = exon.feature_id; 102 | this->cache[exon.gene_id].push_back(tmp); 103 | } 104 | 105 | //Commit the cached coverage to this gene after deciding to count the read towards the gene 106 | void BaseCoverage::commit(const std::string &gene_id) 107 | { 108 | if (this->seen.count(gene_id)) 109 | { 110 | std::cerr << "Gene encountered after computing coverage " << gene_id << std::endl; 111 | return; 112 | } 113 | auto beg = this->cache[gene_id].begin(); 114 | auto end = this->cache[gene_id].end(); 115 | while (beg != end) 116 | { 117 | if (this->coverage.find(beg->feature_id) == this->coverage.end()) this->coverage[beg->feature_id] = std::vector(exonLengths[beg->feature_id].length, 0ul); 118 | //Add each coverage entry to the per-base coverage vector for the exon 119 | //At this stage exons each have their own vectors. 
120 | //During the compute() step, exons get stiched together 121 | add_range(this->coverage[beg->feature_id], beg->offset, beg->length); 122 | ++beg; 123 | } 124 | } 125 | 126 | void BaseCoverage::reset() //Empties the cache 127 | { 128 | this->cache.clear(); 129 | } 130 | 131 | //computes per-base coverage of the gene 132 | void BaseCoverage::compute(const Feature &gene) 133 | { 134 | //Coverage is stored in EID -> coverage vector 135 | //First iterate over all exons of the gene and ensure they're filled 136 | //That way, stiching the exons will result in a complete transcript even for exons which haven't been seen 137 | for (auto exon_id = exonsForGene[gene.feature_id].begin(); exon_id != exonsForGene[gene.feature_id].end(); ++exon_id) 138 | if (this->coverage.find(*exon_id) == this->coverage.end()) this->coverage[*exon_id] = std::vector(exonLengths[*exon_id].length, 0ul); 139 | //then compute coverage for the gene 140 | std::tuple results = computeCoverage(this->fastaReader, this->writer, gene, this->mask_size, this->coverage, this->exonCoverage, this->bias); 141 | if (std::get<0>(results) != -1) 142 | { 143 | this->geneMeans.push_back(std::get<0>(results)); 144 | this->geneStds.push_back(std::get<1>(results)); 145 | this->geneCVs.push_back(std::get<2>(results)); 146 | } 147 | //Now clean out the coverage map to save memory 148 | for (auto exon_id = exonsForGene[gene.feature_id].begin(); exon_id != exonsForGene[gene.feature_id].end(); ++exon_id) 149 | this->coverage.erase(*exon_id); 150 | this->seen.insert(gene.feature_id); 151 | } 152 | 153 | void BaseCoverage::close() 154 | { 155 | this->writer.flush(); 156 | this->writer.close(); 157 | } 158 | 159 | //Compute 3'/5' bias based on genes' per-base coverage 160 | void BiasCounter::computeBias(const Feature &gene, std::vector &coverage) 161 | { 162 | 163 | if (coverage.size() < this->geneLength) return; //Must meet minimum length req 164 | unsigned long peak = 0ul; 165 | unsigned peak_pos = 0; 166 | for (unsigned 
i = 0; i < coverage.size(); ++i) if (coverage[i] > peak) 167 | { 168 | peak_pos = i; 169 | peak = coverage[i]; 170 | } 171 | auto coverageMedianPos = coverage.begin() + peak_pos; 172 | std::list coveragePeakEntries; 173 | //First scroll half a window to the right of the peak (stop if we reach the end) 174 | for (int i = 0; i < this->windowSize/2 && coverageMedianPos != coverage.end(); ++i) ++coverageMedianPos; 175 | //Then scroll back 1 full window, adding entries to the list 176 | for (int i = 0; i < this->windowSize && coverageMedianPos != coverage.begin(); ++i) coveragePeakEntries.push_back(*(coverageMedianPos--)); 177 | coveragePeakEntries.sort(); 178 | double coveragePeakMedian = computeMedian(coveragePeakEntries.size(), coverageMedianPos); 179 | 180 | 181 | if (coveragePeakMedian >= 100) { 182 | std::vector percentileContainer(coverage); 183 | std::sort(percentileContainer.begin(), percentileContainer.end()); 184 | { 185 | auto xcursor = percentileContainer.begin(); 186 | while (xcursor != percentileContainer.end() && (*xcursor) == 0ul) ++xcursor; 187 | percentileContainer.erase(percentileContainer.begin(), xcursor); 188 | } 189 | unsigned long lowerLimit = percentileContainer[percentileContainer.size()*0.05]; 190 | unsigned long trimmed_length = 0ul; 191 | 192 | { 193 | auto cursor = coverage.begin(); 194 | while (cursor != coverage.end() && (*cursor) <= lowerLimit) 195 | { 196 | ++trimmed_length; 197 | ++cursor; 198 | } 199 | coverage.erase(coverage.begin(), cursor); 200 | } 201 | { 202 | while (coverage.size() > 0 && coverage.back() <= lowerLimit) { 203 | coverage.pop_back(); 204 | ++trimmed_length; 205 | } 206 | } 207 | 208 | if (coverage.size() >= this->geneLength) 209 | { 210 | double windowSize = static_cast(this->windowSize); 211 | std::vector lcov, rcov; 212 | lcov.reserve(this->windowSize); 213 | rcov.reserve(this->windowSize); 214 | for (unsigned int i = this->offset; i < this->offset + this->windowSize && i < coverage.size(); ++i) 215 | 
lcov.push_back(static_cast(coverage[i])); 216 | for (int i = coverage.size() - (this->windowSize + this->offset); i >= 0 && i < coverage.size() - this->offset; ++i) 217 | rcov.push_back(static_cast(coverage[i])); 218 | std::sort(lcov.begin(), lcov.end()); 219 | std::sort(rcov.begin(), rcov.end()); 220 | if (gene.strand == Strand::Forward) 221 | { 222 | this->threeEnd[gene.feature_id] += computeMedian(rcov.size(), rcov.begin()); 223 | this->fiveEnd[gene.feature_id] += computeMedian(lcov.size(), lcov.begin()); 224 | } else 225 | { 226 | this->threeEnd[gene.feature_id] += computeMedian(lcov.size(), lcov.begin()); 227 | this->fiveEnd[gene.feature_id] += computeMedian(rcov.size(), rcov.begin()); 228 | } 229 | 230 | } 231 | 232 | } 233 | 234 | 235 | } 236 | 237 | 238 | //Extract the bias for a gene 239 | double BiasCounter::getBias(const std::string &geneID) 240 | { 241 | double cov5 = this->fiveEnd[geneID]; 242 | double cov3 = this->threeEnd[geneID]; 243 | if (cov5 + cov3 > 0.0) 244 | { 245 | this->countedGenes++; 246 | return cov3 / (cov5 + cov3); 247 | } 248 | return -1.0; 249 | } 250 | 251 | unsigned int BiasCounter::countGenes() const 252 | { 253 | return this->countedGenes; 254 | } 255 | 256 | 257 | void add_range(std::vector &coverage, coord offset, unsigned int length) 258 | { 259 | const size_t size = coverage.size(); 260 | for (coord i = offset; i < offset + length && i < size; ++i) coverage[i] += 1ul; 261 | if (offset + length > size) std::cerr << "Error: Attempted to write more coverage than present on exon. Coverage-based metrics may be inaccurate. 
This may be a sign of an invalid bam or gtf entry" << std::endl; 262 | } 263 | 264 | //Compute exon coverage metrics, then stich exons together and compute gene coverage metrics 265 | std::tuple computeCoverage(Fasta& fastaReader, std::ofstream &writer, const Feature &gene, const unsigned int mask_size, const std::map > &coverage, std::map& totalExonCV, BiasCounter &bias) 266 | { 267 | std::vector > coverageMask; 268 | std::vector geneCoverage; 269 | unsigned int maskRemainder = mask_size; 270 | for (unsigned int i = 0; i < exonsForGene[gene.feature_id].size(); ++i) 271 | { 272 | coverageMask.push_back(std::vector(exonLengths[exonsForGene[gene.feature_id][i]].length, true)); //First store a pre-filled mask for the exon 273 | for (unsigned int j = 0; j < coverageMask.back().size() && maskRemainder; ++j, --maskRemainder) //now, remove coverage from the front of the exon until either it, or the mask size is depleted 274 | coverageMask.back()[j] = false; 275 | } 276 | maskRemainder = mask_size; //reset the exon mask to mask out the end 277 | for (int i = exonsForGene[gene.feature_id].size() - 1; i >= 0 && maskRemainder; --i) //repeat the process, masking out regions from the back until the mask size is depleted 278 | for (int j = coverageMask[i].size() - 1; j >= 0 && maskRemainder; --j, --maskRemainder) 279 | coverageMask[i][j] = false; 280 | for (unsigned int i = 0; i < exonsForGene[gene.feature_id].size(); ++i) 281 | { 282 | const std::vector &exon_coverage = coverage.at(exonsForGene[gene.feature_id][i]); //get the coverage vector for the current exon 283 | double exonMean = 0.0, exonStd = 0.0, exonSize = 0.0; 284 | std::vector mask = coverageMask[i]; 285 | 286 | for (unsigned int j = 0; j < mask.size(); ++j) if (mask[j]) exonSize += 1.0; //count the remaining unmasked length of the exon 287 | if (exonSize > 0) 288 | { 289 | auto maskIter = mask.begin(); 290 | for (auto start = exon_coverage.begin(); start != exon_coverage.end(); ++start) 291 | if (*(maskIter++)) 
exonMean += static_cast(*start) / exonSize; 292 | maskIter = mask.begin(); 293 | for (auto start = exon_coverage.begin(); start != exon_coverage.end(); ++start) 294 | if (*(maskIter++)) exonStd += pow(static_cast(*start) - exonMean, 2.0) / exonSize; 295 | exonStd = pow(exonStd, 0.5); 296 | exonStd /= exonMean; //now it's a CV 297 | 298 | if (!(std::isnan(exonStd) || std::isinf(exonStd))) { 299 | FeatureSpan exonPos = exonLengths[exonsForGene[gene.feature_id][i]]; 300 | if (fastaReader.hasContig(exonPos.chromosome)) { 301 | std::string exonSeq = fastaReader.getSeq(exonPos.chromosome, exonPos.start, exonPos.start + exonPos.length); 302 | totalExonCV[exonsForGene[gene.feature_id][i]] = {exonStd, gc(exonSeq)}; 303 | } else totalExonCV[exonsForGene[gene.feature_id][i]] = {exonStd, -1.0}; 304 | } 305 | } 306 | // Reserve and append the exon vector to the growing gene vector 307 | geneCoverage.reserve(geneCoverage.size() + exon_coverage.size()); 308 | geneCoverage.insert(geneCoverage.end(), exon_coverage.begin(), exon_coverage.end()); 309 | } 310 | //at this point the gene coverage vector represents an UNMASKED, but complete transcript 311 | bias.computeBias(gene, geneCoverage); //no masking in bias 312 | double avg = 0.0, std = 0.0; 313 | // apply the mask to the full gene vector 314 | if (mask_size) 315 | { 316 | //to account for the mask, erase bases from the vector 317 | //If the mask is larger than the gene, just erase all of it 318 | //Otherwise, erase from (end-mask) -> end 319 | geneCoverage.erase((mask_size > geneCoverage.size() ? geneCoverage.begin() : geneCoverage.end() - mask_size), geneCoverage.end()); 320 | // If there is still coverage area, erase from the front to either the end (if the mask is larger than remaining coverage) or to (front + mask) 321 | if (geneCoverage.size()) geneCoverage.erase(geneCoverage.begin(), (mask_size > geneCoverage.size() ? 
geneCoverage.end() : geneCoverage.begin() + mask_size)); 322 | } 323 | double size = static_cast(geneCoverage.size()); 324 | writer << gene.feature_id << "\t"; 325 | if (size > 0) //If there's still any coverage after applying the mask 326 | { 327 | for (auto beg = geneCoverage.begin(); beg != geneCoverage.end(); ++beg) 328 | avg += static_cast(*beg) / size; 329 | for (auto base = geneCoverage.begin(); base != geneCoverage.end(); ++base) 330 | std += std::pow(static_cast(*base) - avg, 2.0) / size; 331 | std = std::pow(std, 0.5); 332 | writer << avg << "\t" << std << "\t" << (std / avg) << std::endl; 333 | return std::make_tuple(avg, std, (std / avg)); 334 | } 335 | writer << "0\t0\tnan" << std::endl; 336 | return std::make_tuple(-1, -1, -1); 337 | } 338 | 339 | 340 | } 341 | 342 | std::ofstream& operator<<(std::ofstream &stream, rnaseqc::Metrics &counter) 343 | { 344 | std::vector keys = { 345 | //"Alternative Alignments", 346 | //"Chimeric Reads", 347 | "End 1 Antisense", 348 | "End 2 Antisense", 349 | "End 1 Bases", 350 | "End 2 Bases", 351 | "End 1 Mapped Reads", 352 | "End 2 Mapped Reads", 353 | "End 1 Mismatches", 354 | "End 2 Mismatches", 355 | "End 1 Sense", 356 | "End 2 Sense", 357 | "Exonic Reads", 358 | "Failed Vendor QC", 359 | "High Quality Reads", 360 | "Intergenic Reads", 361 | "Intragenic Reads", 362 | "Ambiguous Reads", 363 | "Intronic Reads", 364 | "Low Mapping Quality", 365 | "Low Quality Reads", 366 | "Mapped Duplicate Reads", 367 | "Mapped Reads", 368 | "Mapped Unique Reads", 369 | "Mismatched Bases", 370 | "Non-Globin Reads", 371 | "Non-Globin Duplicate Reads", 372 | "Reads used for Intron/Exon counts", 373 | "rRNA Reads", 374 | "Split Reads", 375 | "Total Bases", 376 | "Total Mapped Pairs", 377 | // "Total Reads", 378 | "Unique Mapping, Vendor QC Passed Reads", 379 | "Unpaired Reads" 380 | }; 381 | stream << "Total Alignments\t" << counter.get("Total Alignments") << std::endl; 382 | stream << "Alternative Alignments\t" << 
counter.get("Alternative Alignments") << std::endl; 383 | stream << "Supplementary Alignments\t" << counter.get("Supplementary Alignments") << std::endl; 384 | stream << "Total Reads\t" << counter.get("Total Alignments") - counter.get("Alternative Alignments") - counter.get("Supplementary Alignments") << std::endl; 385 | stream << "Chimeric Fragments\t"; 386 | if (counter.get("Chimeric Fragments_tag")) 387 | { 388 | stream << counter.get("Chimeric Fragments_tag") << std::endl; 389 | stream << "Chimeric Alignment Rate\t" << counter.frac("Chimeric Fragments_tag", "Total Mapped Pairs") << std::endl; 390 | } 391 | else 392 | { 393 | stream << counter.get("Chimeric Fragments_auto") << std::endl; 394 | stream << "Chimeric Alignment Rate\t" << counter.frac("Chimeric Fragments_auto", "Total Mapped Pairs") << std::endl; 395 | 396 | } 397 | for (int i = 0; i < keys.size(); ++i) 398 | if (keys[i] != "Split Reads" || counter.get("Split Reads")) 399 | stream << keys[i] << "\t" << counter.get(keys[i]) << std::endl; 400 | auto beg = counter.counter.begin(); 401 | auto end = counter.counter.end(); 402 | while (beg != end) 403 | { 404 | // Manually dump the counters for reads filtered by user supplied tags 405 | if( beg->first.length() > 17 && beg->first.substr(0,17) == "Filtered by tag: ") 406 | { 407 | stream << beg->first << "\t" << beg->second << std::endl; 408 | } 409 | ++beg; 410 | } 411 | return stream; 412 | } 413 | -------------------------------------------------------------------------------- /python/rnaseqc/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.ticker as ticker 5 | from matplotlib.colors import LogNorm, ListedColormap, hsv_to_rgb 6 | import seaborn as sns 7 | import os 8 | import qtl.plot 9 | 10 | 11 | def get_cohort_colors(cohorts): 12 | nc = len(cohorts) 13 | if nc > 5: 14 | cohort_colors = {i:j for i,j in 
def sort_samples(sample_ix, cohort_s=None, cohort_order=None, date_s=None):
    """Sort samples by date and cohort label.

    Parameters
    ----------
    sample_ix : pd.Index
        Sample IDs to order.
    cohort_s : pd.Series, optional
        Cohort label per sample, indexed by sample ID.
    cohort_order : sequence, optional
        Explicit cohort ordering. Only used when sorting by cohort alone;
        labels absent from it map to NaN and therefore sort first.
    date_s : pd.Series, optional
        Date per sample, indexed by sample ID (parsed with `pd.to_datetime`).

    Returns
    -------
    pd.Index
        `sample_ix` itself when neither annotation is given; otherwise the
        sample IDs sorted by (date, cohort), by date, or by cohort, with
        missing values placed first.

    Raises
    ------
    AssertionError
        If a supplied annotation does not cover every entry of `sample_ix`.
    """
    if cohort_s is None and date_s is None:
        return sample_ix

    if cohort_s is not None:
        assert sample_ix.isin(cohort_s.index).all(), 'cohort_s does not cover all samples'
        cohort_s = cohort_s[sample_ix]
    if date_s is not None:
        # Validate against date_s itself: checking cohort_s here crashed when
        # only dates were supplied and never verified the date annotation.
        assert sample_ix.isin(date_s.index).all(), 'date_s does not cover all samples'
        date_s = date_s[sample_ix]

    if date_s is None:
        # cohort only: alphabetical unless an explicit order is given
        if cohort_order is None:
            key_s = cohort_s
        else:
            key_s = cohort_s.map({j: i for i, j in enumerate(cohort_order)})
        return key_s.sort_values(na_position='first').index

    dates = pd.to_datetime(date_s)
    if cohort_s is None:
        return dates.sort_values(na_position='first').index

    # date first, cohort as tie-breaker
    return pd.concat(
        [dates.rename('date'), cohort_s.rename('cohort')], axis=1
    ).sort_values(['date', 'cohort'], na_position='first').index
def metrics(metric_s, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None,
            threshold=None, threshold_dir=None, outlier_method='threshold', plot_density=True, show_legend=False,
            ms=12, alpha=1, ylim=None, ylabel=None,
            show_xticklabels=False, highlight_ids=None,
            dl=0.85, aw=6, ds=0.2, daw=0.5, dr=0.25,
            db=0.75, ah=2, dt=0.25, fontsize=10, rasterized=True):
    """Plot a single QC metric sorted by cohort and/or date.

    Parameters
    ----------
    metric_s : pd.Series
        Metric values indexed by sample ID. If the median exceeds 1e5, the
        values (and `threshold`) are shown in millions.
    cohort_s : pd.Series, optional
        Cohort label for each sample; used for coloring and sorting.
    cohort_order, date_s : optional
        Passed to `sort_samples` to determine the sample order along x.
    cohort_colors : dict, optional
        Mapping of cohort label -> color; generated if not provided.
    threshold : float, optional
        Threshold drawn as a dashed horizontal line.
    threshold_dir : {'gt', 'lt'}, optional
        Direction in which samples are flagged as outliers. If None (or any
        other value), no outliers are flagged.
    outlier_method : {'threshold', 'iqr'}
        'threshold': flag samples beyond `threshold`;
        'iqr': flag samples beyond 1.5*IQR from the upper/lower quartile.
    plot_density : bool
        Add a marginal density plot of the metric.
    show_legend, show_xticklabels : bool
        Toggle the cohort legend / sample IDs as x tick labels.
    highlight_ids : list-like, optional
        Sample IDs to mark with a square.
    ms, alpha, ylim, ylabel, fontsize, rasterized
        Marker size, marker alpha, y-axis limits, y-axis label (defaults to
        `metric_s.name`), base font size, and rasterization of the markers.
    dl, aw, ds, daw, dr, db, ah, dt : float
        Figure layout (margins, axes sizes and spacing), in the units of
        matplotlib's `figsize`.

    Returns
    -------
    ax, or (ax, dax) if `plot_density` is True.
    """

    if ylabel is None:
        ylabel = metric_s.name

    if metric_s.median() > 1e5:
        metric_s = metric_s.copy() / 1e6
        if threshold is not None:
            threshold = threshold / 1e6
        # metric_s.name (and thus ylabel) is None for an unnamed Series
        ylabel = '(millions)' if ylabel is None else f'{ylabel} (millions)'

    if cohort_s is not None:
        assert metric_s.index.isin(cohort_s.index).all()
        cohort_s = cohort_s.loc[metric_s.index]
    else:
        cohort_s = pd.Series('NA', index=metric_s.index)

    if show_xticklabels:
        db += 0.75  # extra bottom margin for the rotated sample labels

    if plot_density:
        fw = dl + aw + ds + daw + dr
    else:
        fw = dl + aw + dr
    fh = db + ah + dt
    fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
    ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh])
    if plot_density:
        dax = fig.add_axes([(dl+aw+ds)/fw, db/fh, daw/fw, ah/fh], sharey=ax)

    if date_s is not None:
        xlabel = 'Samples, ordered by date'
    else:
        xlabel = 'Samples'

    sorted_ix = sort_samples(metric_s.index, cohort_s=cohort_s, cohort_order=cohort_order, date_s=date_s)
    cohorts = cohort_s.loc[sorted_ix].unique()
    if cohort_colors is None:
        cohort_colors = get_cohort_colors(cohorts)

    ns = len(metric_s)
    xpos = pd.Series(np.arange(1,ns+1), index=sorted_ix)

    # plot samples, one scatter call (and legend entry) per cohort
    for t in cohorts:
        cohort_ix = cohort_s[cohort_s==t].index
        ax.scatter(xpos[cohort_ix], metric_s[cohort_ix], s=ms, edgecolor='none', label=t,
                   c=[cohort_colors[t]], alpha=alpha, clip_on=False, rasterized=rasterized)

    if highlight_ids is not None:
        ax.scatter(xpos[highlight_ids], metric_s[highlight_ids], marker='s',
                   edgecolor='k', facecolor='none', clip_on=False, rasterized=rasterized)

    if threshold is not None:
        ax.plot([-0.02*ns, 1.02*ns], 2*[threshold], '--', color=[0.6,0.6,0.6], lw=1, alpha=0.8)

    # determine outlier cutoffs; (None, None) if outliers are not flagged
    method = outlier_method.lower()
    if method == 'iqr':
        q1, q3 = np.percentile(metric_s, [25, 75])
        lower, upper = q1 - 1.5*(q3-q1), q3 + 1.5*(q3-q1)
    elif method == 'threshold' and threshold is not None:
        lower = upper = threshold
    else:
        lower = upper = None

    # start from an empty selection so that nothing is flagged unless a
    # valid direction was requested
    outlier_ix = metric_s.index[:0]
    if upper is not None:
        if threshold_dir == 'gt':
            outlier_ix = metric_s[metric_s > upper].index
        elif threshold_dir == 'lt':
            outlier_ix = metric_s[metric_s < lower].index
    if len(outlier_ix) > 0:  # outline outliers
        ax.scatter(xpos[outlier_ix], metric_s[outlier_ix], c='none', edgecolor='k', s=ms, lw=1,
                   label=None, clip_on=False, rasterized=rasterized)

    # plot density
    if plot_density:
        sns.kdeplot(y=metric_s, ax=dax, legend=False, fill=True, lw=1.5)
        dax.set_ylabel(None)
        qtl.plot.format_plot(dax, fontsize=fontsize, hide=['top', 'right', 'bottom'])
        plt.setp(dax.get_yticklabels(), visible=False)
        dax.set_xticks([])
        dax.set_xlabel('Freq.', ha='left', x=0, fontsize=fontsize, labelpad=7)

    qtl.plot.format_plot(ax, fontsize=fontsize)
    ax.spines['left'].set_position(('outward', 8))

    ax.set_xlim([1, ns])
    if ylim is None:
        ax.set_ylim([0, ax.get_ylim()[1]])
    else:
        ax.set_ylim(ylim)

    if show_xticklabels:
        ax.set_xticks(xpos)
        ax.set_xticklabels(sorted_ix, rotation=45, ha='right', va='top')
    else:
        ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    ax.set_ylabel(ylabel, fontsize=fontsize+2)
    ax.set_xlabel(xlabel, fontsize=fontsize+2)

    if show_legend:
        ax.legend(fontsize=9, handlelength=1, labelspacing=0.5, title=cohort_s.name)

    if plot_density:
        return ax, dax
    else:
        return ax
metrics_df[c].sort_values().index 219 | h = ax.scatter(metrics_df.loc[ix, 'Genes Detected'], metrics_df.loc[ix, bias_metric], 220 | c=metrics_df.loc[ix, c], cmap=plt.cm.GnBu, 221 | clip_on=False, s=36, edgecolor='k',lw=0.5, 222 | vmin=0, vmax=1, rasterized=rasterized) 223 | 224 | ax.set_xlabel('Genes detected', fontsize=12) 225 | ax.set_ylabel(bias_metric, fontsize=12) 226 | qtl.plot.format_plot(ax, fontsize=10) 227 | ax.autoscale(True) 228 | ax.spines['left'].set_position(('outward', 6)) 229 | ax.spines['bottom'].set_position(('outward', 6)) 230 | hc = plt.colorbar(h, cax=cax) 231 | hc.set_label('Duplicate Rate', fontsize=12, labelpad=6) 232 | return ax, cax 233 | 234 | 235 | def mapping_sense(metrics_df, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None, width=0.8, 236 | dl=0.75, aw=4, dr=1.5, db=0.5, ah=2, dt=0.25, ds=0.066, dc=0.1): 237 | """Summary of sense/antisense alignments. 238 | 239 | For stranded protocols, most reads should be 'End 1 Antisense' and 'End 2 Sense', 240 | or vice versa, depending on protocol. 241 | For unstranded protocols, the 4 categories are expected to be of equal proportion (~0.25). 
def cumulative_expression(cdf_df, cohort_s=None, cohort_colors=None, ax=None, cmap=plt.cm.Spectral_r, c=[0.6,0.6,0.6],
                          reference_df=None, reference_name=None, alpha=0.5, mode='lines', lw=1, legend=False, rasterized=False):
    """
    Plot cumulative gene expression for each sample.
    This enables identification of samples with dominant expression of few genes.

    With mode='ci', median and confidence intervals are shown instead of individual samples.

    Parameters
    ----------
    cdf_df : pd.DataFrame
        Cumulative expression, one column per sample and one row per gene
        rank (e.g., the output of `calculate_expression_cdfs`).
    cohort_s : pd.Series, optional
        Cohort label for each sample (indexed by the columns of `cdf_df`).
    cohort_colors : dict, optional
        Mapping of cohort label -> color. By default, `c` is used for a
        single cohort, and colors are sampled from `cmap` otherwise.
    ax : matplotlib axes, optional
        Axes to plot into; a new figure is created if None.
    reference_df : pd.DataFrame, optional
        CDFs of a reference data set (same layout as `cdf_df`); its median
        and a +/-1.96 standard deviation band are plotted.
    reference_name : str, optional
        Name of the reference, used for legend labels.
    alpha, lw : float
        Transparency and line width of the per-sample lines (mode='lines').
    mode : {'lines', 'ci'}
        Plot individual samples, or per-cohort median and interval.
    legend : bool
        Show a legend (only if cohort labels were provided).
    rasterized : bool
        Rasterize the plotted lines.

    Returns
    -------
    ax : matplotlib axes
    """
    if cohort_s is None:
        cohort_s = pd.Series('_NA', index=cdf_df.columns)

    if cohort_colors is None:
        cohorts = cohort_s.unique()
        nc = len(cohorts)
        if nc == 1:
            cohort_colors = {cohorts[0]: c}
        else:
            # plt.get_cmap is used since plt.cm.get_cmap was removed in Matplotlib 3.9
            cohort_colors = dict(zip(cohorts, plt.get_cmap(cmap.name, nc)(np.arange(nc))))

    if ax is None:
        ax = qtl.plot.setup_figure(4, 2.5)
    ax.set_xscale('log')

    if reference_df is not None:  # plot reference distribution
        ref_x = np.arange(1, reference_df.shape[0]+1)  # gene ranks, 1-based (log scale)
        s = reference_df.std(axis=1)
        mu = reference_df.median(axis=1)
        # the median is plotted against ref_x (not its index) so that it
        # lines up with the interval band
        if mode == 'ci':
            ax.fill_between(ref_x, mu-1.96*s, mu+1.96*s, facecolor='k', edgecolor='k', alpha=0.2, label=None, zorder=20)
            ax.plot(ref_x, mu.values, 'k', lw=2, alpha=0.8, rasterized=rasterized, label=reference_name, zorder=30)
        else:
            ax.fill_between(ref_x, mu-1.96*s, mu+1.96*s, facecolor='k', edgecolor='none', alpha=0.2, label=f'{reference_name} 95% CI', zorder=20)
            ax.plot(ref_x, mu.values, 'k', lw=1.5, alpha=0.8, rasterized=rasterized, label=f'{reference_name} median', zorder=30)

    x = np.arange(1, cdf_df.shape[0]+1)  # gene ranks, 1-based
    for cohort in cohort_s.unique():
        ix = cohort_s[cohort_s == cohort].index
        color = cohort_colors[cohort]
        if mode == 'ci':  # plot median and interval (note: std could be replaced with MAD)
            mu = cdf_df[ix].median(axis=1)
            s = cdf_df[ix].std(axis=1)
            ax.fill_between(x, mu-1.96*s, mu+1.96*s, facecolor=color, edgecolor=color, alpha=0.2, label=None, zorder=20, lw=1)
            ax.plot(x, mu.values, '-', color=color, lw=2, alpha=0.8, rasterized=rasterized, label=cohort, zorder=30)
        else:
            # only the first sample is labeled, to get a single legend entry per cohort
            ax.plot(x, cdf_df[ix[0]], color=color, alpha=alpha, lw=lw, rasterized=rasterized, label=cohort)
            if len(ix) > 1:
                ax.plot(x, cdf_df[ix[1:]], color=color, alpha=alpha, lw=lw, rasterized=rasterized)

    ax.set_ylim([0,1])
    ax.set_xlim([1,10000])
    qtl.plot.format_plot(ax, fontsize=10)
    ax.set_xlabel('Number of genes', fontsize=12)
    ax.set_ylabel('Cumulative transcriptional output', fontsize=12)
    ax.spines['left'].set_position(('outward', 6))

    if legend and not (cohort_s == '_NA').all():
        leg = ax.legend(loc=4, handlelength=1, fontsize=10)
        # Legend.legendHandles was renamed to legend_handles in Matplotlib 3.7
        # and later removed; fall back to the old name for older versions
        handles = getattr(leg, 'legend_handles', None)
        if handles is None:
            handles = leg.legendHandles
        for lh in handles:
            lh.set_alpha(1)  # opaque legend entries, independent of line alpha

    return ax
def insert_sizes(insertsize_df, cohort_s=None, cohort_colors=None,
                 cohort_order=None, sort_order='mean', max_size=1000,
                 legend=False, dl=0.75, aw=3, dr=0.5, db=0.5, ah=2, dt=0.25):
    """Plot heat map of insert size distributions.

    Parameters
    ----------
    insertsize_df : pd.DataFrame
        Insert size counts, indexed by insert size (bp), one column per sample.
    cohort_s : pd.Series, optional
        Cohort label for each sample; shown as a color bar next to the heat map.
    cohort_colors : dict, optional
        Mapping of cohort label -> color, passed to `_plot_cohort_labels`.
    cohort_order : list-like, optional
        Order of the cohorts when `sort_order` is 'cohort'; defaults to the
        order returned by `cohort_s.value_counts()`.
    sort_order : {'mean', 'cohort'}
        'mean': samples with fewer than 100000 reads are listed first
        (sorted by read count), followed by the remaining samples sorted
        by mean insert size.
        'cohort': as above, but grouped by cohort.
    max_size : int
        Largest insert size (bp) to display.
    legend : bool
        Show the cohort legend.
    dl, aw, dr, db, ah, dt : float
        Figure layout (margins and axes size), in the units of matplotlib's `figsize`.

    Returns
    -------
    ax : matplotlib axes containing the heat map
    """

    # expand to 'max_size' bp; samples in rows
    df = insertsize_df.reindex(np.arange(1,max_size+1)).fillna(0).astype(np.int32).T

    # low-coverage samples (< 100000 reads) first, sorted by read count;
    # remaining samples sorted by mean insert size
    n = df.sum(axis=1).sort_values()
    deep_ix = n[n >= 100000].index
    # the weighted sum must be divided by the read count to obtain the mean
    # (the sum alone tracks sequencing depth); float64 avoids int32 overflow
    mu = df.loc[deep_ix].astype(np.float64).mul(df.columns.values, axis=1).sum(axis=1) / n[deep_ix]
    si = n[n < 100000].index.tolist() + mu.sort_values().index.tolist()

    if cohort_s is not None and sort_order == 'cohort':  # sort within each cohort
        if cohort_order is None:
            cohort_order = cohort_s.value_counts().index
        sort_s = cohort_s.loc[si]
        si = []
        for c in cohort_order:
            si.extend(sort_s[sort_s == c].index)

    # set up figure
    if cohort_s is not None:
        cw = 0.1
        ds = 0.05
    else:
        cw = 0
        ds = 0
    fw = dl + cw + ds + aw + dr
    fh = db + ah + dt
    fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
    ax = fig.add_axes([(dl+cw+ds)/fw, db/fh, aw/fw, ah/fh])

    # add cohort information and legend
    if cohort_s is not None:
        # set up axes: narrow color bar to the left, sharing the sample axis
        cax = fig.add_axes([dl/fw, db/fh, cw/fw, ah/fh], sharey=ax)
        plt.setp(ax.get_yticklabels(), visible=False)
        for line in ax.yaxis.get_ticklines():
            line.set_markersize(0)
            line.set_markeredgewidth(0)
        cax.set_xticks([])
        cax.set_ylabel('Sample', fontsize=12)

        # plot labels
        _plot_cohort_labels(cax, cohort_s[si], cohort_colors=cohort_colors, lax=ax, legend=legend)

    ax.imshow(df.loc[si], interpolation='none', aspect='auto', norm=LogNorm())
    ax.set_xlabel('Insert size (bp)', fontsize=12)
    ax.set_xlim([1, max_size])
    return ax
y_threshold=30, s=24, verbose=True, rasterized=False, **kwargs): 479 | """Expression of sex-specific genes (XIST and RPS4Y1) to identify sample swaps. 480 | 481 | sex_s: pd.Series annotating the sex of each sample, as Male/Female. 482 | """ 483 | 484 | x_id = tpm_df.index[tpm_df.index.str.startswith('ENSG00000229807')][0] # XIST 485 | y_id = tpm_df.index[tpm_df.index.str.startswith('ENSG00000129824')][0] # RPS4Y1 486 | x_s = tpm_df.loc[x_id].rename('XIST') 487 | y_s = tpm_df.loc[y_id].rename('RPS4Y1') 488 | 489 | ax = qtl.plot.setup_figure(3, 3, xspace=[0.75, 1.75]) 490 | ax.set_xscale('symlog') 491 | ax.set_yscale('symlog') 492 | 493 | if sex_s is not None: # flag potential swaps based on thresholds 494 | assert tpm_df.columns.isin(sex_s.index).all() 495 | res_s = pd.Series('NA', index=sex_s.index[sex_s.index.isin(tpm_df.columns)], name='inferred_sex') 496 | 497 | args = {'ec':'none', 'lw':0, 'rasterized':rasterized, 'clip_on':False, 's':s, 'alpha':0.33} 498 | args.update(kwargs) 499 | args2 = {**args, 'ec':'k', 'lw':1, 's':s+6, 'alpha':1} 500 | 501 | # infer missing labels based on thresholds 502 | ix = sex_s[sex_s.isnull() & (x_s <= x_threshold) & (y_s > y_threshold)].index 503 | if len(ix) > 0: 504 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,0.8,0.7]).reshape(1,-1), **args, label=f"Male* ({len(ix)})") 505 | res_s[ix] = 'Male' 506 | ix = sex_s[sex_s.isnull() & (x_s > x_threshold) & (y_s <= y_threshold)].index 507 | if len(ix) > 0: 508 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,0.8,0.7]).reshape(1,-1), **args, label=f"Female* ({len(ix)})") 509 | res_s[ix] = 'Female' 510 | ix = sex_s[sex_s.isnull() & (x_s > x_threshold) & (y_s > y_threshold)].index 511 | if len(ix) > 0: 512 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.75,0.8,0.7]).reshape(1,-1), **args, label=f"XXY* ({len(ix)})") 513 | res_s[ix] = 'Klinefelter (XXY)' 514 | ix = sex_s[sex_s.isnull() & (x_s <= x_threshold) & (y_s <= y_threshold)].index 515 | if len(ix) > 0: 516 | ax.scatter(x_s[ix], 
y_s[ix], c=hsv_to_rgb([0.0,0,0.7]).reshape(1,-1), **args, label=f"? ({len(ix)})") 517 | res_s[ix] = np.nan 518 | 519 | # matching samples 520 | ix = sex_s[(sex_s == 'Male') & (x_s <= x_threshold)].index 521 | if len(ix) > 0: 522 | res_s[ix] = 'Male' 523 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,0.8,0.7]).reshape(1,-1), label=f"Male ({len(ix)})", **args) 524 | ix = sex_s[(sex_s == 'Female') & (y_s <= y_threshold)].index 525 | if len(ix) > 0: 526 | res_s[ix] = 'Female' 527 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,0.8,0.7]).reshape(1,-1), label=f"Female ({len(ix)})", **args) 528 | 529 | # mismatches 530 | if flag_klinefelter: 531 | ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold) & (y_s <= y_threshold)].index 532 | if len(ix) > 0: 533 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,1,0.9]).reshape(1,-1), label=f'M > F swap ({len(ix)})', **args2) 534 | if verbose: 535 | print(f'F mislabeled as M:\n{ix.tolist()}') 536 | res_s[ix] = 'Female' 537 | ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold) & (x_s <= x_threshold)].index 538 | if len(ix) > 0: 539 | ax.scatter(x_s[ix], y_s[ix], c=[[0.9, 0, 0, 1]], label=f'F > M swap ({len(ix)})', **args2) 540 | if verbose: 541 | print(f'M mislabeled as F:\n{ix.tolist()}') 542 | res_s[ix] = 'Male' 543 | 544 | # Klinefelter 545 | ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold) & (y_s > y_threshold)].index 546 | if len(ix) > 0: 547 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.75,1,0.9]).reshape(1,-1), label=f'XXY? ({len(ix)})', **args2) 548 | if verbose: 549 | print(f'Possible Klinefelter (XXY): {ix.tolist()}') 550 | res_s[ix] = 'Possible Klinefelter (XXY)' 551 | ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold) & (x_s > x_threshold)].index 552 | if len(ix) > 0: 553 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,1,0.9]).reshape(1,-1), label=f'XXY? 
({len(ix)})', **args2) 554 | if verbose: 555 | print(f'Possible Klinefelter (XXY): {ix.tolist()}') 556 | res_s[ix] = 'Possible Klinefelter (XXY)' 557 | 558 | else: 559 | ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold)].index 560 | if len(ix) > 0: 561 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,1,0.9]).reshape(1,-1), label=f'M > F swap ({len(ix)})', **args2) 562 | if verbose: 563 | print(f'F mislabeled as M:\n{ix.tolist()}') 564 | res_s[ix] = 'Female' 565 | ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold)].index 566 | if len(ix) > 0: 567 | ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,1,0.9]).reshape(1,-1), label=f'F > M swap ({len(ix)})', **args2) 568 | if verbose: 569 | print(f'M mislabeled as F:\n{ix.tolist()}') 570 | res_s[ix] = 'Male' 571 | else: 572 | ax.scatter(x_s, y_s, s=s, alpha=0.5, edgecolors='none', lw=0.5, rasterized=True, clip_on=False) 573 | 574 | if highlight_ids is not None: # highlight selected samples 575 | ax.scatter(x_s[highlight_ids], y_s[highlight_ids], c=[hsv_to_rgb([0.075,1,1])], s=s+12, alpha=1, edgecolors='k', lw=1, zorder=50, rasterized=False, clip_on=False, label=None) 576 | 577 | qtl.plot.format_plot(ax, fontsize=12) 578 | ax.spines['left'].set_position(('outward', 6)) 579 | ax.spines['bottom'].set_position(('outward', 6)) 580 | ax.set_xlabel('XIST expression (TPM)', fontsize=14) 581 | ax.set_ylabel('RPS4Y1 expression (TPM)', fontsize=14) 582 | 583 | xlim = list(ax.get_xlim()) 584 | ylim = list(ax.get_ylim()) 585 | xlim[0] = 0 586 | ylim[0] = 0 587 | ax.plot(2*[x_threshold], ylim, '--', c=[0.75]*3) 588 | ax.plot(xlim, 2*[y_threshold], '--', c=[0.75]*3) 589 | ax.set_xlim(xlim) 590 | ax.set_ylim(ylim) 591 | 592 | if sex_s is not None: 593 | leg = ax.legend(loc='upper left', fontsize=12, handlelength=0.5, labelspacing=0.2, bbox_to_anchor=(1,1)) 594 | for lh in leg.legend_handles: 595 | lh.set_alpha(1) 596 | return ax, res_s 597 | else: 598 | return ax 599 | 
--------------------------------------------------------------------------------