├── .gitattributes
├── python
    ├── rnaseqc
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── create_notebook.py
    │   ├── run.py
    │   ├── legacy_exon_remap.py
    │   ├── nb_encode.py
    │   ├── insert_size_intervals.py
    │   ├── aggregate.py
    │   ├── report.py
    │   └── plot.py
    ├── setup.py
    └── README.md
├── .gitmodules
├── test_data
    ├── chr1.output
    │   ├── chr1.bam.gene_tpm.gct.gz
    │   ├── chr1.bam.exon_reads.gct.gz
    │   ├── chr1.bam.gene_reads.gct.gz
    │   ├── chr1.bam.gene_fragments.gct.gz
    │   ├── chr1.cram.gc_content.tsv
    │   ├── chr1.bam.metrics.tsv
    │   └── chr1.cram.metrics.tsv
    ├── legacy.output
    │   ├── legacy.exon_reads.gct.gz
    │   ├── legacy.gene_reads.gct.gz
    │   ├── downsampled.bam.gene_tpm.gct.gz
    │   ├── downsampled.bam.exon_reads.gct.gz
    │   ├── downsampled.bam.gene_reads.gct.gz
    │   ├── downsampled.bam.gene_fragments.gct.gz
    │   ├── downsampled.bam.metrics.tsv
    │   └── downsampled.bam.fragmentSizes.txt
    ├── downsampled.output
    │   ├── downsampled.bam.gene_tpm.gct.gz
    │   ├── downsampled.bam.exon_reads.gct.gz
    │   ├── downsampled.bam.gene_reads.gct.gz
    │   ├── downsampled.bam.gene_fragments.gct.gz
    │   ├── downsampled.bam.metrics.tsv
    │   └── downsampled.bam.fragmentSizes.txt
    ├── single_pair.output
    │   ├── single_pair.bam.gene_tpm.gct.gz
    │   ├── single_pair.bam.exon_reads.gct.gz
    │   ├── single_pair.bam.gene_reads.gct.gz
    │   ├── single_pair.bam.gene_fragments.gct.gz
    │   └── single_pair.bam.metrics.tsv
    ├── approx_diff.py
    ├── Makefile.osx
    └── Makefile.linux
├── .gitignore
├── src
    ├── BED.h
    ├── BamReader.cpp
    ├── BED.cpp
    ├── Expression.h
    ├── GTF.h
    ├── Fasta.h
    ├── BamReader.h
    ├── Fasta.cpp
    ├── Metrics.h
    ├── GTF.cpp
    └── Metrics.cpp
├── THIRD-PARTY-LICENSES.md
├── cloudbuild.yaml
├── LICENSE
├── Dockerfile
├── .github
    └── workflows
    │   ├── CI.yml
    │   └── Deployment.yml
├── Metrics.md
├── Makefile
└── README.md


/.gitattributes:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/python/rnaseqc/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.3'
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "SeqLib"]
2 | 	path = SeqLib
3 | 	url = https://github.com/walaj/SeqLib.git
4 | 


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.bam.gene_tpm.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_tpm.gct.gz


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.bam.exon_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.exon_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.bam.gene_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/legacy.exon_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/legacy.exon_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/legacy.gene_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/legacy.gene_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.bam.gene_fragments.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/chr1.output/chr1.bam.gene_fragments.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.exon_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.exon_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.gene_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz


--------------------------------------------------------------------------------
/test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz


--------------------------------------------------------------------------------
/test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getzlab/rnaseqc/HEAD/test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.xcodeproj/
 2 | __pycache__/
 3 | src/*.o
 4 | rnaseqc
 5 | !python/rnaseqc
 6 | rnaseqc.a
 7 | build/
 8 | *.egg-info/
 9 | *.bam
10 | *.cram
11 | *.gtf
12 | *.fasta
13 | *.fa
14 | *.fai
15 | test_data/test_inputs.tar.gz
16 | 


--------------------------------------------------------------------------------
/src/BED.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  BED.hpp
 3 | //  IntervalTree
 4 | //
 5 | //  Created by Aaron Graubert on 7/11/17.
 6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef BED_h
10 | #define BED_h
11 | 
12 | #include "GTF.h"
13 | 
14 | namespace rnaseqc {
15 |     // Exception raised for BED parsing failures. The message is stored in `error`
16 |     // and is also returned by what().
17 |     struct bedException : public std::exception {
18 |         std::string error;
19 |         bedException(std::string msg) : error(std::move(msg)) {}
20 |         // Without this override what() returns only a generic implementation-defined
21 |         // string, so handlers catching std::exception would lose the message.
22 |         const char* what() const noexcept override { return error.c_str(); }
23 |     };
24 |     
25 |     // Reads one BED record from the stream into the Feature; defined in BED.cpp.
26 |     std::ifstream& extractBED(std::ifstream&, Feature&);
27 | }
28 | #endif /* BED_h */
29 | 


--------------------------------------------------------------------------------
/THIRD-PARTY-LICENSES.md:
--------------------------------------------------------------------------------
 1 | RNA-SeQC third-party code notice:
 2 | 
 3 | In addition, RNA-SeQC is distributed, in part, under and subject to the licenses for:
 4 | [SeqLib](https://github.com/walaj/SeqLib) - Copyright © 2016 Jeremiah A. Wala. All Rights Reserved.
 5 | [Apache 2.0 License](https://github.com/walaj/SeqLib/blob/master/LICENSE) (as of 7e1f982).
 6 | 
 7 | [Args](https://github.com/Taywee/args/) - Copyright © 2016 – 2017 Taylor C. Richberger and Pavel Belikov. All Rights Reserved.
 8 | [MIT license](https://github.com/Taywee/args/blob/master/LICENSE) (as of 7bf17000).
 9 | 
10 | [BioIO](https://github.com/dancooke/bioio/) - Copyright © 2017 Daniel Cooke. All Rights Reserved.
11 | [MIT License](https://github.com/dancooke/bioio/blob/master/LICENSE) (as of 99978e1).
12 | 


--------------------------------------------------------------------------------
/src/BamReader.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  BamReader.cpp
 3 | //  RNA-SeQC
 4 | //
 5 | //  Created by Aaron Graubert on 10/3/18.
 6 | //  Copyright © 2018 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #include "BamReader.h"
10 | 
11 | namespace rnaseqc {
12 |     // Reads the next record from the wrapped SeqLib reader into `read`.
13 |     // Returns the result of GetNextRecord and bumps read_count for every record obtained.
14 |     // A std::runtime_error thrown while reading is converted to referenceHTSMismatch when
15 |     // user_cram_reference is set; otherwise the original exception is rethrown.
16 |     bool SeqlibReader::next(SeqLib::BamRecord &read)
17 |     {
18 |         // Must uncomment before adding multithreading
19 |         //    std::lock_guard<SeqlibReader> guard(*this);
20 |         try {
21 |             if (!this->bam.GetNextRecord(read)) return false;
22 |             ++this->read_count;
23 |             return true;
24 |         }
25 |         catch (std::runtime_error &failure) {
26 |             if (!this->user_cram_reference) throw;
27 |             throw referenceHTSMismatch(std::string("HTSLib was unable to find a suitable reference while decoding a cram: ")+failure.what());
28 |         }
29 |         return false; // No way to get here
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from setuptools import setup, find_packages
 4 | with open("rnaseqc/__init__.py") as reader:
 5 |     __version__ = re.search(
 6 |         r'__version__ ?= ?[\'\"]([\w.]+)[\'\"]',
 7 |         reader.read()
 8 |     ).group(1)
 9 | with open(os.path.join(os.path.dirname(__file__), 'README.md')) as r:
10 |     long_description = r.read()
11 | 
12 | # Setup information
13 | setup(
14 |     name = 'rnaseqc',
15 |     version = __version__,
16 |     packages = find_packages(),
17 |     description = 'Multi-sample visualization of metrics from RNA-SeQC',
18 |     long_description = long_description,
19 |     long_description_content_type='text/markdown',
20 |     install_requires = [
21 |         'numpy',
22 |         'pandas',
23 |         'matplotlib',
24 |         'seaborn',
25 |         'qtl',
26 |         'agutil',
27 |         'nbformat'
28 |     ],
29 |     classifiers = [
30 |         "Programming Language :: Python :: 3",
31 |         "Intended Audience :: Science/Research",
32 |         "Topic :: Scientific/Engineering :: Bio-Informatics",
33 |     ],
34 | )
35 | 


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.cram.gc_content.tsv:
--------------------------------------------------------------------------------
  1 | Content Bin	Count
  2 | 0	0
  3 | 0.01	0
  4 | 0.02	0
  5 | 0.03	0
  6 | 0.04	0
  7 | 0.05	0
  8 | 0.06	0
  9 | 0.07	0
 10 | 0.08	0
 11 | 0.09	0
 12 | 0.1	0
 13 | 0.11	0
 14 | 0.12	0
 15 | 0.13	0
 16 | 0.14	2
 17 | 0.15	0
 18 | 0.16	1
 19 | 0.17	1
 20 | 0.18	6
 21 | 0.19	14
 22 | 0.2	17
 23 | 0.21	34
 24 | 0.22	128
 25 | 0.23	225
 26 | 0.24	531
 27 | 0.25	1027
 28 | 0.26	1816
 29 | 0.27	2880
 30 | 0.28	4199
 31 | 0.29	5730
 32 | 0.3	6819
 33 | 0.31	7925
 34 | 0.32	8547
 35 | 0.33	9823
 36 | 0.34	10260
 37 | 0.35	10631
 38 | 0.36	11507
 39 | 0.37	11568
 40 | 0.38	11731
 41 | 0.39	11664
 42 | 0.4	12154
 43 | 0.41	11357
 44 | 0.42	11028
 45 | 0.43	10721
 46 | 0.44	10951
 47 | 0.45	10661
 48 | 0.46	10776
 49 | 0.47	11125
 50 | 0.48	11119
 51 | 0.49	10503
 52 | 0.5	10200
 53 | 0.51	10029
 54 | 0.52	9791
 55 | 0.53	9357
 56 | 0.54	9196
 57 | 0.55	8883
 58 | 0.56	8474
 59 | 0.57	7835
 60 | 0.58	6694
 61 | 0.59	5991
 62 | 0.6	5467
 63 | 0.61	5127
 64 | 0.62	4517
 65 | 0.63	4205
 66 | 0.64	3741
 67 | 0.65	3197
 68 | 0.66	2617
 69 | 0.67	2049
 70 | 0.68	1596
 71 | 0.69	1198
 72 | 0.7	959
 73 | 0.71	727
 74 | 0.72	498
 75 | 0.73	335
 76 | 0.74	238
 77 | 0.75	157
 78 | 0.76	61
 79 | 0.77	38
 80 | 0.78	8
 81 | 0.79	7
 82 | 0.8	1
 83 | 0.81	1
 84 | 0.82	1
 85 | 0.83	0
 86 | 0.84	0
 87 | 0.85	0
 88 | 0.86	0
 89 | 0.87	0
 90 | 0.88	0
 91 | 0.89	0
 92 | 0.9	0
 93 | 0.91	0
 94 | 0.92	0
 95 | 0.93	0
 96 | 0.94	0
 97 | 0.95	0
 98 | 0.96	0
 99 | 0.97	0
100 | 0.98	0
101 | 0.99	0
102 | 


--------------------------------------------------------------------------------
/src/BED.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | //  BED.cpp
 3 | //  IntervalTree
 4 | //
 5 | //  Created by Aaron Graubert on 7/11/17.
 6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #include "BED.h"
10 | #include <sstream>
11 | #include <exception>
12 | #include <stdexcept>
13 | 
14 | using std::ifstream;
15 | using std::string;
16 | 
17 | namespace rnaseqc {
18 |     // Reads the next BED record from `input` into `out`, skipping blank lines and lines
19 |     // starting with '#'. On success it sets chromosome, start and end (each parsed column
20 |     // value + 1), feature_id (the raw line) and type (Exon), then returns `input`.
21 |     // When no record is left, getline fails, `out` is not modified and the failed stream
22 |     // is returned. Any parsing problem is reported as a bedException.
23 |     ifstream& extractBED(ifstream &input, Feature &out)
24 |     {
25 |         try
26 |         {
27 |             string line, name, start, stop;
28 |             while(getline(input, line))
29 |             {
30 |                 // Blank lines carry no record; before, they reached std::stoull("") and aborted the parse
31 |                 if(line.find_first_not_of(" \t\r") == string::npos) continue;
32 |                 if(line[0] == '#') continue; //Do beds even have comment lines?
33 |                 std::istringstream tokenizer(line);
34 |                 // Require all three columns. Once the tokenizer is exhausted, further extractions
35 |                 // fail without changing their target, so a shared buffer let a two-column line
36 |                 // silently reuse the start value as the stop.
37 |                 if(!(tokenizer >> name >> start >> stop))
38 |                     throw std::invalid_argument("expected chromosome, start and stop columns in line: " + line);
39 |                 out.chromosome = chromosomeMap(name);
40 |                 out.start = std::stoull(start) + 1;
41 |                 out.end = std::stoull(stop) + 1;
42 |                 out.feature_id = line; // add a dummy exon_id for mapping interval intersections later
43 |                 out.type = FeatureType::Exon;
44 |                 break;
45 |             }
46 |         }
47 |         catch (const std::exception &e)
48 |         {
49 |             throw bedException(std::string("Encountered an unknown error while parsing the BED: ") + e.what());
50 |         }
51 |         return input;
52 |     }
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/cloudbuild.yaml:
--------------------------------------------------------------------------------
 1 | steps:
 2 |   - name: gcr.io/cloud-builders/docker
 3 |     args:
 4 |       - pull
 5 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest
 6 |   - name: gcr.io/cloud-builders/docker
 7 |     args:
 8 |       - build
 9 |       - -t
10 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA
11 |       - --cache-from
12 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest
13 |       - .
14 |     timeout: 900s
15 |   - name: gcr.io/cloud-builders/docker
16 |     args:
17 |       - push
18 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA
19 |   - name: gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA
20 |     args:
21 |       - bash
22 |       - -c
23 |       - >
24 |           apt-get update &&
25 |           apt-get install git wget -y &&
26 |           git clone https://github.com/getzlab/rnaseqc.git &&
27 |           mv rnaseqc/test_data /opt/rnaseqc &&
28 |           cd /opt/rnaseqc/test_data &&
29 |           wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz &&
30 |           tar xzf test_inputs.tar.gz &&
31 |           cd .. &&
32 |           make && make -f test_data/Makefile.linux test
33 |     timeout: 900s
34 |   - name: gcr.io/cloud-builders/docker
35 |     args:
36 |       - tag
37 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:$COMMIT_SHA
38 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest
39 |   - name: gcr.io/cloud-builders/docker
40 |     args:
41 |       - push
42 |       - gcr.io/broad-cga-aarong-gtex/rnaseqc:latest
43 | images:
44 |   - gcr.io/broad-cga-aarong-gtex/rnaseqc
45 | timeout: 1800s
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | RNA-SeQC is licensed under the following BSD 3-clause license:
 2 | 
 3 | Copyright © 2018 The Broad Institute, Inc. and The General Hospital Corporation. All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without modification,
 6 | are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice,
 9 |    this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its contributors
16 |    may be used to endorse or promote products derived from this software without
17 |    specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 | (INCLUDING,BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/src/Expression.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Expression.hpp
 3 | //  IntervalTree
 4 | //
 5 | //  Created by Aaron Graubert on 8/2/17.
 6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef Expression_h
10 | #define Expression_h
11 | 
12 | #include "Metrics.h"
13 | #include "BamReader.h"
14 | #include <set>
15 | #include <iostream>
16 | 
17 | namespace rnaseqc {
18 |     //Utility functions
19 |     unsigned int extractBlocks(Alignment&, std::vector<Feature>&, chrom, bool);
20 |     //unsigned int legacyExtractBlocks(BamTools::BamAlignment&, std::vector<Feature>&, chrom);
21 |     std::list<Feature>* intersectBlock(Feature&, std::list<Feature>&);
22 |     void trimFeatures(Alignment&, std::list<Feature>&);
23 |     void trimFeatures(Alignment&, std::list<Feature>&, BaseCoverage&);
24 |     void dropFeatures(std::list<Feature>&, BaseCoverage&);
25 |     
26 |     // Definitions for fragment tracking
27 |     typedef std::tuple<std::string, coord> FragmentMateEntry; // Used to record mate end point (exon name, read end position)
28 |     const std::size_t EXON = 0, ENDPOS = 1; // Positions of the exon name and read end position within a FragmentMateEntry; presumably used with std::get<> -- confirm at call sites
29 |     
30 |     //Metrics functions
31 |     void fragmentSizeMetrics(unsigned int&, std::map<chrom, std::list<Feature>>*, std::map<std::string, FragmentMateEntry>&, std::map<long long, unsigned long>&,std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&);
32 |     
33 |     double exonAlignmentMetrics(std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool, std::map<std::string, FragmentMateEntry>&, Fasta&);
34 |     
35 |     void legacyExonAlignmentMetrics(unsigned int, std::map<chrom, std::list<Feature>>&, Metrics&, std::vector<Feature>&, Alignment&, SeqLib::HeaderSequenceVector&, unsigned int, Strand, BaseCoverage&, const bool, const bool);
36 |     
37 |     Strand feature_strand(Alignment&, Strand);
38 | }
39 | 
40 | #endif /* Expression_h */
41 | 


--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
 1 | # RNA-SeQC Python utilities
 2 | 
 3 | This module contains utility code for RNA-SeQC
 4 | 
 5 | ## Installing
 6 | 
 7 | * From pip: `pip install rnaseqc`
 8 | * From the git repo: `pip install -e python` (Invoke from root of git repo)
 9 | 
10 | ## Usage
11 | 
12 | Installing the package does not add a console entry point. You can invoke the utilities in one of three ways:
13 | 
14 | * From the main module: `python3 -m rnaseqc ...`
15 | * Calling the target module: `python3 -m rnaseqc.example ...`
16 | * Calling scripts directly: `python3 python/rnaseqc/example.py`
17 | 
18 | ## Utilities
19 | 
20 | The `rnaseqc` module contains five main utilities. For help with a specific utility,
21 | invoke it with the `-h` or `--help` option.
22 | 
23 | ### Aggregation
24 | 
25 | Aggregates RNA-SeQC outputs from multiple samples
26 | 
27 | ```
28 | python3 -m rnaseqc aggregate [-h] [--parquet] [-o OUTPUT_DIR] results_dir prefix
29 | ```
30 | 
31 | ### Jupyter Notebooks
32 | 
33 | Creates a jupyter notebook with several figures for comparing samples
34 | 
35 | ```
36 | python3 -m rnaseqc notebook [-h] [-t TPM] [-i INSERT_SIZE] [-c COHORT] [-d DATE] metrics output
37 | ```
38 | 
39 | ### Figures
40 | 
41 | Generates figures from an aggregated RNA-SeQC metrics table
42 | 
43 | ```
44 | python3 -m rnaseqc report [-h] [--tpm TPM] [--insert-size INSERT_SIZE] [--cohort COHORT] [--output-dir OUTPUT_DIR] [--dpi DPI] metrics prefix
45 | ```
46 | 
47 | ### Insert Size distributions
48 | 
49 | Generates a BED file with intervals used by RNA-SeQC for estimating a sample's insert size distribution
50 | 
51 | ```
52 | python3 -m rnaseqc insert-size [-h] [--min-length MIN_LENGTH] [--min-mappability MIN_MAPPABILITY] [--output-dir OUTPUT_DIR] gtf_path mappability_bigwig prefix
53 | ```
54 | 
55 | ### Exon remapping
56 | 
57 | Converts exon names in an `*.exon_reads.gct` file from RNA-SeQC 2.X.X to match the names
58 | reported by RNA-SeQC 1.1.9.
59 | 
60 | ```
61 | python3 -m rnaseqc legacy-exons gct gtf
62 | ```
63 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Dockerfile for RNASeQC
 2 | FROM ubuntu:20.04
 3 | LABEL maintainer="Aaron Graubert"
 4 | 
 5 | RUN apt-get update && apt-get install -y software-properties-common && \
 6 |     apt-get update && apt-get install -y \
 7 |         build-essential \
 8 |         cmake \
 9 |         git \
10 |         python3 \
11 |         python3-pip \
12 |         libboost-filesystem-dev \
13 |         libboost-regex-dev \
14 |         libboost-system-dev \
15 |         libbz2-dev \
16 |         libcurl3-dev \
17 |         liblzma-dev \
18 |         libpthread-stubs0-dev \
19 |         wget \
20 |         zlib1g-dev \
21 |     && rm -rf /var/lib/apt/lists/*
22 |     
23 | # Python
24 | RUN python3 -m pip install --upgrade pip setuptools pyarrow jupyter
25 | 
26 | # SeqLib
27 | COPY Makefile /opt/rnaseqc/Makefile
28 | RUN cd /opt/rnaseqc && git clone --recursive https://github.com/walaj/SeqLib.git && \
29 |     cd SeqLib && git checkout 7e1f98267b5057f9505dbff119308137a0e006db && cd .. && \
30 |     make SeqLib/lib/libseqlib.a
31 | 
32 | # rnaseq-utils checkout and Python analysis stack (the PyPI "sklearn" alias is deprecated; scikit-learn is the real package)
33 | RUN cd /opt && git clone https://github.com/francois-a/rnaseq-utils rnaseq && cd rnaseq && \
34 |   git checkout f1c6a5677bbca465ea1edd06c2293a5d1078a18b && python3 -m pip install --upgrade pip setuptools && \
35 |   python3 -m pip install numpy && python3 -m pip install pandas matplotlib scipy pyBigWig bx-python \
36 |   agutil nbformat seaborn scikit-learn qtl && mkdir -p /root/.config/matplotlib && echo "backend	:	Agg" > /root/.config/matplotlib/matplotlibrc
37 | ENV PYTHONPATH=$PYTHONPATH:/opt/
38 | 
39 | #RNASeQC
40 | COPY src /opt/rnaseqc/src
41 | COPY python /opt/rnaseqc/python
42 | COPY args.hxx /opt/rnaseqc
43 | COPY bioio.hpp /opt/rnaseqc
44 | RUN cd /opt/rnaseqc && make && ln -s /opt/rnaseqc/rnaseqc /usr/local/bin/rnaseqc && make clean && python3 -m pip install -e /opt/rnaseqc/python
45 | 
46 | # clean up
47 | RUN apt-get clean && \
48 |     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
49 |     apt-get autoclean && \
50 |     apt-get autoremove -y && \
51 |     rm -rf /var/lib/{apt,dpkg,cache,log}/
52 | 


--------------------------------------------------------------------------------
/src/GTF.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  GTF.hpp
 3 | //  IntervalTree
 4 | //
 5 | //  Created by Aaron Graubert on 6/28/17.
 6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef GTF_h
10 | #define GTF_h
11 | 
12 | #include <string>
13 | #include <iostream>
14 | #include <fstream>
15 | #include <map>
16 | #include <utility>
17 | #include <vector>
18 | #include <sstream>
19 | #include "Fasta.h"
20 | 
21 | namespace rnaseqc {
22 |     // Exception that carries a message string; the text is stored in `error`
23 |     // and is also returned by what().
24 |     struct gtfException : public std::exception {
25 |         std::string error;
26 |         gtfException(std::string msg) : error(std::move(msg)) {}
27 |         // Without this override what() returns only a generic implementation-defined
28 |         // string, so handlers catching std::exception would lose the message.
29 |         const char* what() const noexcept override { return error.c_str(); }
30 |     };
31 |     
32 |     enum FeatureType {Gene, Transcript, Exon, Other};
33 |     
34 |     struct Feature {
35 |         //Represents arbitrary genome features
36 |         // NOTE: no member has a default initializer, so `type` and `ribosomal` are not
37 |         // initialized by default construction (e.g. `Feature f;`) and must be assigned before use.
38 |         coord start, end;
39 |         chrom chromosome;
40 |         Strand strand;
41 |         FeatureType type;
42 |         std::string feature_id, gene_id, transcript_type;
43 |         bool ribosomal;
44 |     };
45 |     
46 |     //For comparing features
47 |     bool operator==(const Feature &a, const Feature &b);
48 |     bool compIntervalStart(const Feature&, const Feature&);
49 |     bool compIntervalEnd(const Feature&, const Feature&);
50 |     bool intersectPoint(const Feature&, const coord);
51 |     bool intersectInterval(const Feature&, const Feature&);
52 |     int partialIntersect(const Feature&, const Feature&);
53 | 
54 |     struct FeatureSpan {
55 |         chrom chromosome;
56 |         coord start, length;
57 |     };
58 |     
59 |     
60 |     extern std::map<std::string, std::string> geneNames, geneSeqs;
61 |     extern std::map<std::string, coord> geneLengths, geneCodingLengths;
62 |     extern std::map<std::string, FeatureSpan> exonLengths;
63 |     extern std::vector<std::string> geneList, exonList;
64 |     extern std::map<std::string, std::vector<std::string>> exonsForGene;
65 |     
66 |     std::ifstream& operator>>(std::ifstream&, Feature&);
67 |     std::map<std::string,std::string>& parseAttributes(std::string&, std::map<std::string,std::string>&);
68 | }
69 | 
70 | #endif /* GTF_h */
71 | 


--------------------------------------------------------------------------------
/test_data/single_pair.output/single_pair.bam.metrics.tsv:
--------------------------------------------------------------------------------
 1 | Sample	single_pair.bam
 2 | Mapping Rate	1
 3 | Unique Rate of Mapped	1
 4 | Duplicate Rate of Mapped	0
 5 | Duplicate Rate of Mapped, excluding Globins	0
 6 | Base Mismatch	0
 7 | End 1 Mapping Rate	1
 8 | End 2 Mapping Rate	1
 9 | End 1 Mismatch Rate	0
10 | End 2 Mismatch Rate	0
11 | Expression Profiling Efficiency	1
12 | High Quality Rate	1
13 | Exonic Rate	1
14 | Intronic Rate	0
15 | Intergenic Rate	0
16 | Intragenic Rate	1
17 | Ambiguous Alignment Rate	0
18 | High Quality Exonic Rate	1
19 | High Quality Intronic Rate	0
20 | High Quality Intergenic Rate	0
21 | High Quality Intragenic Rate	1
22 | High Quality Ambiguous Alignment Rate	0
23 | Discard Rate	0
24 | rRNA Rate	0
25 | End 1 Sense Rate	1
26 | End 2 Sense Rate	0
27 | Avg. Splits per Read	0
28 | Alternative Alignments	0
29 | Chimeric Fragments	0
30 | Chimeric Alignment Rate	0
31 | Duplicate Reads	0
32 | End 1 Antisense	0
33 | End 2 Antisense	1
34 | End 1 Bases	76
35 | End 2 Bases	76
36 | End 1 Mapped Reads	1
37 | End 2 Mapped Reads	1
38 | End 1 Mismatches	0
39 | End 2 Mismatches	0
40 | End 1 Sense	1
41 | End 2 Sense	0
42 | Exonic Reads	2
43 | Failed Vendor QC	0
44 | High Quality Reads	2
45 | Intergenic Reads	0
46 | Intragenic Reads	2
47 | Ambiguous Reads	0
48 | Intronic Reads	0
49 | Low Mapping Quality	0
50 | Low Quality Reads	0
51 | Mapped Duplicate Reads	0
52 | Mapped Reads	2
53 | Mapped Unique Reads	2
54 | Mismatched Bases	0
55 | Non-Globin Reads	2
56 | Non-Globin Duplicate Reads	0
57 | Reads used for Intron/Exon counts	2
58 | rRNA Reads	0
59 | Total Bases	152
60 | Total Mapped Pairs	1
61 | Total Reads	2
62 | Unique Mapping, Vendor QC Passed Reads	2
63 | Unpaired Reads	0
64 | Read Length	76
65 | Genes Detected	0
66 | Estimated Library Complexity	0
67 | Genes used in 3' bias	0
68 | Mean 3' bias	0
69 | Median 3' bias	0
70 | 3' bias Std	0
71 | 3' bias MAD_Std	0
72 | 3' Bias, 25th Percentile	0
73 | 3' Bias, 75th Percentile	0
74 | Median of Avg Transcript Coverage	0
75 | Median of Transcript Coverage Std	0
76 | Median of Transcript Coverage CV	0
77 | Median Exon CV	nan
78 | Exon CV MAD	nan
79 | 


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.bam.metrics.tsv:
--------------------------------------------------------------------------------
 1 | Sample	chr1.bam
 2 | Mapping Rate	1
 3 | Unique Rate of Mapped	1
 4 | Duplicate Rate of Mapped	0
 5 | Duplicate Rate of Mapped, excluding Globins	0
 6 | Base Mismatch	0.00968147
 7 | End 1 Mapping Rate	1.01474
 8 | End 2 Mapping Rate	0.985262
 9 | End 1 Mismatch Rate	0.00253608
10 | End 2 Mismatch Rate	0.0170406
11 | Expression Profiling Efficiency	0.807719
12 | High Quality Rate	0.884446
13 | Exonic Rate	0.807719
14 | Intronic Rate	0.131935
15 | Intergenic Rate	0.0274077
16 | Intragenic Rate	0.939654
17 | Ambiguous Alignment Rate	0.0329382
18 | High Quality Exonic Rate	0.835902
19 | High Quality Intronic Rate	0.108092
20 | High Quality Intergenic Rate	0.0240545
21 | High Quality Intragenic Rate	0.943994
22 | High Quality Ambiguous Alignment Rate	0.0319513
23 | Discard Rate	0
24 | rRNA Rate	4.18304e-06
25 | End 1 Sense Rate	0.495471
26 | End 2 Sense Rate	0.503206
27 | Avg. Splits per Read	0.261769
28 | Alternative Alignments	229158
29 | Chimeric Fragments	0
30 | Chimeric Alignment Rate	0
31 | Duplicate Reads	0
32 | End 1 Antisense	498854
33 | End 2 Antisense	477326
34 | End 1 Bases	82963728
35 | End 2 Bases	80553844
36 | End 1 Mapped Reads	1091628
37 | End 2 Mapped Reads	1059919
38 | End 1 Mismatches	210403
39 | End 2 Mismatches	1372688
40 | End 1 Sense	489897
41 | End 2 Sense	483486
42 | Exonic Reads	1737846
43 | Failed Vendor QC	122510
44 | High Quality Reads	1902928
45 | Intergenic Reads	58969
46 | Intragenic Reads	2021710
47 | Ambiguous Reads	70868
48 | Intronic Reads	283864
49 | Low Mapping Quality	186732
50 | Low Quality Reads	248619
51 | Mapped Duplicate Reads	0
52 | Mapped Reads	2151547
53 | Mapped Unique Reads	2151547
54 | Mismatched Bases	1583091
55 | Non-Globin Reads	2151547
56 | Non-Globin Duplicate Reads	0
57 | Reads used for Intron/Exon counts	2151547
58 | rRNA Reads	9
59 | Total Bases	163517572
60 | Total Mapped Pairs	1052544
61 | Total Reads	2503215
62 | Unique Mapping, Vendor QC Passed Reads	2151547
63 | Unpaired Reads	0
64 | Read Length	76
65 | Genes Detected	1842
66 | Estimated Library Complexity	0
67 | Genes used in 3' bias	236
68 | Mean 3' bias	0.511031
69 | Median 3' bias	0.5
70 | 3' bias Std	0.293274
71 | 3' bias MAD_Std	0.353802
72 | 3' Bias, 25th Percentile	0.266282
73 | 3' Bias, 75th Percentile	0.75
74 | Median of Avg Transcript Coverage	0.629976
75 | Median of Transcript Coverage Std	0.905514
76 | Median of Transcript Coverage CV	1.06256
77 | Median Exon CV	0.313763
78 | Exon CV MAD	0.308991
79 | 


--------------------------------------------------------------------------------
/src/Fasta.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Fasta.h
 3 | //  IntervalTree
 4 | //
 5 | //  Created by Aaron Graubert on 5/23/18.
 6 | //  Copyright © 2018 Aaron Graubert. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef Fasta_h
10 | #define Fasta_h
11 | 
12 | #include <string>
13 | #include <iostream>
14 | #include <fstream>
15 | #include <map>
16 | #include <unordered_map>
17 | #include <list>
18 | #include <bioio.hpp>
19 | #include <exception>
20 | 
21 | namespace rnaseqc {
22 |     struct fileException : public std::exception {
23 |         std::string error;
24 |         fileException(std::string msg) : error(msg) {};
25 |     };
26 |     
27 |     struct invalidContigException : public std::exception {
28 |         std::string error;
29 |         invalidContigException(std::string msg) : error(msg) {};
30 |     };
31 |     
32 |     typedef long long coord;
33 |     typedef unsigned long indexType;
34 |     typedef unsigned short chrom;
35 |     
36 |     static const double PAGE_SIZE = 1e6; // Size of each cache page (in bases)
37 |     static const unsigned short CACHE_SIZE = 10u; // How many pages are stored in the cache
38 |     
39 |     extern std::map<std::string, chrom> chromosomes;
40 |     
41 |     enum Strand {Forward, Reverse, Unknown};
42 |     chrom chromosomeMap(std::string);
43 |     
    class Fasta {
        // Represents an entire fasta file
        // Uses the bioio library for quickly retrieving sequences
        // Uses an internal LRU cache to minimize required reading
        bool _open; // presumably set by open() and reported by isOpen() — confirm in Fasta.cpp
        std::ifstream reader; // Input stream over the fasta file
        std::unordered_map<indexType, std::string> pageCache; // Cached sequence text, keyed by page index
        std::list<indexType> lru; // Page indices tracked for LRU eviction (ordering convention defined in Fasta.cpp)
        std::unordered_map<chrom, bioio::FastaContigIndex> contigIndex; // bioio index entry for each known contig
        void updateLRU(indexType);
        indexType pageForContig(chrom);
        std::string readSeq(chrom, coord);
        unsigned long calls, misses; // NOTE(review): look like cache lookup / miss counters — confirm in Fasta.cpp
    public:
        // Value-initializes every member (flags false, counters zero, containers empty)
        Fasta() : _open(), reader(), pageCache(), lru(), contigIndex(), calls(), misses() {};
        ~Fasta();
        void open(std::string&);
        // Sequence retrieval by contig and two coordinates; the second overload also takes a Strand
        // NOTE(review): coordinate base and inclusivity are not visible in this header — confirm in Fasta.cpp
        std::string getSeq(chrom, coord, coord);
        std::string getSeq(chrom, coord, coord, Strand);
        indexType pageForCoord(chrom, coord);
        coord pageOffset(indexType);
        bool isOpen() const;
        bool hasContig(chrom) const;
        
    };
69 |     
70 |     double gc(std::string&);
71 | }
72 | 
73 | #endif /* Fasta_h */
74 | 


--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 |   pull_request:
 7 |     branches:
 8 |       - master
 9 | 
10 | jobs:
  # macOS job: install dependencies with Homebrew, build with Makefile.osx,
  # download the test inputs and run the Makefile's `test` target.
  build-macos:

    runs-on:
    - macos-latest

    steps:
    # Check out the repository together with its submodules and Git LFS objects.
    - uses: actions/checkout@v2
      with:
        submodules: recursive
        lfs: true
    # Refresh Homebrew metadata; stdout is discarded to keep the log short.
    - name: Brew Update
      run: brew update >/dev/null
    # Best-effort reinstall; a failure here does not fail the job.
    - name: Reinstall
      run: brew reinstall xz curl
      continue-on-error: true
    - name: Install Deps
      run: brew install boost zlib curl samtools bzip2 xz && brew link --overwrite python@3.9
    # Install the Python package from ./python in editable mode (after pip, setuptools and numpy).
    - name: Pip install
      run: sudo python3 -m pip install --user --upgrade pip setuptools && sudo python3 -m pip install --user numpy && sudo python3 -m pip install --user -e ./python
    # The folded scalar (>) joins the lines below into ONE shell command, so the
    # single `export` covers both ZLIB_PATH and LZMA_PATH before make runs.
    - name: make
      run: >
        export ZLIB_PATH=$(ls /usr/local/Cellar/zlib/*/lib/libz.a)
        LZMA_PATH=$(ls /usr/local/Cellar/xz/*/lib/liblzma.a) &&
        make -f test_data/Makefile.osx
    # Fetch and unpack the test input archive inside test_data/.
    - name: Download Tests
      run: >
        cd test_data &&
        wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz &&
        tar xzf test_inputs.tar.gz &&
        cd ..
    # Run the test target as root with PYTHONPATH set to the workspace root.
    - name: Run Tests
      run: sudo bash -c "PYTHONPATH=$(pwd) make -f test_data/Makefile.osx test"
43 | 
  # Linux job: install dependencies with apt and pip, build with Makefile.linux,
  # download the test inputs and run the Makefile's `test` target.
  build-linux:

    runs-on:
    - ubuntu-latest

    steps:
    # Check out the repository together with its submodules and Git LFS objects.
    - uses: actions/checkout@v2
      with:
        submodules: recursive
        lfs: true
    # One folded shell command: apt packages, pip bootstrap via get-pip.py,
    # then numpy, the editable ./python package and a forced matplotlib reinstall.
    - name: Install deps
      run: >
        sudo apt-get update && sudo apt-get install -y cmake python3 python3-dev
        libboost-filesystem-dev libboost-regex-dev libboost-system-dev libbz2-dev
        liblzma-dev libpthread-stubs0-dev wget zlib1g-dev g++ &&
        curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && sudo
        python3 get-pip.py && python3 -m pip install --upgrade pip &&
        python3 -m pip install numpy && python3 -m pip install -e ./python &&
        python3 -m pip install --force-reinstall matplotlib
    - name: make
      run: make -f test_data/Makefile.linux
    # Fetch and unpack the test input archive inside test_data/.
    - name: Download Tests
      run: >
        cd test_data &&
        wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz &&
        tar xzf test_inputs.tar.gz &&
        cd ..
    - name: Run Tests
      run: make -f test_data/Makefile.linux test
73 | 


--------------------------------------------------------------------------------
/test_data/chr1.output/chr1.cram.metrics.tsv:
--------------------------------------------------------------------------------
 1 | Sample	chr1.cram
 2 | Mapping Rate	1
 3 | Unique Rate of Mapped	1
 4 | Duplicate Rate of Mapped	0
 5 | Duplicate Rate of Mapped, excluding Globins	0
 6 | Base Mismatch	0.00968147
 7 | End 1 Mapping Rate	1.01474
 8 | End 2 Mapping Rate	0.985262
 9 | End 1 Mismatch Rate	0.00253608
10 | End 2 Mismatch Rate	0.0170406
11 | Expression Profiling Efficiency	0.807719
12 | High Quality Rate	0.884446
13 | Exonic Rate	0.807719
14 | Intronic Rate	0.131935
15 | Intergenic Rate	0.0274077
16 | Intragenic Rate	0.939654
17 | Ambiguous Alignment Rate	0.0329382
18 | High Quality Exonic Rate	0.835902
19 | High Quality Intronic Rate	0.108092
20 | High Quality Intergenic Rate	0.0240545
21 | High Quality Intragenic Rate	0.943994
22 | High Quality Ambiguous Alignment Rate	0.0319513
23 | Discard Rate	0
24 | rRNA Rate	4.18304e-06
25 | End 1 Sense Rate	0.495471
26 | End 2 Sense Rate	0.503206
27 | Avg. Splits per Read	0.261769
28 | Alternative Alignments	229158
29 | Chimeric Fragments	0
30 | Chimeric Alignment Rate	0
31 | Duplicate Reads	0
32 | End 1 Antisense	498854
33 | End 2 Antisense	477326
34 | End 1 Bases	82963728
35 | End 2 Bases	80553844
36 | End 1 Mapped Reads	1091628
37 | End 2 Mapped Reads	1059919
38 | End 1 Mismatches	210403
39 | End 2 Mismatches	1372688
40 | End 1 Sense	489897
41 | End 2 Sense	483486
42 | Exonic Reads	1737846
43 | Failed Vendor QC	122510
44 | High Quality Reads	1902928
45 | Intergenic Reads	58969
46 | Intragenic Reads	2021710
47 | Ambiguous Reads	70868
48 | Intronic Reads	283864
49 | Low Mapping Quality	186732
50 | Low Quality Reads	248619
51 | Mapped Duplicate Reads	0
52 | Mapped Reads	2151547
53 | Mapped Unique Reads	2151547
54 | Mismatched Bases	1583091
55 | Non-Globin Reads	2151547
56 | Non-Globin Duplicate Reads	0
57 | Reads used for Intron/Exon counts	2151547
58 | rRNA Reads	9
59 | Total Bases	163517572
60 | Total Mapped Pairs	1052544
61 | Total Reads	2503215
62 | Unique Mapping, Vendor QC Passed Reads	2151547
63 | Unpaired Reads	0
64 | Read Length	76
65 | Genes Detected	1842
66 | Estimated Library Complexity	0
67 | Genes used in 3' bias	236
68 | Mean 3' bias	0.511031
69 | Median 3' bias	0.5
70 | 3' bias Std	0.293274
71 | 3' bias MAD_Std	0.353802
72 | 3' Bias, 25th Percentile	0.266282
73 | 3' Bias, 75th Percentile	0.75
74 | Median of Avg Transcript Coverage	0.629976
75 | Median of Transcript Coverage Std	0.905514
76 | Median of Transcript Coverage CV	1.06256
77 | Median Exon CV	0.313763
78 | Exon CV MAD	0.308991
79 | Fragment GC Content Mean	0.453382
80 | Fragment GC Content Std	0.104727
81 | Fragment GC Content Skewness	0.25671
82 | Fragment GC Content Kurtosis	-0.673033
83 | 


--------------------------------------------------------------------------------
/test_data/approx_diff.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import subprocess
 3 | import pandas as pd
 4 | import numpy as np
 5 | 
 6 | 
 7 | def main(args):
 8 |     if args.mode == 'tables':
 9 |         df = pd.read_csv(args.input1, index_col=0, header=2, sep='\t').join(
10 |             pd.read_csv(args.input2, index_col=0, header=2, sep='\t'),
11 |             how='outer',
12 |             rsuffix='_'
13 |         )
14 |     elif args.mode == 'metrics' or args.mode == 'fragments':
15 |         df = pd.read_csv(args.input1, sep='\t', index_col=0).join(
16 |             pd.read_csv(args.input2, sep='\t', index_col=0),
17 |             how='outer',
18 |             rsuffix='_',
19 | 
20 |         )
21 | 
22 | 
23 |     assert not (df[args.columns[0]].isna() ^ df[args.columns[1]].isna()).any(), df[df.isna().any(1)]
24 |     assert len(df[np.abs(df[args.columns[0]] - df[args.columns[1]]) > args.tolerance]) == 0, df[np.abs(df[args.columns[0]] - df[args.columns[1]]) > args.tolerance].head()
25 |     if args.mode == 'fragments':
26 |         assert len(set(pd.read_csv(args.input1, sep='\t', index_col=0).index) ^ set(pd.read_csv(args.input2, sep='\t', index_col=0).index)) == 0
27 | 
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the comparison.
    parser = argparse.ArgumentParser('legacy-test')
    parser.add_argument(
        'input1',
        help='First input file'
    )
    parser.add_argument(
        'input2',
        help='Second input file'
    )
    parser.add_argument(
        '-t', '--tolerance',
        nargs='?',
        type=float,
        # The text now states the real default (1e-6); it previously claimed 0.
        help="Tolerance for differing values. If not provided, this defaults to 1e-6,"
        " for near-exact comparison. If provided without argument, this defaults to .01,"
        " which is usually good for checking modern vs legacy counts (which vary"
        " slightly within Java's default precision). You can also provide a floating"
        " point number to manually specify tolerance",
        default=0.000001,
        const=0.01
    )
    parser.add_argument(
        '-m', '--mode',
        choices=[
            'metrics',
            'tables',
            'fragments'
        ],
        default='metrics',
        help="What type of input file is being compared. Default: metrics"
    )
    parser.add_argument(
        '-c', '--columns',
        nargs=2,
        help="Column names to load for 'tables'",
        # argparse only treats a *tuple* as per-argument metavars; a list is
        # rendered as its repr, repeated once per argument, in the help output.
        metavar=('COLUMN-A', 'COLUMN-B'),
        default=('Counts', 'RNA-SeQC')
    )
    args = parser.parse_args()
    main(args)
69 | 


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.metrics.tsv:
--------------------------------------------------------------------------------
 1 | Sample	downsampled.bam
 2 | Mapping Rate	0.354336
 3 | Unique Rate of Mapped	1
 4 | Duplicate Rate of Mapped	0
 5 | Duplicate Rate of Mapped, excluding Globins	0
 6 | Base Mismatch	0.0100175
 7 | End 1 Mapping Rate	0.359515
 8 | End 2 Mapping Rate	0.349158
 9 | End 1 Mismatch Rate	0.00267655
10 | End 2 Mismatch Rate	0.0175762
11 | Expression Profiling Efficiency	0.275876
12 | High Quality Rate	0.881642
13 | Exonic Rate	0.778571
14 | Intronic Rate	0.114795
15 | Intergenic Rate	0.0700429
16 | Intragenic Rate	0.893366
17 | Ambiguous Alignment Rate	0.0365912
18 | High Quality Exonic Rate	0.811945
19 | High Quality Intronic Rate	0.111951
20 | High Quality Intergenic Rate	0.0397759
21 | High Quality Intragenic Rate	0.923895
22 | High Quality Ambiguous Alignment Rate	0.0363289
23 | Discard Rate	0
24 | rRNA Rate	0.00747698
25 | End 1 Sense Rate	0.49428
26 | End 2 Sense Rate	0.507332
27 | Avg. Splits per Read	0.256852
28 | Alternative Alignments	275296
29 | Chimeric Fragments	0
30 | Chimeric Alignment Rate	0
31 | Duplicate Reads	0
32 | End 1 Antisense	437593
33 | End 2 Antisense	413825
34 | End 1 Bases	76069312
35 | End 2 Bases	73877928
36 | End 1 Mapped Reads	1000912
37 | End 2 Mapped Reads	972078
38 | End 1 Mismatches	203603
39 | End 2 Mismatches	1298496
40 | End 1 Sense	427694
41 | End 2 Sense	426143
42 | Exonic Reads	1536112
43 | Failed Vendor QC	541350
44 | High Quality Reads	1739471
45 | Intergenic Reads	138194
46 | Intragenic Reads	1762602
47 | Ambiguous Reads	72194
48 | Intronic Reads	226490
49 | Low Mapping Quality	3766708
50 | Low Quality Reads	233519
51 | Mapped Duplicate Reads	0
52 | Mapped Reads	1972990
53 | Mapped Unique Reads	1972990
54 | Mismatched Bases	1502099
55 | Non-Globin Reads	1961273
56 | Non-Globin Duplicate Reads	0
57 | Reads used for Intron/Exon counts	1972990
58 | rRNA Reads	14752
59 | Total Bases	149947240
60 | Total Mapped Pairs	963702
61 | Total Reads	6384776
62 | Unique Mapping, Vendor QC Passed Reads	5568130
63 | Unpaired Reads	0
64 | Read Length	76
65 | Genes Detected	11590
66 | Estimated Library Complexity	0
67 | Genes used in 3' bias	153
68 | Mean 3' bias	0.601911
69 | Median 3' bias	0.591125
70 | 3' bias Std	0.314833
71 | 3' bias MAD_Std	0.445799
72 | 3' Bias, 25th Percentile	0.375
73 | 3' Bias, 75th Percentile	0.928571
74 | Average Fragment Length	182.276
75 | Fragment Length Median	164
76 | Fragment Length Std	69.7425
77 | Fragment Length MAD_Std	56.3388
78 | Median of Avg Transcript Coverage	0.0201045
79 | Median of Transcript Coverage Std	0.141663
80 | Median of Transcript Coverage CV	1.44311
81 | Median Exon CV	0.587046
82 | Exon CV MAD	0.495332
83 | 


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.metrics.tsv:
--------------------------------------------------------------------------------
 1 | Sample	downsampled.bam
 2 | Mapping Rate	0.354336
 3 | Unique Rate of Mapped	1
 4 | Duplicate Rate of Mapped	0
 5 | Duplicate Rate of Mapped, excluding Globins	nan
 6 | Base Mismatch	0.0100151
 7 | End 1 Mapping Rate	0.359421
 8 | End 2 Mapping Rate	0.34904
 9 | End 1 Mismatch Rate	0.00267648
10 | End 2 Mismatch Rate	0.017572
11 | Expression Profiling Efficiency	0.272945
12 | High Quality Rate	0.881406
13 | Exonic Rate	0.770301
14 | Intronic Rate	0.159378
15 | Intergenic Rate	0.0700237
16 | Intragenic Rate	0.929679
17 | Ambiguous Alignment Rate	0
18 | High Quality Exonic Rate	0.803019
19 | High Quality Intronic Rate	0.157213
20 | High Quality Intergenic Rate	0.0397675
21 | High Quality Intragenic Rate	0.960232
22 | High Quality Ambiguous Alignment Rate	0
23 | Discard Rate	0.000297518
24 | rRNA Rate	0.00747647
25 | End 1 Sense Rate	0.494116
26 | End 2 Sense Rate	0.507167
27 | Avg. Splits per Read	0.256225
28 | Alternative Alignments	275296
29 | Chimeric Fragments	0
30 | Chimeric Alignment Rate	0
31 | Duplicate Reads	0
32 | End 1 Antisense	435833
33 | End 2 Antisense	412263
34 | End 1 Bases	76049552
35 | End 2 Bases	73853076
36 | End 1 Mapped Reads	1000652
37 | End 2 Mapped Reads	971751
38 | End 1 Mismatches	203545
39 | End 2 Mismatches	1297746
40 | End 1 Sense	425694
41 | End 2 Sense	424254
42 | Exonic Reads	1519796
43 | Failed Vendor QC	541350
44 | High Quality Reads	1739006
45 | Intergenic Reads	138156
46 | Intragenic Reads	1834247
47 | Ambiguous Reads	0
48 | Intronic Reads	314451
49 | Low Mapping Quality	3766708
50 | Low Quality Reads	233397
51 | Mapped Duplicate Reads	0
52 | Mapped Reads	1972990
53 | Mapped Unique Reads	1972990
54 | Mismatched Bases	1501291
55 | Non-Globin Reads	0
56 | Non-Globin Duplicate Reads	0
57 | Reads used for Intron/Exon counts	1972403
58 | rRNA Reads	14751
59 | Split Reads	395803
60 | Total Bases	149902628
61 | Total Mapped Pairs	963448
62 | Total Reads	6384776
63 | Unique Mapping, Vendor QC Passed Reads	5568130
64 | Unpaired Reads	0
65 | Read Length	76
66 | Genes Detected	11614
67 | Estimated Library Complexity	0
68 | Genes used in 3' bias	151
69 | Mean 3' bias	0.595942
70 | Median 3' bias	0.574761
71 | 3' bias Std	0.314981
72 | 3' bias MAD_Std	0.445135
73 | 3' Bias, 25th Percentile	0.374545
74 | 3' Bias, 75th Percentile	0.928571
75 | Average Fragment Length	182.276
76 | Fragment Length Median	164
77 | Fragment Length Std	69.7425
78 | Fragment Length MAD_Std	56.3388
79 | Median of Avg Transcript Coverage	0.0202775
80 | Median of Transcript Coverage Std	0.141889
81 | Median of Transcript Coverage CV	1.44268
82 | Median Exon CV	0.58593
83 | Exon CV MAD	0.495322
84 | 


--------------------------------------------------------------------------------
/python/rnaseqc/__main__.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import sys
 3 | import os
 4 | import subprocess
 5 | 
 6 | def get_filepath(name):
 7 |     # return os.path.join(os.path.dirname(os.path.abspath(__file__)), name)
 8 |     return 'rnaseqc.{}'.format(name)
 9 | 
if __name__ == '__main__':
    # (subcommand, module name, summary) in the order the subparsers are
    # registered; the summary is used as both help and description text.
    subcommand_table = (
        ('run', 'run',
         'A light wrapper with some convenience functions to run RNA-SeQC'),
        ('aggregate', 'aggregate',
         'Aggregate RNA-SeQC outputs from multiple samples'),
        ('notebook', 'create_notebook',
         'Generate a notebook with figures comparing outputs from multiple samples'),
        ('report', 'report',
         'Generate PDF figures from aggregated RNA-SeQC results'),
        ('insert-size', 'insert_size_intervals',
         'Generate a BED file with long (>1000bp), high-mappability intervals for estimating insert sizes'),
        ('legacy-exons', 'legacy_exon_remap',
         'Renames exons in exon_reads.gct file from RNA-SeQC 2 to use naming convention from RNA-SeQC 1.1.x'),
    )

    parser = argparse.ArgumentParser('rnaseqc')
    subparsers = parser.add_subparsers(dest='command')

    commands = {}
    for subcommand, module_name, summary in subcommand_table:
        commands[subcommand] = get_filepath(module_name)
        # add_help=False keeps -h/--help out of this stub parser, so it lands
        # in `remainder` and is forwarded to the real subcommand module.
        subparsers.add_parser(
            subcommand,
            help=summary,
            description=summary,
            add_help=False
        )

    args, remainder = parser.parse_known_args()
    if args.command not in commands:
        parser.print_usage()
        sys.exit('A valid subcommand must be provided.')

    # Replace this process with `python -m rnaseqc.<module> <remaining args>`
    os.execvp(sys.executable, [sys.executable, '-m', commands[args.command]] + remainder)
72 | 


--------------------------------------------------------------------------------
/python/rnaseqc/create_notebook.py:
--------------------------------------------------------------------------------
 1 | from . import nb_encode as nbe
 2 | import argparse
 3 | import subprocess
 4 | 
 5 | 
 6 | def main(args):
 7 | 
 8 |     nb = nbe.Notebook()
 9 | 
10 |     nb.add_markdown_cell(
11 |         '# RNA-SeQC metrics report',
12 |     )
13 | 
14 |     nb.add_code_cell([
15 |         'import pandas as pd',
16 |         'import qtl.io',
17 |         'import rnaseqc.report'
18 |     ])
19 | 
20 |     cell = [
21 |         "# load inputs",
22 |         "metrics_df = pd.read_csv('{}', sep='\\t', index_col=0)".format(args.metrics),
23 |     ]
24 |     if args.tpm is not None:
25 |         cell.append("tpm_df = qtl.io.read_gct('{}')".format(args.tpm))
26 |     if args.cohort is not None:
27 |         cell.append("cohort_s = pd.read_csv('{}', sep='\\t', index_col=0, header=None).squeeze('columns')".format(args.cohort))
28 |     if args.date is not None:
29 |         cell.append("date_s = pd.read_csv('{}', sep='\\t', index_col=0, header=None).squeeze('columns')".format(args.date))
30 |     if args.insert_size is not None:
31 |         cell.append("insertsize_df = pd.read_csv('{}', sep='\\t', index_col=0)".format(args.insert_size))
32 |     nb.add_code_cell(cell)
33 | 
34 |     nb.add_code_cell([
35 |         "thresholds = {'Exonic Rate': 0.7}",
36 |         'rnaseqc.report.plot_qc_figures(metrics_df, cohort_s={}, cohort_colors=None, date_s={},'.format(
37 |             'cohort_s' if args.cohort is not None else 'None', 'date_s' if args.date is not None else 'None'),
38 |         '                               show_legend=True, ms=12, alpha=1, highlight_ids=None,',
39 |         '                               thresholds=thresholds, insertsize_df={}, tpm_df={})'.format(
40 |             'insertsize_df' if args.insert_size is not None else 'None', 'tpm_df' if args.tpm is not None else 'None'),
41 |     ])
42 | 
43 |     nb.add_code_cell('')
44 |     nb.write(args.output)
45 | 
46 | 
47 | 
if __name__ == '__main__':
    # Command-line entry point: write the notebook, then execute it in place.
    parser = argparse.ArgumentParser('rnaseqc-plot')
    parser.add_argument('metrics', help='Aggregated metrics')
    parser.add_argument('output', type=argparse.FileType('w'),
                        help="Output python notebook")
    parser.add_argument('-t', '--tpm', default=None, help='Aggregated TPM')
    parser.add_argument('-i', '--insert-size', default=None,
                        help='Aggregated insert size distributions')
    parser.add_argument('-c', '--cohort', default=None,
                        help='TSV file mapping sample IDs to cohort/batch IDs')
    parser.add_argument('-d', '--date', default=None,
                        help='TSV file mapping sample IDs to dates')
    args = parser.parse_args()

    # generate notebook
    main(args)

    # Make sure everything is on disk before jupyter reads the file.
    # Closing a file that is already closed is a no-op.
    # NOTE(review): whether Notebook.write closes the handle is not visible here.
    args.output.close()

    # execute notebook
    # An argument list (no shell) keeps filenames with spaces or shell
    # metacharacters intact.
    subprocess.check_call([
        'jupyter', 'nbconvert', '--execute',
        '--ExecutePreprocessor.timeout=300', '--inplace',
        args.output.name,
    ])
67 | 


--------------------------------------------------------------------------------
/python/rnaseqc/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # Author: Francois Aguet
 3 | import argparse
 4 | import subprocess
 5 | from datetime import datetime
 6 | import os
 7 | import sys
 8 | 
 9 | def locate_rnaseqc():
10 |     """
11 |     Search PATH for executable, then try up two directories
12 |     (where compiled executable would exist in the git repo)
13 |     """
14 |     for path in os.environ['PATH'].split(os.pathsep):
15 |         exe = os.path.join(path, 'rnaseqc')
16 |         if test_rnaseqc(exe):
17 |             return exe
18 |     exe = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'rnaseqc')
19 |     if test_rnaseqc(exe):
20 |         return exe
21 |     print("Unable to find rnaseqc executable", file=sys.stderr)
22 |     return 'rnaseqc' # Just try it and see what happens, I guess
23 | 
24 | def test_rnaseqc(path):
25 |     return os.path.isfile(path) and os.access(path, os.X_OK) and subprocess.run([path, '--version'], stdout=subprocess.PIPE).stdout.startswith(b'RNASeQC 2')
26 | 
if __name__ == '__main__':
    # Command-line entry point: run the rnaseqc executable, then gzip its
    # GCT (and optional coverage) outputs.
    parser = argparse.ArgumentParser(description='Wrapper for RNA-SeQC 2')
    parser.add_argument('genes_gtf', type=str, help='Gene annotation GTF')
    parser.add_argument('bam_file', type=str, help='BAM file')
    parser.add_argument('prefix', type=str, default='Reads', help='Prefix for output files; usually sample_id')
    parser.add_argument('-o', '--output_dir', default=os.getcwd(), help='Output directory')
    parser.add_argument('-q', '--mapping-quality', default=None, type=int, help="Lower bound on read quality for reads used in coverage metrics")
    parser.add_argument('-m', '--mismatch-threshold', default=None, type=int, help="Maximum allowed mismatches in a read while still used for coverage metrics")
    parser.add_argument('-c', '--coverage', action='store_true', help="Include raw coverage metrics in a separate output table. By default, only summary statistics are included in metrics")
    parser.add_argument('--stranded', default=None, choices=['rf', 'fr'], help='Strandedness for stranded libraries')
    parser.add_argument('--bed', default=None, help='BED file with intervals for estimating insert size distribution')
    args = parser.parse_args()

    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running RNA-SeQC', flush=True)

    # Build an argument list and run it without a shell, so paths containing
    # spaces or shell metacharacters reach rnaseqc unchanged.
    cmd = [
        locate_rnaseqc(), args.genes_gtf, args.bam_file, args.output_dir,
        '-s', args.prefix,
        '-vv',
    ]
    if args.stranded is not None:
        cmd += ['--stranded', args.stranded]
    if args.bed is not None:
        cmd += ['--bed', args.bed]
    if args.mapping_quality is not None:
        cmd += ['--mapping-quality', str(args.mapping_quality)]
    if args.mismatch_threshold is not None:
        cmd += ['--base-mismatch', str(args.mismatch_threshold)]
    if args.coverage:
        cmd.append('--coverage')
    print('  * command: "{}"'.format(' '.join(cmd)), flush=True)
    subprocess.check_call(cmd)

    # gzip GCTs
    # The outputs are looked up inside output_dir (the directory handed to
    # rnaseqc above) rather than the current working directory.
    gct_suffixes = ('exon_reads.gct', 'gene_tpm.gct', 'gene_reads.gct', 'gene_fragments.gct')
    gct_paths = [
        os.path.join(args.output_dir, '{}.{}'.format(args.prefix, suffix))
        for suffix in gct_suffixes
    ]
    subprocess.check_call(['gzip'] + gct_paths)
    if args.coverage:
        subprocess.check_call(['gzip', os.path.join(args.output_dir, f'{args.prefix}.coverage.tsv')])

    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished RNA-SeQC', flush=True)
64 | 


--------------------------------------------------------------------------------
/.github/workflows/Deployment.yml:
--------------------------------------------------------------------------------
 1 | name: Deployment
 2 | on:
 3 |   release:
 4 |     types:
 5 |       - published
 6 | 
 7 | jobs:
  # macOS release job: build the executable, then upload the gzipped binary
  # and a full-source tarball as assets of the published release.
  build-macos:

    runs-on:
    - macos-latest

    steps:
    # Check out the repository together with its submodules.
    - uses: actions/checkout@v2
      with:
        submodules: recursive
    # Refresh Homebrew metadata; stdout is discarded to keep the log short.
    - name: Brew Update
      run: brew update >/dev/null
    # Best-effort reinstall; a failure here does not fail the job.
    - name: Reinstall
      run: brew reinstall xz curl
      continue-on-error: true
    - name: Install Deps
      run: brew install boost zlib curl samtools bzip2 xz && brew link --overwrite python@3.9
    - name: Pip install
      run: sudo python3 -m pip install --user --upgrade pip setuptools && sudo python3 -m pip install --user numpy && sudo python3 -m pip install --user -e ./python
    # Archive the checked-out workspace from its parent directory, then move
    # the tarball back into the workspace so a later step can upload it.
    - name: Zip Source
      run: >
        cd .. && tar --strip-components=4 -czf rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz $GITHUB_WORKSPACE &&
        cd $GITHUB_WORKSPACE && mv ../rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz .
    # The folded scalar (>) joins the lines below into ONE shell command, so the
    # single `export` covers both ZLIB_PATH and LZMA_PATH. After the build the
    # binary is gzipped under a name that includes the release tag.
    - name: make
      run: >
        export ZLIB_PATH=$(ls /usr/local/Cellar/zlib/*/lib/libz.a)
        LZMA_PATH=$(ls /usr/local/Cellar/xz/*/lib/liblzma.a) &&
        make -f test_data/Makefile.osx &&
        gzip -c rnaseqc > rnaseqc.${{ github.event.release.tag_name }}.macos.gz && ls -l
    # Attach the gzipped macOS executable to the release.
    - name: Upload Executable
      uses: actions/upload-release-asset@v1
      env:
        GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }}
      with:
        upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.macos.gz
        asset_path: rnaseqc.${{ github.event.release.tag_name }}.macos.gz
        asset_name: rnaseqc.${{ github.event.release.tag_name }}.macos.gz
        asset_content_type: application/gzip
    # Attach the full-source tarball to the release.
    - name: Upload Zip
      uses: actions/upload-release-asset@v1
      env:
        GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }}
      with:
        upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz
        asset_path: rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz
        asset_name: rnaseqc.${{ github.event.release.tag_name }}.full_source.tar.gz
        asset_content_type: application/gzip
54 | 
  # Linux release job: build the executable and upload the gzipped binary
  # as an asset of the published release.
  build-linux:

    runs-on:
    - ubuntu-latest

    steps:
    # Check out the repository together with its submodules.
    - uses: actions/checkout@v2
      with:
        submodules: recursive
    # One folded shell command: apt packages, pip bootstrap via get-pip.py,
    # then numpy, the editable ./python package and a forced matplotlib reinstall.
    - name: Install deps
      run: >
        sudo apt-get update && sudo apt-get install -y cmake python3 python3-dev
        libboost-filesystem-dev libboost-regex-dev libboost-system-dev libbz2-dev
        liblzma-dev libpthread-stubs0-dev wget zlib1g-dev g++ &&
        curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && sudo
        python3 get-pip.py && python3 -m pip install --upgrade pip &&
        python3 -m pip install numpy && python3 -m pip install -e ./python &&
        python3 -m pip install --force-reinstall matplotlib
    # Build, then gzip the binary under a name that includes the release tag.
    - name: make
      run: make -f test_data/Makefile.linux && gzip -c rnaseqc > rnaseqc.${{ github.event.release.tag_name }}.linux.gz
    # Attach the gzipped Linux executable to the release.
    - name: Upload Executable
      uses: actions/upload-release-asset@v1
      env:
        GITHUB_TOKEN: ${{ secrets.OAUTH_TOKEN }}
      with:
        upload_url: https://uploads.github.com/repos/getzlab/rnaseqc/releases/${{ github.event.release.id }}/assets?name=rnaseqc.${{ github.event.release.tag_name }}.linux.gz
        asset_path: rnaseqc.${{ github.event.release.tag_name }}.linux.gz
        asset_name: rnaseqc.${{ github.event.release.tag_name }}.linux.gz
        asset_content_type: application/gzip
84 | 


--------------------------------------------------------------------------------
/src/BamReader.h:
--------------------------------------------------------------------------------
  1 | //
//  BamReader.h
  3 | //  RNA-SeQC
  4 | //
  5 | //  Created by Aaron Graubert on 10/3/18.
  6 | //  Copyright © 2018 Aaron Graubert. All rights reserved.
  7 | //
  8 | 
  9 | #ifndef BamReader_h
 10 | #define BamReader_h
 11 | 
 12 | #include "Fasta.h"
 13 | #include <stdio.h>
 14 | #include <mutex>
 15 | #include <string>
 16 | #include <set>
 17 | #include <SeqLib/BamReader.h>
 18 | #include <SeqLib/BamHeader.h>
 19 | #include <SeqLib/BamRecord.h>
 20 | #include <htslib/cram/cram.h> // I really don't like using unofficial APIs, but not much choice here.
 21 | 
 22 | namespace rnaseqc {
 23 | 
 24 |     struct referenceHTSMismatch : public std::exception {
 25 |         std::string error;
 26 |         referenceHTSMismatch(std::string msg) : error(msg) {};
 27 |     };
 28 | 
 29 | 
 30 |     class SynchronizedReader {
 31 |         std::mutex mtx;
 32 |     protected:
 33 |         unsigned long read_count;
 34 |     public:
 35 |         SynchronizedReader() : mtx(), read_count() {
 36 |             
 37 |         }
 38 |         
 39 |         void lock()
 40 |         {
 41 |             this->mtx.lock();
 42 |         }
 43 |         
 44 |         void unlock()
 45 |         {
 46 |             this->mtx.unlock();
 47 |         }
 48 |         
 49 |         unsigned long get_count() const
 50 |         {
 51 |             return this->read_count;
 52 |         }
 53 |     };
 54 |     
 55 |     class SeqlibReader : public SynchronizedReader {
 56 |         SeqLib::BamReader bam;
 57 |         std::string reference_path;
 58 |         std::set<chrom> valid_chroms;
 59 |         bool user_cram_reference;
 60 |     public:
 61 |         
 62 |         SeqlibReader() : reference_path(), valid_chroms(), user_cram_reference(false) {}
 63 |         
 64 |         bool next(SeqLib::BamRecord&);
 65 |         
 66 |         const SeqLib::BamHeader getHeader() const {
 67 |             return this->bam.Header();
 68 |         }
 69 |         
 70 |         bool open(std::string filepath) {
 71 |             if (this->reference_path.length()) {
 72 |                 auto htsfile = hts_open(filepath.c_str(), "r");
 73 |                 hts_set_fai_filename(htsfile, this->reference_path.c_str());
 74 |                 if (htsfile->format.format == htsExactFormat::cram) {
 75 |                     this->user_cram_reference = true;
 76 |                     // Cram handling is very dumb. All of this nonsense is just because htslib is incredibly opaque about reference handling
 77 |                     // Even with a user-provided reference, htslib only uses it if the MD5 matches
 78 |                     // So here we load up the file, check if it's a cram, then get a list of chromosomes that htslib decides to use
 79 |                     cram_fd *cram = static_cast<cram_fd*>(htsfile->fp.cram);
 80 |                     if (cram->refs && cram->refs->nref > 0)
 81 |                         for (unsigned int i = 0; i < cram->refs->nref; ++i)
 82 |                             if (this->reference_path == std::string(cram->refs->ref_id[i]->fn))
 83 |                                 this->valid_chroms.insert(
 84 |                                     chromosomeMap(cram->refs->ref_id[i]->name)
 85 |                                 );
 86 |                     this->bam.SetCramReference(this->reference_path); // Consider moving out of if statement, if there's any meaningful use to having a reference set on a non-cram
 87 |                 }
 88 |                 hts_close(htsfile);
 89 |             }
 90 |             this->bam.Open(filepath);
 91 |             return this->bam.IsOpen();
 92 |         }
 93 |         
 94 |         void addReference(std::string filepath) {
 95 |             this->reference_path = filepath;
 96 |         }
 97 |         
 98 |         inline bool validateChromosome(const chrom c) {
 99 |             // For crams, we only validate chromosomes which matched our reference. Otherwise yes!
100 |             return this->user_cram_reference ? this->valid_chroms.count(c) > 0 : true;
101 |         }
102 |         
103 |     };
104 |     
105 |     typedef SeqLib::BamRecord Alignment;
106 | }
107 | 
108 | #endif /* BamReader_h */
109 | 


--------------------------------------------------------------------------------
/python/rnaseqc/legacy_exon_remap.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | from tqdm import tqdm
  3 | import subprocess
  4 | import csv
  5 | import shutil
  6 | from qtl.annotation import Annotation
  7 | import tempfile
  8 | 
  9 | def run(args):
 10 |     print("Parsing GTF")
 11 |     gtf = Annotation(args.gtf.name)
 12 |     print("Parsing GCT")
 13 |     numRows = int(subprocess.check_output(f"wc -l {args.gct.name}", shell=True).decode().strip().split()[0]) - 3
 14 |     header = ''.join([next(args.gct), next(args.gct)])
 15 |     reader = csv.DictReader(args.gct, delimiter='\t')
 16 |     w = tempfile.NamedTemporaryFile('w')
 17 |     w.write(header)
 18 |     writer = csv.DictWriter(w, reader.fieldnames, delimiter='\t', lineterminator='\n')
 19 |     writer.writeheader()
 20 |     current = None
 21 |     features = []
 22 |     for line in tqdm(reader, total=numRows):
 23 |         gene = '_'.join(line['Name'].split('_')[:-1])
 24 |         if gene != current:
 25 |             if current is not None:
 26 |                 ref = gtf.get_gene(current)
 27 |                 try:
 28 |                     if len(ref):
 29 |                         ref = ref[0]
 30 |                 except:
 31 |                     pass
 32 |                 exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
 33 |                 raw_size = len(exons)
 34 |                 for exon in [exon for exon in exons]:
 35 |                     try:
 36 |                         if exon.isdigit() and int(exon) <= raw_size:
 37 |                             exons[current+'_'+exon] = exons[exon]
 38 |                     except:
 39 |                         pass
 40 |                 features.sort(
 41 |                     key=lambda feat:(
 42 |                         1 if exons[feat['Name']].length == 1 else 0,
 43 |                         exons[feat['Name']].start_pos,
 44 |                         exons[feat['Name']].end_pos
 45 |                     )
 46 |                 )
 47 |                 for i in range(len(features)):
 48 |                     parts = features[i]['Name'].split('_')
 49 |                     prefix = '_'.join(parts[:-1])
 50 |                     suffix = parts[-1]
 51 |                     if exons[features[i]['Name']].length == 1:
 52 |                         features[i][reader.fieldnames[-1]] = 0
 53 |                     suffix = str(i)
 54 |                     features[i]['Name'] = prefix+'_'+suffix
 55 |                 writer.writerows(features)
 56 |             current = gene
 57 |             features = []
 58 |         features.append({k:v for k,v in line.items()})
 59 |     if len(features):
 60 |         ref = gtf.get_gene(current)
 61 |         try:
 62 |             if len(ref):
 63 |                 ref = ref[0]
 64 |         except:
 65 |             pass
 66 |         exons = {exon.id:exon for transcript in ref.transcripts for exon in transcript.exons}
 67 |         raw_size = len(exons)
 68 |         for exon in [exon for exon in exons]:
 69 |             try:
 70 |                 if exon.isdigit() and int(exon) <= raw_size:
 71 |                     exons[current+'_'+exon] = exons[exon]
 72 |             except:
 73 |                 pass
 74 |         features.sort(
 75 |             key=lambda feat:(
 76 |                 1 if exons[feat['Name']].length == 1 else 0,
 77 |                 exons[feat['Name']].start_pos,
 78 |                 exons[feat['Name']].end_pos
 79 |             )
 80 |         )
 81 |         for i in range(len(features)):
 82 |             prefix, suffix = features[i]['Name'].split('_')
 83 |             if exons[features[i]['Name']].length == 1:
 84 |                 features[i]['Counts'] = 0
 85 |             suffix = str(i)
 86 |             features[i]['Name'] = prefix+'_'+suffix
 87 |             writer.writerows(features)
 88 |     print("Cleaning up")
 89 |     w.flush()
 90 |     args.gct.close()
 91 |     shutil.copyfile(args.gct.name, args.gct.name+'.bak')
 92 |     shutil.copyfile(w.name, args.gct.name)
 93 | 
 94 | 
 95 | def main():
 96 |     parser = argparse.ArgumentParser('flipper')
 97 |     parser.add_argument('gct', type=argparse.FileType('r'), help="RNA-SeQC 2 Exon reads gct file")
 98 |     parser.add_argument('gtf', type=argparse.FileType('r'), help="Reference GTF for the exons")
 99 |     args = parser.parse_args()
100 |     run(args)
101 | 
102 | if __name__ == '__main__':
103 |     main()
104 | 


--------------------------------------------------------------------------------
/python/rnaseqc/nb_encode.py:
--------------------------------------------------------------------------------
  1 | # Author: Aaron Graubert  https://github.com/agraubert
  2 | import nbformat as nbf
  3 | import base64
  4 | import io
  5 | import json
  6 | import sys
  7 | 
  8 | def trim(docstring):
  9 |     if not docstring:
 10 |         return ''
 11 |     # Convert tabs to spaces (following the normal Python rules)
 12 |     # and split into a list of lines:
 13 |     lines = docstring.expandtabs().splitlines()
 14 |     # Determine minimum indentation (first line doesn't count):
 15 |     indent = sys.maxsize
 16 |     for line in lines[1:]:
 17 |         stripped = line.lstrip()
 18 |         if stripped:
 19 |             indent = min(indent, len(line) - len(stripped))
 20 |     # Remove indentation (first line is special):
 21 |     trimmed = [lines[0].strip()]
 22 |     if indent < sys.maxsize:
 23 |         for line in lines[1:]:
 24 |             trimmed.append(line[indent:].rstrip())
 25 |     # Strip off trailing and leading blank lines:
 26 |     while trimmed and not trimmed[-1]:
 27 |         trimmed.pop()
 28 |     while trimmed and not trimmed[0]:
 29 |         trimmed.pop(0)
 30 |     # Return a single string:
 31 |     return '\n'.join(trimmed)
 32 | 
 33 | def encode_figure(figure, **kwargs):
 34 |     img = io.BytesIO()
 35 |     figure.savefig(img, **kwargs)
 36 |     img.seek(0,0)
 37 |     return nbf.v4.new_output(
 38 |         'display_data',
 39 |         {
 40 |             'text/plain': [repr(figure)],
 41 |             'image/png': base64.b64encode(img.read()).decode()
 42 |         }
 43 |     )
 44 | 
 45 | def encode_dataframe(df, n, **kwargs):
 46 |     return nbf.v4.new_output(
 47 |         'execute_result',
 48 |         {
 49 |             'text/plain': [df.to_string()],
 50 |             'text/html': [df.to_html(**kwargs)]
 51 |         },
 52 |         execution_count=n
 53 |     )
 54 | 
 55 | def encode_output(obj, n):
 56 | 
 57 |     return nbf.v4.new_output(
 58 |         'execute_result',
 59 |         {'text/plain': [repr(obj)]},
 60 |         execution_count=n
 61 |     ) if obj is not None else None
 62 | 
 63 | class Notebook(object):
 64 |     """
 65 |     Wrapper to nbformat Notebook
 66 |     """
 67 |     def __init__(self, header=None):
 68 |         self.nb = nbf.v4.new_notebook()
 69 |         if header is not None:
 70 |             self.add_markdown_cell(header, '---', 'Created by the nb_encode api')
 71 |         self.exec_count = 1
 72 | 
 73 |     def add_markdown_cell(self, *lines):
 74 |         lines = [line.rstrip()+'\n' for line in lines]
 75 |         lines[-1] = lines[-1][:-1]
 76 |         self.nb['cells'].append(nbf.v4.new_markdown_cell(lines))
 77 | 
 78 |     def add_code_cell(self, source, *outputs, **kwargs):
 79 |         if isinstance(source, list):
 80 |             source = '\n'.join(line.rstrip() for line in source)
 81 |         self.nb['cells'].append(nbf.v4.new_code_cell(
 82 |             source,
 83 |             execution_count=self.exec_count,
 84 |             outputs=[
 85 |                 encode_output(output, self.exec_count)
 86 |                 if not isinstance(output, nbf.notebooknode.NotebookNode)
 87 |                 else output
 88 |                 for output in outputs
 89 |                 if output is not None
 90 |             ],
 91 |             **kwargs
 92 |         ))
 93 |         self.exec_count += 1
 94 | 
 95 |     def write(self, dest):
 96 |         if isinstance(dest, str):
 97 |             with open(dest, 'w') as w:
 98 |                 nbf.write(self.nb, w)
 99 |         else:
100 |             nbf.write(self.nb, dest.name)
101 | 
def encode_plot_cell(cell, source, result, figure):
    """Build a code cell holding a text result followed by a rendered figure.

    Args:
        cell: execution count recorded on the text output.
        source: source text of the cell.
        result: text stored as the ``text/plain`` execute result.
        figure: object with ``savefig``; saved to memory and embedded as a
            base64 string under ``image/png``.
    """
    buffer = io.BytesIO()
    figure.savefig(buffer)
    encoded = base64.b64encode(buffer.getvalue())
    text_output = nbf.v4.new_output(
        'execute_result',
        {'text/plain': [result]},
        execution_count=cell
    )
    image_output = nbf.v4.new_output(
        'display_data',
        {
            'text/plain': [repr(figure)],
            'image/png': encoded.decode()
        }
    )
    return nbf.v4.new_code_cell(source, outputs=[text_output, image_output])
127 | 
def encode_standard_cell(cell):
    """Build a code cell from the history variables of cell number ``cell``.

    The source is looked up by evaluating the name ``_i<cell>`` and the
    result by evaluating ``_<cell>``.
    NOTE(review): these look like IPython's input/output history names, so
    the function presumably has to run where they are defined - confirm.

    Args:
        cell: integer cell number (formatted with ``%d``).

    Returns:
        A code cell whose outputs hold the ``repr`` of the result, or no
        outputs when the result cannot be obtained.

    Raises:
        NameError: if ``_i<cell>`` is not defined.
    """
    # '%d' only ever interpolates an integer, so the evaluated text is
    # always a plain identifier.
    source = eval('_i%d'%cell)
    try:
        result = repr(eval('_%d'%cell))
    except Exception:
        # No stored result (or its repr failed): emit the cell without
        # outputs. KeyboardInterrupt/SystemExit are left to propagate.
        result = None
    output_cell = nbf.v4.new_code_cell(
        source,
        outputs=([
            nbf.v4.new_output(
                'execute_result',
                {'text/plain': [result]},
                execution_count=cell
            )
        ] if result is not None else [])
    )
    return output_cell
145 | 


--------------------------------------------------------------------------------
/python/rnaseqc/insert_size_intervals.py:
--------------------------------------------------------------------------------
  1 | # Author: Francois Aguet
  2 | import numpy as np
  3 | import os
  4 | import pyBigWig
  5 | import argparse
  6 | import qtl.annotation as annotation
  7 | 
  8 | 
  9 | def intersect_overlap(intervals):
 10 |     """
 11 |     intervals: list of tuples or 2-element lists
 12 | 
 13 |     breaks intersections into separate intervals
 14 |     e.g.: [0,6],[2,8] ->[0,1],[2,6],[7,8]
 15 |     """
 16 |     intervals = intervals.copy()
 17 |     intervals.sort(key=lambda x: (x[0],x[1]))
 18 |     intersected = []
 19 |     union = list(intervals[0])
 20 |     bounds = [intervals[0]]
 21 |     for i in intervals[1:]:
 22 |         if i[0] <= union[1]:  # overlap w/ previous
 23 |             if i[1] > union[1]:  # only extend if larger
 24 |                 union[1] = i[1]
 25 |             bounds.append(i)
 26 |         else:
 27 |             # process bounds
 28 |             if len(bounds)>1:
 29 |                 p = np.unique([i[0] for i in bounds]+[i[1]+1 for i in bounds])
 30 |                 intersected.extend([[i,j-1] for i,j in zip(p[:-1],p[1:])])
 31 |             else:
 32 |                 intersected.append(bounds[0])
 33 |             # reset
 34 |             bounds = [i]
 35 |             union = list(i)
 36 |     # process last
 37 |     if len(bounds)>1:
 38 |         p = np.unique([i[0] for i in bounds]+[i[1]+1 for i in bounds])
 39 |         intersected.extend([[i,j-1] for i,j in zip(p[:-1], p[1:])])
 40 |     else:
 41 |         intersected.append(bounds[0])
 42 | 
 43 |     return intersected
 44 | 
 45 | 
 46 | def parse_intervals(annot, mappability_bw, output_dir, prefix, min_length=1000, min_mappability=0.95):
 47 |     """Write intervals to BED format"""
 48 | 
 49 |     exclude = set(['retained_intron', 'readthrough_transcript'])
 50 | 
 51 |     bw = pyBigWig.open(mappability_bw)
 52 |     gintervals = {}
 53 |     for c in annot.chr_list:
 54 |         exon_coords = []
 55 |         for g in annot.chr_genes[c]:
 56 |             for t in g.transcripts:
 57 |                 if (t.type not in exclude) and (('tags' not in t.attributes) or len(set(t.attributes['tags']).intersection(exclude)) == 0):
 58 |                     for e in t.exons:
 59 |                         exon_coords.append((e.start_pos, e.end_pos))
 60 | 
 61 |         v = np.array(intersect_overlap(exon_coords))  # intersect all exons on current chr
 62 |         l = v[:,1]-v[:,0]+1
 63 |         gintervals[c] = v[l >= min_length, :]
 64 |         # filter by mappability
 65 |         gintervals[c] = np.array([i for i in gintervals[c] if bw.stats(c, int(i[0])-1, int(i[1]), exact=True)[0] >= min_mappability])
 66 |     bw.close()
 67 | 
 68 |     # all intervals
 69 |     with open(os.path.join(output_dir, f'{prefix}_geq{min_length}bp.bed'), 'w') as f:
 70 |         f.write('#chr\tstart\tend\n')
 71 |         for c in annot.chr_list:
 72 |             for i in range(gintervals[c].shape[0]):
 73 |                 f.write(f'{c}\t{gintervals[c][i][0]-1}\t{gintervals[c][i][1]}\n')  # BED is 0-indexed, [..)
 74 | 
 75 |     # single-isoform genes
 76 |     gintervals_1iso = {}
 77 |     for c in annot.chr_list:
 78 |         ec = []
 79 |         for g in annot.chr_genes[c]:
 80 |             if len(g.transcripts) == 1:
 81 |                 for e in g.transcripts[0].exons:
 82 |                     if e.length >= min_length:
 83 |                         ec.append([e.start_pos, e.end_pos])
 84 |         ec = list(set([tuple(i) for i in ec]).intersection(set([tuple(i) for i in gintervals[c]])))
 85 |         ec.sort(key=lambda x: (x[0],x[1]))
 86 |         gintervals_1iso[c] = np.array(ec)
 87 | 
 88 |     with open(os.path.join(output_dir, f'{prefix}_geq{min_length}bp_1iso.bed'), 'w') as f:
 89 |         f.write('#chr\tstart\tend\n')
 90 |         for c in annot.chr_list:
 91 |             for i in range(gintervals_1iso[c].shape[0]):
 92 |                 f.write(f'{c}\t{gintervals_1iso[c][i][0]-1}\t{gintervals_1iso[c][i][1]}\n')
 93 | 
 94 | 
 95 | if __name__=='__main__':
 96 | 
 97 |     parser = argparse.ArgumentParser(description='Parse long exons/UTRs with high mappability for estimating insert size distribution.')
 98 |     parser.add_argument('gtf_path', help='Reference annotation in GTF format.')
 99 |     parser.add_argument('mappability_bigwig', help='Mappability track in bigWig format.')
100 |     parser.add_argument('prefix', help='Prefix for output file names.')
101 |     parser.add_argument('--min-length', type=np.int32, default=1000, help='Minimum exon/UTR length for computing insert sizes. Default: 1000bp')
102 |     parser.add_argument('--min-mappability', type=np.float64, default=0.95, help='Minimum mappability for retained intervals. Default: 0.95')
103 |     parser.add_argument('--output-dir', default='.', help='Output directory.')
104 |     args = parser.parse_args()
105 | 
106 |     annot = annotation.Annotation(args.gtf_path, verbose=True)
107 |     parse_intervals(annot, args.mappability_bigwig, args.output_dir, args.prefix,
108 |                     min_length=args.min_length, min_mappability=args.min_mappability)
109 | 


--------------------------------------------------------------------------------
/test_data/legacy.output/downsampled.bam.fragmentSizes.txt:
--------------------------------------------------------------------------------
  1 | Fragment Size	Count
  2 | 48	1
  3 | 49	1
  4 | 53	1
  5 | 60	1
  6 | 68	1
  7 | 69	1
  8 | 76	1
  9 | 77	37
 10 | 78	37
 11 | 79	39
 12 | 80	48
 13 | 81	56
 14 | 82	48
 15 | 83	66
 16 | 84	61
 17 | 85	56
 18 | 86	64
 19 | 87	65
 20 | 88	82
 21 | 89	85
 22 | 90	117
 23 | 91	127
 24 | 92	126
 25 | 93	148
 26 | 94	143
 27 | 95	165
 28 | 96	152
 29 | 97	166
 30 | 98	170
 31 | 99	197
 32 | 100	211
 33 | 101	224
 34 | 102	311
 35 | 103	320
 36 | 104	350
 37 | 105	318
 38 | 106	343
 39 | 107	381
 40 | 108	363
 41 | 109	392
 42 | 110	419
 43 | 111	474
 44 | 112	464
 45 | 113	531
 46 | 114	566
 47 | 115	574
 48 | 116	522
 49 | 117	526
 50 | 118	521
 51 | 119	552
 52 | 120	590
 53 | 121	663
 54 | 122	633
 55 | 123	649
 56 | 124	696
 57 | 125	711
 58 | 126	676
 59 | 127	611
 60 | 128	640
 61 | 129	642
 62 | 130	620
 63 | 131	644
 64 | 132	682
 65 | 133	704
 66 | 134	721
 67 | 135	685
 68 | 136	672
 69 | 137	606
 70 | 138	618
 71 | 139	590
 72 | 140	623
 73 | 141	699
 74 | 142	670
 75 | 143	711
 76 | 144	678
 77 | 145	648
 78 | 146	683
 79 | 147	581
 80 | 148	579
 81 | 149	578
 82 | 150	546
 83 | 151	572
 84 | 152	551
 85 | 153	571
 86 | 154	589
 87 | 155	573
 88 | 156	594
 89 | 157	542
 90 | 158	527
 91 | 159	507
 92 | 160	504
 93 | 161	511
 94 | 162	470
 95 | 163	560
 96 | 164	478
 97 | 165	477
 98 | 166	558
 99 | 167	481
100 | 168	514
101 | 169	461
102 | 170	469
103 | 171	476
104 | 172	456
105 | 173	439
106 | 174	442
107 | 175	453
108 | 176	487
109 | 177	422
110 | 178	390
111 | 179	422
112 | 180	405
113 | 181	402
114 | 182	399
115 | 183	389
116 | 184	411
117 | 185	402
118 | 186	395
119 | 187	374
120 | 188	371
121 | 189	364
122 | 190	345
123 | 191	374
124 | 192	333
125 | 193	338
126 | 194	319
127 | 195	337
128 | 196	307
129 | 197	374
130 | 198	325
131 | 199	341
132 | 200	293
133 | 201	306
134 | 202	315
135 | 203	290
136 | 204	329
137 | 205	287
138 | 206	291
139 | 207	329
140 | 208	327
141 | 209	303
142 | 210	288
143 | 211	276
144 | 212	278
145 | 213	261
146 | 214	300
147 | 215	254
148 | 216	260
149 | 217	286
150 | 218	267
151 | 219	263
152 | 220	236
153 | 221	242
154 | 222	244
155 | 223	231
156 | 224	242
157 | 225	245
158 | 226	225
159 | 227	264
160 | 228	228
161 | 229	236
162 | 230	226
163 | 231	234
164 | 232	227
165 | 233	219
166 | 234	206
167 | 235	197
168 | 236	229
169 | 237	191
170 | 238	197
171 | 239	205
172 | 240	190
173 | 241	180
174 | 242	190
175 | 243	177
176 | 244	180
177 | 245	181
178 | 246	194
179 | 247	179
180 | 248	188
181 | 249	175
182 | 250	174
183 | 251	161
184 | 252	167
185 | 253	169
186 | 254	171
187 | 255	155
188 | 256	143
189 | 257	181
190 | 258	169
191 | 259	171
192 | 260	166
193 | 261	149
194 | 262	144
195 | 263	139
196 | 264	144
197 | 265	140
198 | 266	153
199 | 267	160
200 | 268	144
201 | 269	155
202 | 270	130
203 | 271	138
204 | 272	122
205 | 273	135
206 | 274	112
207 | 275	143
208 | 276	131
209 | 277	105
210 | 278	113
211 | 279	122
212 | 280	118
213 | 281	133
214 | 282	121
215 | 283	114
216 | 284	108
217 | 285	88
218 | 286	99
219 | 287	95
220 | 288	121
221 | 289	93
222 | 290	112
223 | 291	92
224 | 292	88
225 | 293	115
226 | 294	110
227 | 295	87
228 | 296	84
229 | 297	73
230 | 298	100
231 | 299	86
232 | 300	100
233 | 301	102
234 | 302	81
235 | 303	98
236 | 304	75
237 | 305	88
238 | 306	73
239 | 307	80
240 | 308	69
241 | 309	65
242 | 310	70
243 | 311	79
244 | 312	64
245 | 313	62
246 | 314	71
247 | 315	82
248 | 316	73
249 | 317	62
250 | 318	66
251 | 319	62
252 | 320	70
253 | 321	66
254 | 322	62
255 | 323	74
256 | 324	59
257 | 325	48
258 | 326	53
259 | 327	58
260 | 328	50
261 | 329	52
262 | 330	56
263 | 331	50
264 | 332	52
265 | 333	63
266 | 334	50
267 | 335	50
268 | 336	38
269 | 337	49
270 | 338	38
271 | 339	46
272 | 340	45
273 | 341	46
274 | 342	43
275 | 343	58
276 | 344	45
277 | 345	50
278 | 346	43
279 | 347	54
280 | 348	45
281 | 349	40
282 | 350	42
283 | 351	47
284 | 352	32
285 | 353	39
286 | 354	32
287 | 355	33
288 | 356	30
289 | 357	34
290 | 358	32
291 | 359	22
292 | 360	40
293 | 361	30
294 | 362	42
295 | 363	25
296 | 364	38
297 | 365	29
298 | 366	36
299 | 367	39
300 | 368	24
301 | 369	29
302 | 370	24
303 | 371	21
304 | 372	31
305 | 373	21
306 | 374	22
307 | 375	25
308 | 376	19
309 | 377	21
310 | 378	20
311 | 379	21
312 | 380	23
313 | 381	17
314 | 382	25
315 | 383	22
316 | 384	20
317 | 385	26
318 | 386	17
319 | 387	21
320 | 388	23
321 | 389	18
322 | 390	22
323 | 391	13
324 | 392	19
325 | 393	23
326 | 394	29
327 | 395	17
328 | 396	25
329 | 397	20
330 | 398	23
331 | 399	16
332 | 400	20
333 | 401	7
334 | 402	12
335 | 403	11
336 | 404	12
337 | 405	15
338 | 406	17
339 | 407	13
340 | 408	13
341 | 409	21
342 | 410	9
343 | 411	13
344 | 412	8
345 | 413	11
346 | 414	10
347 | 415	16
348 | 416	10
349 | 417	7
350 | 418	12
351 | 419	12
352 | 420	13
353 | 421	8
354 | 422	9
355 | 423	11
356 | 424	10
357 | 425	10
358 | 426	18
359 | 427	11
360 | 428	9
361 | 429	9
362 | 430	11
363 | 431	10
364 | 432	8
365 | 433	9
366 | 434	7
367 | 435	14
368 | 436	6
369 | 437	9
370 | 438	11
371 | 439	6
372 | 440	9
373 | 441	6
374 | 442	5
375 | 443	6
376 | 444	6
377 | 445	7
378 | 446	8
379 | 447	8
380 | 448	7
381 | 449	7
382 | 450	7
383 | 451	6
384 | 452	8
385 | 453	3
386 | 454	7
387 | 455	6
388 | 456	7
389 | 457	6
390 | 458	8
391 | 459	7
392 | 460	5
393 | 461	1
394 | 462	4
395 | 463	7
396 | 464	5
397 | 465	4
398 | 466	1
399 | 467	4
400 | 468	7
401 | 469	3
402 | 470	2
403 | 471	4
404 | 473	5
405 | 474	3
406 | 475	4
407 | 476	5
408 | 477	5
409 | 478	2
410 | 479	3
411 | 480	8
412 | 481	1
413 | 483	1
414 | 484	3
415 | 485	4
416 | 486	1
417 | 487	3
418 | 488	4
419 | 489	4
420 | 490	3
421 | 491	3
422 | 492	3
423 | 493	2
424 | 494	2
425 | 495	7
426 | 497	2
427 | 498	4
428 | 499	1
429 | 500	2
430 | 501	2
431 | 502	2
432 | 503	3
433 | 504	2
434 | 505	1
435 | 506	2
436 | 507	1
437 | 508	1
438 | 509	1
439 | 510	4
440 | 512	1
441 | 513	1
442 | 515	1
443 | 516	1
444 | 517	3
445 | 518	1
446 | 520	2
447 | 521	1
448 | 522	1
449 | 523	2
450 | 524	1
451 | 525	2
452 | 527	2
453 | 528	2
454 | 529	1
455 | 530	2
456 | 531	2
457 | 532	1
458 | 533	2
459 | 534	1
460 | 535	2
461 | 536	1
462 | 537	1
463 | 538	2
464 | 540	1
465 | 541	1
466 | 542	1
467 | 543	1
468 | 545	1
469 | 546	1
470 | 548	1
471 | 550	1
472 | 552	1
473 | 553	2
474 | 555	2
475 | 556	1
476 | 560	1
477 | 561	1
478 | 563	2
479 | 564	1
480 | 571	1
481 | 572	2
482 | 581	1
483 | 584	1
484 | 585	1
485 | 587	2
486 | 588	1
487 | 590	1
488 | 591	1
489 | 594	1
490 | 595	2
491 | 597	1
492 | 603	1
493 | 607	1
494 | 612	1
495 | 617	1
496 | 618	1
497 | 622	1
498 | 627	1
499 | 628	1
500 | 631	1
501 | 632	1
502 | 638	1
503 | 643	1
504 | 656	1
505 | 667	1
506 | 669	1
507 | 683	1
508 | 708	1
509 | 743	1
510 | 747	1
511 | 846	1
512 | 877	1
513 | 921	1
514 | 941	1
515 | 943	1
516 | 1065	1
517 | 1396	1
518 | 1559	1
519 | 1875	1
520 | 2074	1
521 | 


--------------------------------------------------------------------------------
/test_data/downsampled.output/downsampled.bam.fragmentSizes.txt:
--------------------------------------------------------------------------------
  1 | Fragment Size	Count
  2 | 48	1
  3 | 49	1
  4 | 53	1
  5 | 60	1
  6 | 68	1
  7 | 69	1
  8 | 76	1
  9 | 77	37
 10 | 78	37
 11 | 79	39
 12 | 80	48
 13 | 81	56
 14 | 82	48
 15 | 83	66
 16 | 84	61
 17 | 85	56
 18 | 86	64
 19 | 87	65
 20 | 88	82
 21 | 89	85
 22 | 90	117
 23 | 91	127
 24 | 92	126
 25 | 93	148
 26 | 94	143
 27 | 95	165
 28 | 96	152
 29 | 97	166
 30 | 98	170
 31 | 99	197
 32 | 100	211
 33 | 101	224
 34 | 102	311
 35 | 103	320
 36 | 104	350
 37 | 105	318
 38 | 106	343
 39 | 107	381
 40 | 108	363
 41 | 109	392
 42 | 110	419
 43 | 111	474
 44 | 112	464
 45 | 113	531
 46 | 114	566
 47 | 115	574
 48 | 116	522
 49 | 117	526
 50 | 118	521
 51 | 119	552
 52 | 120	590
 53 | 121	663
 54 | 122	633
 55 | 123	649
 56 | 124	696
 57 | 125	711
 58 | 126	676
 59 | 127	611
 60 | 128	640
 61 | 129	642
 62 | 130	620
 63 | 131	644
 64 | 132	682
 65 | 133	704
 66 | 134	721
 67 | 135	685
 68 | 136	672
 69 | 137	606
 70 | 138	618
 71 | 139	590
 72 | 140	623
 73 | 141	699
 74 | 142	670
 75 | 143	711
 76 | 144	678
 77 | 145	648
 78 | 146	683
 79 | 147	581
 80 | 148	579
 81 | 149	578
 82 | 150	546
 83 | 151	572
 84 | 152	551
 85 | 153	571
 86 | 154	589
 87 | 155	573
 88 | 156	594
 89 | 157	542
 90 | 158	527
 91 | 159	507
 92 | 160	504
 93 | 161	511
 94 | 162	470
 95 | 163	560
 96 | 164	478
 97 | 165	477
 98 | 166	558
 99 | 167	481
100 | 168	514
101 | 169	461
102 | 170	469
103 | 171	476
104 | 172	456
105 | 173	439
106 | 174	442
107 | 175	453
108 | 176	487
109 | 177	422
110 | 178	390
111 | 179	422
112 | 180	405
113 | 181	402
114 | 182	399
115 | 183	389
116 | 184	411
117 | 185	402
118 | 186	395
119 | 187	374
120 | 188	371
121 | 189	364
122 | 190	345
123 | 191	374
124 | 192	333
125 | 193	338
126 | 194	319
127 | 195	337
128 | 196	307
129 | 197	374
130 | 198	325
131 | 199	341
132 | 200	293
133 | 201	306
134 | 202	315
135 | 203	290
136 | 204	329
137 | 205	287
138 | 206	291
139 | 207	329
140 | 208	327
141 | 209	303
142 | 210	288
143 | 211	276
144 | 212	278
145 | 213	261
146 | 214	300
147 | 215	254
148 | 216	260
149 | 217	286
150 | 218	267
151 | 219	263
152 | 220	236
153 | 221	242
154 | 222	244
155 | 223	231
156 | 224	242
157 | 225	245
158 | 226	225
159 | 227	264
160 | 228	228
161 | 229	236
162 | 230	226
163 | 231	234
164 | 232	227
165 | 233	219
166 | 234	206
167 | 235	197
168 | 236	229
169 | 237	191
170 | 238	197
171 | 239	205
172 | 240	190
173 | 241	180
174 | 242	190
175 | 243	177
176 | 244	180
177 | 245	181
178 | 246	194
179 | 247	179
180 | 248	188
181 | 249	175
182 | 250	174
183 | 251	161
184 | 252	167
185 | 253	169
186 | 254	171
187 | 255	155
188 | 256	143
189 | 257	181
190 | 258	169
191 | 259	171
192 | 260	166
193 | 261	149
194 | 262	144
195 | 263	139
196 | 264	144
197 | 265	140
198 | 266	153
199 | 267	160
200 | 268	144
201 | 269	155
202 | 270	130
203 | 271	138
204 | 272	122
205 | 273	135
206 | 274	112
207 | 275	143
208 | 276	131
209 | 277	105
210 | 278	113
211 | 279	122
212 | 280	118
213 | 281	133
214 | 282	121
215 | 283	114
216 | 284	108
217 | 285	88
218 | 286	99
219 | 287	95
220 | 288	121
221 | 289	93
222 | 290	112
223 | 291	92
224 | 292	88
225 | 293	115
226 | 294	110
227 | 295	87
228 | 296	84
229 | 297	73
230 | 298	100
231 | 299	86
232 | 300	100
233 | 301	102
234 | 302	81
235 | 303	98
236 | 304	75
237 | 305	88
238 | 306	73
239 | 307	80
240 | 308	69
241 | 309	65
242 | 310	70
243 | 311	79
244 | 312	64
245 | 313	62
246 | 314	71
247 | 315	82
248 | 316	73
249 | 317	62
250 | 318	66
251 | 319	62
252 | 320	70
253 | 321	66
254 | 322	62
255 | 323	74
256 | 324	59
257 | 325	48
258 | 326	53
259 | 327	58
260 | 328	50
261 | 329	52
262 | 330	56
263 | 331	50
264 | 332	52
265 | 333	63
266 | 334	50
267 | 335	50
268 | 336	38
269 | 337	49
270 | 338	38
271 | 339	46
272 | 340	45
273 | 341	46
274 | 342	43
275 | 343	58
276 | 344	45
277 | 345	50
278 | 346	43
279 | 347	54
280 | 348	45
281 | 349	40
282 | 350	42
283 | 351	47
284 | 352	32
285 | 353	39
286 | 354	32
287 | 355	33
288 | 356	30
289 | 357	34
290 | 358	32
291 | 359	22
292 | 360	40
293 | 361	30
294 | 362	42
295 | 363	25
296 | 364	38
297 | 365	29
298 | 366	36
299 | 367	39
300 | 368	24
301 | 369	29
302 | 370	24
303 | 371	21
304 | 372	31
305 | 373	21
306 | 374	22
307 | 375	25
308 | 376	19
309 | 377	21
310 | 378	20
311 | 379	21
312 | 380	23
313 | 381	17
314 | 382	25
315 | 383	22
316 | 384	20
317 | 385	26
318 | 386	17
319 | 387	21
320 | 388	23
321 | 389	18
322 | 390	22
323 | 391	13
324 | 392	19
325 | 393	23
326 | 394	29
327 | 395	17
328 | 396	25
329 | 397	20
330 | 398	23
331 | 399	16
332 | 400	20
333 | 401	7
334 | 402	12
335 | 403	11
336 | 404	12
337 | 405	15
338 | 406	17
339 | 407	13
340 | 408	13
341 | 409	21
342 | 410	9
343 | 411	13
344 | 412	8
345 | 413	11
346 | 414	10
347 | 415	16
348 | 416	10
349 | 417	7
350 | 418	12
351 | 419	12
352 | 420	13
353 | 421	8
354 | 422	9
355 | 423	11
356 | 424	10
357 | 425	10
358 | 426	18
359 | 427	11
360 | 428	9
361 | 429	9
362 | 430	11
363 | 431	10
364 | 432	8
365 | 433	9
366 | 434	7
367 | 435	14
368 | 436	6
369 | 437	9
370 | 438	11
371 | 439	6
372 | 440	9
373 | 441	6
374 | 442	5
375 | 443	6
376 | 444	6
377 | 445	7
378 | 446	8
379 | 447	8
380 | 448	7
381 | 449	7
382 | 450	7
383 | 451	6
384 | 452	8
385 | 453	3
386 | 454	7
387 | 455	6
388 | 456	7
389 | 457	6
390 | 458	8
391 | 459	7
392 | 460	5
393 | 461	1
394 | 462	4
395 | 463	7
396 | 464	5
397 | 465	4
398 | 466	1
399 | 467	4
400 | 468	7
401 | 469	3
402 | 470	2
403 | 471	4
404 | 473	5
405 | 474	3
406 | 475	4
407 | 476	5
408 | 477	5
409 | 478	2
410 | 479	3
411 | 480	8
412 | 481	1
413 | 483	1
414 | 484	3
415 | 485	4
416 | 486	1
417 | 487	3
418 | 488	4
419 | 489	4
420 | 490	3
421 | 491	3
422 | 492	3
423 | 493	2
424 | 494	2
425 | 495	7
426 | 497	2
427 | 498	4
428 | 499	1
429 | 500	2
430 | 501	2
431 | 502	2
432 | 503	3
433 | 504	2
434 | 505	1
435 | 506	2
436 | 507	1
437 | 508	1
438 | 509	1
439 | 510	4
440 | 512	1
441 | 513	1
442 | 515	1
443 | 516	1
444 | 517	3
445 | 518	1
446 | 520	2
447 | 521	1
448 | 522	1
449 | 523	2
450 | 524	1
451 | 525	2
452 | 527	2
453 | 528	2
454 | 529	1
455 | 530	2
456 | 531	2
457 | 532	1
458 | 533	2
459 | 534	1
460 | 535	2
461 | 536	1
462 | 537	1
463 | 538	2
464 | 540	1
465 | 541	1
466 | 542	1
467 | 543	1
468 | 545	1
469 | 546	1
470 | 548	1
471 | 550	1
472 | 552	1
473 | 553	2
474 | 555	2
475 | 556	1
476 | 560	1
477 | 561	1
478 | 563	2
479 | 564	1
480 | 571	1
481 | 572	2
482 | 581	1
483 | 584	1
484 | 585	1
485 | 587	2
486 | 588	1
487 | 590	1
488 | 591	1
489 | 594	1
490 | 595	2
491 | 597	1
492 | 603	1
493 | 607	1
494 | 612	1
495 | 617	1
496 | 618	1
497 | 622	1
498 | 627	1
499 | 628	1
500 | 631	1
501 | 632	1
502 | 638	1
503 | 643	1
504 | 656	1
505 | 667	1
506 | 669	1
507 | 683	1
508 | 708	1
509 | 743	1
510 | 747	1
511 | 846	1
512 | 877	1
513 | 921	1
514 | 941	1
515 | 943	1
516 | 1065	1
517 | 1396	1
518 | 1559	1
519 | 1875	1
520 | 2074	1
521 | 


--------------------------------------------------------------------------------
/python/rnaseqc/aggregate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pandas as pd
  3 | import numpy as np
  4 | import argparse
  5 | import glob
  6 | import gzip
  7 | import os
  8 | 
  9 | 
 10 | def combine_gcts(path_dict, verbose=True):
 11 |     """Aggregate single-sample GCT files."""
 12 | 
 13 |     sample_ids = sorted(path_dict)
 14 | 
 15 |     # load first sample and determine dtype
 16 |     sample_id = sample_ids[0]
 17 |     df = pd.read_csv(path_dict[sample_id], sep='\t', skiprows=3, header=None,
 18 |                      index_col=0, names=['Name','Description', sample_id])
 19 |     if df[sample_id].dtype == np.float64:
 20 |         dtype = np.float32
 21 |     elif df[sample_id].dtype == np.int64:
 22 |         dtype = np.int32
 23 |     else:
 24 |         dtype = df[sample_id].dtype.type
 25 | 
 26 |     # allocate
 27 |     gct_df = pd.DataFrame(0, index=df.index, columns=['Description']+list(sample_ids), dtype=dtype)
 28 |     gct_df['Description'] = df['Description']
 29 |     gct_df[sample_id] = df[sample_id].astype(dtype)
 30 | 
 31 |     for k,sample_id in enumerate(sample_ids[1:], 2):
 32 |         if verbose:
 33 |             print(f'\r  * loading GCT {k}/{len(path_dict)}', end='', flush=True)
 34 |         df = pd.read_csv(path_dict[sample_id], sep='\t', skiprows=3, header=None,
 35 |                          usecols=[0,2],  index_col=0, names=['Name', sample_id],
 36 |                          dtype={'Name':str, sample_id:dtype})
 37 |         gct_df[sample_id] = df[sample_id]
 38 |     if verbose:
 39 |         print()
 40 | 
 41 |     return gct_df
 42 | 
 43 | 
 44 | def write_gct(df, gct_file, float_format='%.6g', compresslevel=6):
 45 |     """Write pd.DataFrame to GCT format"""
 46 | 
 47 |     assert df.index.name == 'Name' and df.columns[0] == 'Description'
 48 | 
 49 |     if gct_file.endswith('.gct.gz'):
 50 |         opener = gzip.open(gct_file, 'wt', compresslevel=compresslevel)
 51 |     else:
 52 |         opener = open(gct_file, 'w')
 53 | 
 54 |     with opener as gct:
 55 |         gct.write(f'#1.2\n{df.shape[0]:d}\t{df.shape[1]-1:d}\n')
 56 |         df.to_csv(gct, sep='\t', float_format=float_format)
 57 | 
 58 | 
 59 | def combine_metrics(path_dict):
 60 |     """Aggregate single-sample metrics files."""
 61 |     metrics_df = []
 62 |     for k,sample_id in enumerate(sorted(path_dict), 1):
 63 |         metrics_df.append(pd.read_csv(path_dict[sample_id], sep='\t', index_col=0, dtype=str))
 64 |     metrics_df = pd.concat(metrics_df, axis=1).T
 65 |     metrics_df.index.name = 'sample_id'
 66 |     return metrics_df
 67 | 
 68 | 
 69 | def combine_distributions(path_dict):
 70 |     """Aggregate single-sample insert sizes distributions."""
 71 |     distr_df = []
 72 |     for k,sample_id in enumerate(sorted(path_dict), 1):
 73 |         distr_df.append(pd.read_csv(path_dict[sample_id], sep='\t', index_col=0).squeeze('columns').rename(sample_id))
 74 |     distr_df = pd.concat(distr_df, axis=1).fillna(0).astype(np.int32)
 75 |     return distr_df
 76 | 
 77 | 
 78 | if __name__ == '__main__':
 79 |     parser = argparse.ArgumentParser(description='Aggregate RNA-SeQC outputs')
 80 |     parser.add_argument('results_dir', help='Directory containing RNA-SeQC outputs for all samples to be combined.')
 81 |     parser.add_argument('prefix', help='Prefix for output files, e.g., <prefix>.gct.gz')
 82 |     parser.add_argument('--parquet', action='store_true', help='Write to parquet format instead of GCT')
 83 |     parser.add_argument('-o', '--output-dir', default='.', help='Output directory')
 84 |     args = parser.parse_args()
 85 | 
 86 |     if not os.path.isdir(args.output_dir):
 87 |         os.makedirs(args.output_dir)
 88 | 
 89 |     # if os.path.isdir(args.results):
 90 |     gene_reads_gcts =  {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gene_reads.gct*'), recursive=True)}
 91 |     gene_fragm_gcts =  {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gene_fragments.gct*'), recursive=True)}
 92 |     gene_tpm_gcts =    {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gene_tpm.gct*'), recursive=True)}
 93 |     exon_reads_gcts =  {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*exon_reads.gct*'), recursive=True)}
 94 |     metrics_files =    {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*metrics.tsv*'), recursive=True)}
 95 |     insertsize_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*fragmentSizes.txt*'), recursive=True)}
 96 |     gc_content_files = {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results_dir, '**/*gc_content.tsv*'), recursive=True)}
 97 |     # coverage files don't get aggregated
 98 |     # coverage_files =   {os.path.basename(i).split('.')[0]:i for i in glob.glob(os.path.join(args.results, '*coverage.tsv*'))}
 99 | 
100 |     if len(metrics_files) > 0:
101 |         print('Aggregating metrics')
102 |         metrics_df = combine_metrics(metrics_files)
103 |         metrics_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.metrics.txt.gz'), sep='\t')
104 | 
105 |     if len(insertsize_files) > 0:
106 |         print('Aggregating insert size distributions')
107 |         insertsize_df = combine_distributions(insertsize_files)
108 |         insertsize_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.insert_size_hists.txt.gz'), sep='\t')
109 | 
110 |     if len(gc_content_files) > 0:
111 |         print('Aggregating GC content distributions')
112 |         gc_df = combine_distributions(gc_content_files)
113 |         gc_df.to_csv(os.path.join(args.output_dir, f'{args.prefix}.gc_content_hists.txt.gz'), sep='\t')
114 | 
115 |     if len(gene_reads_gcts) > 0:
116 |         print('Aggregating read count GCTs')
117 |         gct_df = combine_gcts(gene_reads_gcts)
118 |         if args.parquet:
119 |             gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_reads.parquet'))
120 |         else:
121 |             write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.gene_reads.gct.gz'))
122 | 
123 |     if len(gene_fragm_gcts) > 0:
124 |         print('Aggregating fragment count GCTs')
125 |         gct_df = combine_gcts(gene_fragm_gcts)
126 |         if args.parquet:
127 |             gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_fragments.parquet'))
128 |         else:
129 |             write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.gene_fragments.gct.gz'))
130 | 
131 |     if len(gene_tpm_gcts) > 0:
132 |         print('Aggregating TPM GCTs')
133 |         gct_df = combine_gcts(gene_tpm_gcts)
134 |         if args.parquet:
135 |             gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.gene_tpm.parquet'))
136 |         else:
137 |             write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.gene_tpm.gct.gz'))
138 | 
139 |     if len(exon_reads_gcts) > 0:
140 |         print('Aggregating exon read count GCTs')
141 |         gct_df = combine_gcts(exon_reads_gcts)
142 |         if args.parquet:
143 |             gct_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.exon_reads.parquet'))
144 |         else:
145 |             write_gct(gct_df, os.path.join(args.output_dir, f'{args.prefix}.exon_reads.gct.gz'))
146 | 


--------------------------------------------------------------------------------
/Metrics.md:
--------------------------------------------------------------------------------
 1 | # RNA-SeQC Output Metrics
 2 | 
 3 | This file provides a description for each of the output metrics in the `metrics.tsv` file. A description of other output files can be found at the bottom
 4 | 
 5 | ## Output Metrics
 6 | * Mapping Rate: The proportion of all reads in the Bam which were Mapped, and not Secondary Alignments or Platform/Vendor QC Failing reads ("Mapped Reads").
 7 | In legacy mode, this is the proportion of all reads which were Mapped out
 8 | of all reads which were not Secondary Alignments or Platform/Vendor QC Failing reads.
 9 | * Unique Rate of Mapped: This is the proportion of reads which **were not** marked as PCR/Optical Duplicates out of all "Mapped Reads" (as defined above; excludes Secondary and Vendor QC Failed reads).
10 | * Duplicate Rate of Mapped: This is the proportion of all reads which **were** marked as PCR/Optical Duplicates out of all "Mapped Reads" (as defined above; excludes Secondary and Vendor QC Failed reads). This is complementary to the "Unique Rate of Mapped".
11 | * Duplicate Rate of Mapped, excluding Globins: This is similar to the "Duplicate Rate of Mapped" except that it only includes reads which **did not** align to _HBA1_, _HBA2_, _HBB_, or _HBD_.
12 | * Base Mismatch: The total number of mismatched bases (as determined by the "NM" tag) of all "Mapped Reads" (as defined above) divided by the total aligned length of all "Mapped Reads".
13 | * End 1 & 2 Mapping Rate: The proportion of Paired reads which were marked as First or Second in the pair, respectively, out of all "Mapped Reads" (above).
14 | * End 1 & 2 Mismatch Rate: The proportion of mismatched bases (as determined by the "NM" tag) belonging to First or Second mates, divided by the total aligned length of all "Mapped" (above) First or Second mates, respectively.
15 | * Expression Profiling Efficiency: The proportion of "Exonic Reads" (see "Exonic Rate", below) out of all reads which were not Secondary Alignments or
16 | Platform/Vendor QC Failing reads.
17 | * High Quality Rate: The proportion of **properly paired** reads with less than 6 mismatched bases and a perfect mapping quality out of all "Mapped Reads" (above).
18 | * Exonic Rate: The proportion of "Mapped Reads" (above) for which all aligned segments unambiguously aligned to exons of the same gene.
19 | * Intronic Rate: The proportion of "Mapped Reads" (above) for which all aligned segments unambiguously aligned to the same gene, but none of which _intersected_ any exons of the gene.
20 | * Intergenic Rate: The proportion of "Mapped Reads" (above) for which none of the aligned segments _intersected_ any genes.
21 | * Intragenic Rate: The sum of "Exonic" and "Intronic" rates (see "Exonic Rate" and "Intronic Rate" above)
22 | * Ambiguous Alignment Rate: The proportion of "Mapped Reads" (above) where the aligned segments unambiguously aligned to exons of more than one gene.
23 | * High Quality Exonic, Intronic, Intergenic, Intragenic, and Ambiguous Alignment Rates: The proportion of "Exonic Reads", "Intronic Reads", "Intragenic Reads", "Intergenic Reads", and "Ambiguous Reads" (see rates above) out of "High Quality Reads" only (as defined in "High Quality Rate", above)
 24 | * Discard Rate: The proportion of "Mapped Reads" (above) which were discarded and not checked against the reference annotation. In most cases this should be 0, however, this will include reads which were discarded by additional command line flags (such as `--exclude-chimeric` or `--tag`) or extra legacy mode filters. "Exonic Rate", "Intronic Rate", "Intergenic Rate", "Ambiguous Alignment Rate" and "Discard Rate" will sum to 1.
25 | * rRNA Rate: The proportion of "Mapped Reads" (above) which at least partially intersected with an annotated rRNA gene. This is **not** complementary to any other rates.
26 | * End 1 & 2 Sense Rate: The proportion of First or Second Mate reads which intersected with a Sense Strand feature out of all First or Second
27 | Mate reads which intersected with any features, respectively.
28 | * Avg. Splits per Read: The average number of gaps or deletions present in "Mapped Reads" (above). This is generally not an important metric, but may indicate aligner errors if the value is too high.
29 | * Raw Counts: All raw counts used in any metrics are reported here in the file
30 | * Read Length: The longest aligned length observed in any read
31 | * Genes Detected: The number of genes which had at least 5 unambiguous reads. The detection threshold can be changed with `--detection-threshold`
32 | * Estimated Library Complexity: An estimation of the number of unique cDNA fragments present in the library. This computation follows the same formula as Picard EstimateLibraryComplexity
 33 | * 3' Bias statistics (Mean, Median, Std Deviation, Median Absolute Deviation, 25th percentile, 75th percentile): These aggregate statistics are based on the total coverage in 100 bp windows on both the 3' and 5' ends of a gene. The windows are both offset 150 bases into the gene. This computation is only performed on genes at least 600bp long and with at least 5 unambiguous reads. These thresholds can be changed with `--offset`, `--window-size`, `--gene-length`, and `--detection-threshold`. A gene with even coverage in both its 3' and 5' windows would have a bias of 0.5; bias near 1 or 0 may indicate degradation
 34 | * Fragment Length Statistics (Mean, Median, Std Deviation, and Median Absolute Deviation): These aggregate statistics are based on the insert sizes observed in "High Quality" (above) read pairs. These metrics are only present if a Bed file was provided with the `--bed` option. Only the first 1,000,000 "High Quality" pairs, where both mates map to the same Bed interval are used.
35 | * Median of Transcript Coverage statistics (Mean, Std Deviation, Coefficient of Variation): These statistics are the median of a given aggregate statistic of transcript coverage (for example, the median of mean transcript coverage). Transcript coverage is computed by dropping the first and last 500bp of each gene and measuring the **"High Quality"** (above) coverage over the remainder of the gene.
36 | * Median Exon CV: The median coefficient of variation of exon coverage. Exon coverage is computed by dropping the first and last 500bp of each gene and measuring the **"High Quality"** (above) coverage over the remainder of the exons. This is considered a good metric for sample quality. A lower value indicates more consistent coverage over exons.
37 | * Exon CV MAD: The Median Absolute Deviation over all Exon CVs
38 | 
39 | **Note**: When running in `--unpaired` mode, single-ended bams will report `nan` for all End 1 and End 2 metrics
40 | 
41 | ### Fragment Sizes File
42 | 
 43 | This file contains the raw counts of the observed insert sizes of the sample. Fragment sizes are only measured if a Bed file is provided with the `--bed` option. This file is stored as a histogram, with the first column recording a given observed size, and the second column recording the number of occurrences of that particular size.
44 | 
45 | ### Coverage File
46 | 
47 | This file contains coverage data for all genes. Coverage computations are always performed, but this file of per-gene coverage data is not produced unless
48 | the `--coverage` flag is provided. The first column contains the gene ID as given by the input annotation. The next three columns contain the mean, standard deviation, and coefficient of variation of coverage for each gene, respectively. The first and last 500bp of each gene are dropped and not considered when computing coverage. A value of 0 or `nan` may indicate that the gene's coding length was less than 1kb or that the gene had 0 coverage
 49 | over its exons.
50 | 
51 | ## Migrating between old and new columns
52 | 
53 | For users of the legacy tool, several metrics have been renamed, removed, or changed.
54 | Below is a table of previous metrics and how to access them using the new metrics names:
55 | 
56 | Old Metric | New Metric | Notes
57 | -|-|-
58 | Base Mismatch Rate | Base Mismatch |
59 | Duplication Rate of Mapped | Duplicate Rate of Mapped |
60 | End 1/2 % Sense | End 1/2 Sense Rate |
 61 | Estimated Library Size | Estimated Library Complexity |
62 | Failed Vendor QC Check | Failed Vendor QC |
63 | Fragment Length Mean | Average Fragment Length | The fragment length metrics have changed significantly
64 | Fragment Length StdDev | Fragment Length Std
 65 | Intragenic Rate | Intragenic Rate | Some reads previously classified as `Intragenic` are now classified as `Ambiguous Alignments`. The equivalent of the old `Intragenic Rate` can be computed by summing `Intragenic Rate` + `Ambiguous Alignment Rate`
66 | Mapped | Mapped Reads |
67 | Mapped Unique | Mapped Unique Reads |
68 | Total Purity Filtered Reads Sequenced | Unique Mapping, Vendor QC Passed Reads | This counts reads without the Secondary or QC Fail flags set. For a true count of total alignments use `Total Reads`
69 | 


--------------------------------------------------------------------------------
/src/Fasta.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  Fasta.cpp
  3 | //  IntervalTree
  4 | //
  5 | //  Created by Aaron Graubert on 5/23/18.
  6 | //  Copyright © 2018 Aaron Graubert. All rights reserved.
  7 | //
  8 | 
  9 | #include "Fasta.h"
 10 | #include <boost/filesystem.hpp>
 11 | #include <cmath>
 12 | #include <utility>
 13 | 
 14 | namespace rnaseqc {
 15 |     std::map<std::string, chrom> chromosomes;
 16 |     
 17 |     chrom chromosomeMap(std::string chr)
 18 |     {
 19 |         auto entry = chromosomes.find(chr);
 20 |         if (entry == chromosomes.end())
 21 |         {
 22 |             chromosomes[chr] = chromosomes.size() + 1u;
 23 |         }
 24 |         return chromosomes[chr];
 25 |     }
 26 |     
 27 |     //Given an internal chromosome ID, get the name it corresponds to
 28 |     std::string getChromosomeName(chrom idx)
 29 |     {
 30 |         for (auto entry = chromosomes.begin(); entry != chromosomes.end(); ++entry) if (entry->second == idx) return entry->first;
 31 |         throw invalidContigException("Invalid chromosome index");
 32 |     }
 33 |     
 34 |     //Get reverse complement of a sequence
 35 |     void complement(std::string &sequence)
 36 |     {
 37 |         std::string tmp = sequence;
 38 |         auto src = sequence.rbegin();
 39 |         for(unsigned int i = 0; src != sequence.rend() && i < tmp.length(); ++src, ++i)
 40 |         {
 41 |             switch(*src)
 42 |             {
 43 |                 case 'A':
 44 |                 case 'a':
 45 |                     tmp[i] = 'T';
 46 |                     break;
 47 |                 case 'T':
 48 |                 case 't':
 49 |                     tmp[i] = 'A';
 50 |                     break;
 51 |                 case 'C':
 52 |                 case 'c':
 53 |                     tmp[i] = 'G';
 54 |                     break;
 55 |                 case 'G':
 56 |                 case 'g':
 57 |                     tmp[i] = 'C';
 58 |                     break;
 59 |                 default:
 60 |                     tmp[i] = *src;
 61 |             }
 62 |         }
 63 |         sequence.swap(tmp);
 64 |     }
 65 |     
 66 |     //Count GC content in a sequence
 67 |     double gc(std::string &sequence)
 68 |     {
 69 |         if (sequence.length() == 0) return -1;
 70 |         double content = 0.0, size = static_cast<double>(sequence.length());
 71 |         for (auto base = sequence.begin(); base != sequence.end(); ++base)
 72 |             if (*base == 'G' || *base == 'g' || *base == 'C' || *base == 'c') content += 1.0/size;
 73 |         return content;
 74 |     }
 75 |     
 76 |     // Open a fasta file
 77 |     void Fasta::open(std::string &filename)
 78 |     {
 79 |         this->_open = true;
 80 |         this->reader.open(filename);
 81 |         if (!this->reader.is_open())
 82 |         {
 83 |             throw fileException("Unable to open reference fasta: " +filename);
 84 |         }
 85 |         std::string index_path = filename + ".fai";
 86 |         // Check if the index exists at filepath.fai
 87 |         if (boost::filesystem::exists(boost::filesystem::path(filename).replace_extension(".fai")))
 88 |             index_path = boost::filesystem::path(filename).replace_extension(".fai").string();
 89 |         // otherwise fail if the index doesn't exist at filepath.fasta.fai
 90 |         else if (!boost::filesystem::exists(index_path)) throw fileException("Unable to locate fasta index: " + filename);
 91 |         // Import chromosome names from index
 92 |         std::vector<std::string> contigs = bioio::read_fasta_index_contig_names(index_path);
 93 |         for (auto contig = contigs.begin(); contig != contigs.end(); ++contig) chromosomeMap(*contig);
 94 |         // Then allow bioio to parse the index
 95 |         bioio::FastaIndex tmp_index = bioio::read_fasta_index(index_path);
 96 |         for (auto entry = tmp_index.begin(); entry != tmp_index.end(); ++entry) this->contigIndex[chromosomeMap(entry->first)] = entry->second;
 97 |         if (!this->contigIndex.size()) throw fileException("No contigs found in fasta index: " + index_path);
 98 |     }
 99 |     
100 |     //Get a forward strand sequence {contig}:{start}-{end}
101 |     std::string Fasta::getSeq(chrom contig, coord start, coord end)
102 |     {
103 |         return this->getSeq(contig, start, end, Strand::Forward);
104 |     }
105 | 
106 |     bool Fasta::isOpen() const {
107 |         return this->_open;
108 |     }
109 |     
    //Get a sequence {contig}:{start}-{end}, and optionally return its reverse complement
    // Returns an empty string when no fasta is open. Sequence is served from
    // PAGE_SIZE-long pages held in pageCache; pages not cached yet are read
    // from the fasta on demand.
    std::string Fasta::getSeq(chrom contig, coord start, coord end, Strand strand)
    {
        //NOTE: Coordinates must be 0-based, end-exclusive.
        if (!this->isOpen()) return "";
        std::string output;
        // Determine the coordinate for the start of the page which contains the start of this sequence
        coord pageOffset = (floor(start / PAGE_SIZE) * PAGE_SIZE);
        for (coord i = pageOffset; i < end; i+=PAGE_SIZE) //Iterate over pages until we have all the pages required
        {
            this->calls++; // Increment number of pages that were requested
            indexType page = this->pageForCoord(contig, i); // Get page index corresponding to this coordinate
            if (!this->pageCache.count(page)) // If that page isn't cached
            {
                this->misses++; // Increment number of pages that were actually read
                this->pageCache[page] = this->readSeq(contig, i); // Read page from fasta
            }
            // updateLRU takes `page` out of the LRU list before evicting,
            // so the page appended below is never the one dropped here.
            this->updateLRU(page); // Update the cache state
            output += this->pageCache[page]; // Append this page to the output
        }
        // The requested start lies at or beyond the end of what was fetched:
        // print diagnostics before the substr call below.
        if (start-pageOffset >= output.size())
        {
            std::cerr << "Unable to fetch sequence" << std::endl;
            std::cerr << "Target region (GTF+1):\t" << getChromosomeName(contig) << ":" << start+1 << "-" << end << std::endl;
            std::cerr << "# pages fetched:\t" << output.length() / PAGE_SIZE << std::endl;
            // NOTE(review): pageForContig(contig+1) throws invalidContigException
            // if contig+1 is not in the index — confirm this is acceptable here.
            std::cerr << "This contig page indices:\t[" << this->pageForContig(contig) << ", " << this->pageForContig(contig+1) << ")" << std::endl;
            std::cerr << "Sequence page indices:\t[" << this->pageForCoord(contig, start) << ", " << this->pageForCoord(contig, end) << "]" << std::endl;
        }
        // Extract desired sequence from the output (which is complete pages)
        // substr throws std::out_of_range if the offset is past the end of `output`.
        output = output.substr(start-pageOffset, end-start);
        if (strand == Strand::Reverse) complement(output);
        return output;
    }
143 |     
144 |     // Bump page to top of the LRU, dropping older pages as necessary
145 |     void Fasta::updateLRU(indexType page)
146 |     {
147 |         this->lru.remove(page);
148 |         while (this->lru.size() >= CACHE_SIZE)
149 |         {
150 |             this->pageCache.erase(this->lru.front());
151 |             this->lru.pop_front();
152 |         }
153 |         this->lru.push_back(page);
154 |     }
155 |     
    // Get page idx for position 0 of a contig
    // Throws invalidContigException if `contig` is not in contigIndex and its
    // page index has not been cached yet.
    indexType Fasta::pageForContig(chrom contig)
    {
        // NOTE: a function-local static is shared by every Fasta instance.
        static std::unordered_map<chrom, indexType> pageIndex; // Function caches the lookup table to save time
        if (!pageIndex.size()) pageIndex[0] = 0;
        if (pageIndex.count(contig)) return pageIndex[contig];
        if (!this->contigIndex.count(contig)) throw invalidContigException("No such contig: " + getChromosomeName(contig));
        // Walk down to the nearest lower contig whose first page is already known
        chrom firstContig = contig;
        for (; firstContig > 0; --firstContig) if (pageIndex.count(firstContig)) break;
        indexType idx = pageIndex[firstContig];
        // Add up the page counts of the contigs in between, caching each
        // contig's first page along the way.
        for (chrom i = firstContig; i < contig; ++i)
        {
            // NOTE(review): if contigIndex is a std::map-like container, operator[]
            // inserts a default entry for an `i` that is not indexed — confirm.
            idx += ceil(static_cast<double>(this->contigIndex[i].length)/PAGE_SIZE);
            pageIndex[i+1] = idx;
        }
        return idx;
    }
173 |     
174 |     // Read a full page starting at this position
175 |     std::string Fasta::readSeq(chrom contig, coord pos)
176 |     {
177 |         if (!this->contigIndex.count(contig)) throw invalidContigException("No such contig: " + getChromosomeName(contig));
178 |         return (bioio::read_fasta_contig(this->reader, this->contigIndex[contig], pos, PAGE_SIZE));
179 |     }
180 |     
181 |     indexType Fasta::pageForCoord(chrom contig, coord pos)
182 |     {
183 |         return this->pageForContig(contig) + floor(static_cast<double>(pos)/PAGE_SIZE);
184 |     }
185 |     
186 |     Fasta::~Fasta()
187 |     {
188 |         this->reader.close();
189 |         this->pageCache.clear();
190 |         if (this->misses) std::cerr << this->misses << " cache misses out of " << this->calls << " requests" << std::endl;
191 |     }
192 | 
193 |     bool Fasta::hasContig(chrom contig) const {
194 |         return this->contigIndex.count(contig);
195 |     }
196 | }
197 | 
198 | 


--------------------------------------------------------------------------------
/src/Metrics.h:
--------------------------------------------------------------------------------
  1 | //
  2 | //  Metrics.hpp
  3 | //  IntervalTree
  4 | //
  5 | //  Created by Aaron Graubert on 7/5/17.
  6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
  7 | //
  8 | 
  9 | #ifndef Metrics_h
 10 | #define Metrics_h
 11 | 
 12 | #include "GTF.h"
 13 | #include <map>
 14 | #include <cmath>
 15 | #include <fstream>
 16 | #include <string>
 17 | #include <vector>
 18 | #include <utility>
 19 | #include <tuple>
 20 | #include <list>
 21 | #include <unordered_set>
 22 | #include <iterator>
 23 | 
 24 | namespace rnaseqc {
 25 |     class Metrics;
 26 | }
 27 | 
 28 | std::ofstream& operator<<(std::ofstream&, rnaseqc::Metrics&);
 29 | 
 30 | namespace rnaseqc {
    // Store of named unsigned counters, with a stream operator for output.
    // Member functions are defined outside this header.
    class Metrics {
        // For storing arbitrary counters
        std::map<std::string, unsigned long> counter;
    public:
        Metrics() : counter(){};
        // presumably adds 1 to the named counter — verify in Metrics.cpp
        void increment(std::string);
        // presumably adds the given amount to the named counter — verify in Metrics.cpp
        void increment(std::string, int);
        // presumably returns the named counter's value — verify in Metrics.cpp
        unsigned long get(std::string);
        // presumably the ratio of two named counters — verify order in Metrics.cpp
        double frac(std::string, std::string);
        // Friend so the stream operator can read the private counter map
        friend std::ofstream& ::operator<<(std::ofstream&, Metrics&);
    };
 42 |     
    // Buffers per-read coverage until it is committed to a caller-owned map.
    // Member functions are defined outside this header.
    class Collector {
        // For temporarily holding coverage on a read before we're ready to commit that coverage to a gene
        std::map<std::string, std::vector<std::pair<std::string, double> > > data;
        // Destination map; the pointer is stored as given, not copied or owned
        std::map<std::string, double> *target;
        bool dirty;
        double total;
    public:
        // Starts with no buffered data, dirty == false and total == 0.0
        Collector(std::map<std::string, double> *dataTarget) : data(), target(dataTarget), dirty(false), total(0.0)
        {
            
        }
        // NOTE(review): argument meaning (gene ID, feature ID, coverage?) is not
        // visible in this header — confirm against Metrics.cpp
        void add(const std::string&, const std::string&, const double);
        void collect(const std::string&);
        void collectSingle(const std::string&); //for legacy exon detection
        bool queryGene(const std::string&);
        bool isDirty();
        double sum();
    };
 61 |     
    struct CoverageEntry {
        // Represents a single segment of aligned read bases for base-coverage computation
        coord offset; // where the segment starts — presumably relative to the feature; TODO confirm
        unsigned int length; // segment length — presumably in bases; TODO confirm
        std::string feature_id; // ID of the feature the segment is recorded against
    };
 68 |     
    // Accumulates per-gene 3' and 5' coverage for the bias statistic.
    // All four thresholds are fixed at construction.
    class BiasCounter {
        // For counting 3'/5' bias coverage
        const int offset;
        const int windowSize;
        const unsigned long geneLength;
        const unsigned int detectionThreshold;
        unsigned int countedGenes;
        // Per-gene coverage totals, keyed by a string ID (presumably the gene ID — TODO confirm)
        std::map<std::string, unsigned long> fiveEnd;
        std::map<std::string, unsigned long> threeEnd;
    public:
        // Stores the thresholds and starts with zero counted genes and empty totals
        BiasCounter(int offset, int windowSize, unsigned long geneLength, unsigned int detectionThreshold) : offset(offset), windowSize(windowSize), geneLength(geneLength), detectionThreshold(detectionThreshold), countedGenes(0), fiveEnd(), threeEnd()
        {
            
        }
        
        void computeBias(const Feature&, std::vector<unsigned long>&);
        unsigned int countGenes() const;
        double getBias(const std::string&);
        // Returns the detection threshold given at construction
        const unsigned int getThreshold() const {
            return this->detectionThreshold;
        }
    };
 91 | 
    // Per-exon summary values stored in BaseCoverage's exonCoverage map.
    struct ExonCoverage {
        double cv; // NOTE(review): presumably the coefficient of variation of the exon's coverage -- confirm in BaseCoverage::compute
        double gc; // NOTE(review): presumably the exon's GC content from the FASTA reader -- confirm in BaseCoverage::compute
    };
 96 |     
 97 |     class BaseCoverage {
 98 |         // For computing per-base coverage of genes
 99 |         Fasta& fastaReader;
100 |         std::map<std::string, std::vector<CoverageEntry> > cache; //GID -> Entry<EID> tmp cache as exon hits are recorded
101 |         std::map<std::string, std::vector<unsigned long> > coverage; //EID -> Coverage vector for exons still in window
102 |         std::map<std::string, ExonCoverage> exonCoverage;
103 |         std::ofstream writer;
104 |         const unsigned int mask_size;
105 |         std::list<double> geneMeans, geneStds, geneCVs;
106 |         BiasCounter &bias;
107 |         std::unordered_set<std::string> seen;
108 |         BaseCoverage(const BaseCoverage&) = delete; //No!
109 |     public:
110 |         BaseCoverage(Fasta& fasta, const std::string &filename, const unsigned int mask, bool openFile, BiasCounter &biasCounter) : fastaReader(fasta), coverage(), exonCoverage(), cache(), writer(openFile ? filename : "/dev/null"), mask_size(mask), geneMeans(), geneStds(), geneCVs(), bias(biasCounter), seen()
111 |         {
112 |             if ((!this->writer.is_open()) && openFile) throw std::runtime_error("Unable to open BaseCoverage output file");
113 |             this->writer << "gene_id\tcoverage_mean\tcoverage_std\tcoverage_CV" << std::endl;
114 |         }
115 |         
116 |         void add(const Feature&, const coord, const coord); //Adds to the cache
117 |         void commit(const std::string&); //moves one gene out of the cache and adds hits to exon coverage vector
118 |         void reset(); //Empties the cache
119 |         //    void clearCoverage(); //empties out data that won't be used
120 |         void compute(const Feature&); //Computes the per-base coverage for all transcripts in the gene
121 |         void close(); //Flush and close the ofstream
122 |         BiasCounter& getBiasCounter() const {
123 |             return this->bias;
124 |         }
125 |         const std::map<std::string, ExonCoverage>& getExonCoverage() const {
126 |             return this->exonCoverage;
127 |         }
128 |         std::list<double>& getGeneMeans() {
129 |             return this->geneMeans;
130 |         }
131 |         std::list<double>& getGeneStds() {
132 |             return this->geneStds;
133 |         }
134 |         std::list<double>& getGeneCVs() {
135 |             return this->geneCVs;
136 |         }
137 |     };
138 | 
139 |     template <typename T> void sortContainer(T &data) {
140 |         std::sort(data.begin(), data.end());
141 |     }
142 | 
143 |     template <typename T> void sortContainer(std::list<T> &data) {
144 |         data.sort();
145 |     }
146 |     
147 |     template <typename T> double computeMedian(unsigned long size, T &&iterator)
148 |     {
149 |         if (size <= 0) // Couldn't decide if it would make sense to just report a median of 0. This seemed safer
150 |             throw std::range_error("Cannot compute median of an empty list");
151 |         else if (size == 1)
152 |             return *iterator;
153 |         for (unsigned long midpoint = (size - 1) / 2; midpoint > 0; --midpoint) ++iterator;
154 |         if (size % 2)
155 |         {
156 |             double value = static_cast<double>(*(iterator++));
157 |             return (value + static_cast<double>(*iterator)) / 2.0;
158 |         }
159 |         return static_cast<double>(*iterator);
160 |     }
161 | 
    // Four doubles returned by getStatistics / getAdvancedStatistics.
    typedef std::tuple<double, double, double, double> statsTuple;
    
    // Named positions into a statsTuple. skew and kurt share slots 1 and 3
    // with med and mad: getStatistics returns (avg, median, std, MAD) while
    // getAdvancedStatistics returns (avg, skewness, std, kurtosis).
    enum StatIdx {avg = 0, med = 1, std = 2, mad = 3, skew = 1, kurt = 3};
165 |     
166 |     template <typename T>
167 |     statsTuple getStatistics(T &data) {
168 |         if (data.size()) {
169 |             double avg = 0.0, std = 0.0;
170 |             std::vector<double> deviations;
171 |             sortContainer(data);
172 |             const double size = data.size();
173 |             double median = computeMedian(size, data.begin());
174 |             for (auto element = data.begin(); element != data.end(); ++element) {
175 |                 avg += static_cast<double>(*element) / size;
176 |                 deviations.push_back(fabs(static_cast<double>(*element) - median));
177 |             }
178 |             sortContainer(deviations);
179 |             double medDev = computeMedian(deviations.size(), deviations.begin()) * 1.4826;
180 |             for (auto element = data.begin(); element != data.end(); ++element)
181 |                 std += pow(static_cast<double>(*element) - avg, 2.0) / size;
182 |             std = pow(std, 0.5);
183 |             return statsTuple(avg, median, std, medDev);
184 |         }
185 |         return statsTuple(NAN, NAN, NAN, NAN);
186 |     }
187 | 
    // Single-pass mean, skewness, standard deviation and excess kurtosis.
    //
    // Walks `data` once (without modifying it), updating the running mean and
    // the second, third and fourth central moment sums (m2, m3, m4) per element.
    // Returns (mean, skewness, population std, kurtosis - 3); positions match
    // StatIdx::avg / skew / std / kurt. An empty container yields four NaNs.
    // NOTE(review): when every element is equal m2 is 0, so the skewness and
    // kurtosis divisions are 0/0 -- callers should expect NaN there.
    template<typename T>
    statsTuple getAdvancedStatistics(T &data) {
        if (!data.size()) return statsTuple(NAN, NAN, NAN, NAN);
        double avg = 0.0, m2 = 0.0, m3 = 0.0, m4 = 0.0, count = 0.0;
        for (auto element = data.begin(); element != data.end(); ++element) {
            double prev_count = count++; // elements seen before this one
            double delta = static_cast<double>(*element) - avg;
            double delta_n = delta / count;
            double delta_n2 = delta_n * delta_n;
            double t = delta * delta_n * prev_count;
            avg += delta_n;
            // Order matters: m4 needs the old m3 and m2, m3 needs the old m2
            m4 += t * delta_n2 * (count*count - 3*count + 3) + 6*delta_n2*m2 - 4*delta_n*m3;
            m3 += t * delta_n * (count - 2) - 3 * delta_n * m2;
            m2 += t;
            
        }
        double std = pow(m2/count, 0.5); // population standard deviation
        return statsTuple(avg, m3 / count / pow(std, 3.0), std, (count * m4) / (m2 * m2) - 3);
    }
207 |     
    // Declarations only (extern): the objects themselves are defined in an implementation file.
    extern std::map<std::string, double> uniqueGeneCounts, geneCounts, exonCounts, geneFragmentCounts; //counters for read coverage of genes and exons
    extern std::map<std::string, std::unordered_set<std::string> > fragmentTracker; // tracks fragments encountered by each gene
210 | }
211 | 
212 | #endif /* Metrics_h */
213 | 


--------------------------------------------------------------------------------
/src/GTF.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  GTF.cpp
  3 | //  IntervalTree
  4 | //
  5 | //  Created by Aaron Graubert on 6/28/17.
  6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
  7 | //
  8 | 
  9 | #include "GTF.h"
 10 | #include <exception>
 11 | #include <stdexcept>
 12 | #include <unordered_set>
 13 | #include <boost/regex.hpp>
 14 | 
 15 | using std::ifstream;
 16 | using std::string;
 17 | using std::map;
 18 | 
 19 | namespace rnaseqc {
    // Namespace-level tables filled in as a side effect of parsing GTF records
    // with operator>> below.
    const string EXON_NAME = "exon";
    const boost::regex ribosomalPattern("rRNA"); //For recognizing features which are rRNAs
    // geneNames: feature ID -> gene_name attribute (falls back to gene_id).
    // geneSeqs is not written anywhere in this file.
    map<string, string> geneNames, geneSeqs;
    // geneLengths: gene ID -> end - start + 1 of the gene record.
    // geneCodingLengths: gene ID -> summed length of that gene's exon records.
map<string, coord> geneLengths, geneCodingLengths;
    // exon ID -> {chromosome, start, length}
    map<string, FeatureSpan> exonLengths;
    // Not written anywhere in this file.
    std::map<std::string, std::vector<std::string>> exonsForGene;
    // Gene and exon IDs in the order their records were parsed
    std::vector<std::string> geneList, exonList;
    // gene ID -> number of exon names synthesized so far for exons lacking an exon_id
    map<string, unsigned int> exon_names;
 28 |     
 29 |     
 30 |     ifstream& operator>>(ifstream &in, Feature &out)
 31 |     {
 32 |         static std::unordered_set<std::string> geneIds, exonIds;
 33 |         try{
 34 |             string line;
 35 |             while(getline(in, line))
 36 |             {
 37 |                 if(line[0] == '#') continue; //not a feature line
 38 |                 std::istringstream tokenizer(line);
 39 |                 //get chr#
 40 |                 string buffer;
 41 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse chromosome. Invalid GTF line: " + line);
 42 |                 out.chromosome = chromosomeMap(buffer);
 43 |                 //get track name
 44 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse track. Invalid GTF line: " + line);
 45 |                 //get feature type
 46 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse feature type. Invalid GTF line: " + line);
 47 |                 if (buffer == "exon") out.type = FeatureType::Exon;
 48 |                 else if (buffer == "gene") out.type = FeatureType::Gene;
 49 |                 else if (buffer == "transcript") out.type = FeatureType::Transcript;
 50 |                 else out.type = FeatureType::Other;
 51 |                 //get start pos
 52 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse start. Invalid GTF line: " + line);
 53 |                 out.start = std::stoull(buffer);
 54 |                 //get stop pos
 55 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse end. Invalid GTF line: " + line);
 56 |                 out.end = std::stoull(buffer);
 57 |                 //get score
 58 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse score. Invalid GTF line: " + line);
 59 |                 //get strand
 60 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse strand. Invalid GTF line: " + line);
 61 |                 switch(buffer[0])
 62 |                 {
 63 |                     case '+':
 64 |                         out.strand = Strand::Forward;
 65 |                         break;
 66 |                     case '-':
 67 |                         out.strand = Strand::Reverse;
 68 |                         break;
 69 |                     default:
 70 |                         out.strand = Strand::Unknown;
 71 |                 }
 72 |                 //get frame
 73 |                 if(!getline(tokenizer, buffer, '\t')) throw gtfException("Unable to parse frame. Invalid GTF line: " + line);
 74 |                 //get attributes
 75 |                 if(!getline(tokenizer, buffer)) throw gtfException("Unable to parse attributes. Invalid GTF line: " + line);
 76 |                 std::map<string, string> attributes;
 77 |                 parseAttributes(buffer, attributes);
 78 |                 if ( out.end < out.start)
 79 |                     std::cerr << "Bad feature range:" << out.start << " - " << out.end << std::endl;
 80 |                 if (out.type == FeatureType::Gene && attributes.find("gene_id") != attributes.end())
 81 |                 {
 82 |                     //Parse gene attributes
 83 |                     out.feature_id = attributes["gene_id"];
 84 |                     if (geneIds.count(out.feature_id)) throw gtfException(std::string("Detected non-unique Gene ID: "+out.feature_id));
 85 |                     geneIds.insert(out.feature_id);
 86 |                     geneLengths[out.feature_id] = out.end - out.start + 1;
 87 |                     geneList.push_back(attributes["gene_id"]);
 88 |                 }
 89 |                 if (out.type == FeatureType::Transcript && attributes.find("transcript_id") != attributes.end()) out.feature_id = attributes["transcript_id"];
 90 |                 if (attributes.find("gene_id") != attributes.end()) out.gene_id = attributes["gene_id"];
 91 |                 if (out.type == FeatureType::Exon)
 92 |                 {
 93 |                     //Parse exon attributes
 94 |                     if (attributes.find("exon_id") != attributes.end())
 95 |                     {
 96 |                         out.feature_id = attributes["exon_id"];
 97 |                     }
 98 |                     else if (attributes.find("gene_id") != attributes.end())
 99 |                     {
100 |                         out.feature_id = attributes["gene_id"] + "_" + std::to_string(++exon_names[attributes["gene_id"]]);
101 |                         std::cerr << "Unnamed exon: Gene: " << attributes["gene_id"] << " Position: [" << out.start << ", " << out.end <<  "] Inferred Exon Name: " << out.feature_id << std::endl;
102 |                     }
103 |                     else throw gtfException(std::string("Exon missing exon_id and gene_id fields: " + line));
104 |                     if (exonIds.count(out.feature_id)) throw gtfException(std::string("Detected non-unique Exon ID: "+out.feature_id));
105 |                     exonIds.insert(out.feature_id);
106 |                     exonList.push_back(out.feature_id);
107 |                     geneCodingLengths[out.gene_id] += 1 + (out.end - out.start);
108 |                     exonLengths[out.feature_id] = {out.chromosome, out.start, 1 + (out.end - out.start)};
109 |                 }
110 |                 if (attributes.find("transcript_type") != attributes.end()) out.transcript_type = attributes["transcript_type"];
111 |                 if (attributes.find("gene_name") != attributes.end()) geneNames[out.feature_id] = attributes["gene_name"];
112 |                 else if (attributes.find("gene_id") != attributes.end()) geneNames[out.feature_id] = attributes["gene_id"];
113 |                 out.ribosomal = boost::regex_search(out.transcript_type, ribosomalPattern);
114 |                 break;
115 |             }
116 |             
117 |         }
118 |         catch(gtfException &e)
119 |         {
120 |             throw e;
121 |         }
122 |         catch(std::invalid_argument &e)
123 |         {
124 |             throw gtfException(std::string("GTF is in an invalid format: ") + e.what());
125 |         }
126 |         catch(std::exception &e)
127 |         {
128 |             throw gtfException(std::string("Uncountered an unknown error while parsing GTF: ")+e.what());
129 |         }
130 |         return in;
131 |     }
132 |     
133 |     std::map<std::string,std::string>& parseAttributes(std::string &intake, std::map<std::string,std::string> &attributes)
134 |     {
135 |         std::istringstream tokenizer(intake);
136 |         string buffer;
137 |         while (getline(tokenizer, buffer, ';'))
138 |         {
139 |             std::istringstream splitter(buffer);
140 |             string current;
141 |             getline(splitter, current, '"');
142 |             string key = current.substr(0, current.length()-1);
143 |             while (key[0] == ' ' or key[0] == '\t') key = key.substr(1);
144 |             getline(splitter, current, '"');
145 |             attributes[key] = current;
146 |         }
147 |         return attributes;
148 |     }
149 |     
150 |     bool operator==(const Feature &a, const Feature &b)
151 |     {
152 |         if (a.start != b.start) return false;
153 |         if (a.end != b.end) return false;
154 |         if (a.chromosome != b.chromosome) return false;
155 |         if (a.strand != b.strand) return false;
156 |         if (a.type != b.type) return false;
157 |         if (a.feature_id != b.feature_id) return false;
158 |         return a.transcript_type == b.transcript_type;
159 |     }
160 |     
161 |     bool compIntervalStart(const Feature &a, const Feature &b)
162 |     {
163 |         return a.start < b.start;
164 |     }
165 |     
166 |     bool compIntervalEnd(const Feature &a, const Feature &b)
167 |     {
168 |         return a.end < b.end;
169 |     }
170 |     
171 |     bool intersectPoint(const Feature &a, const coord x)
172 |     {
173 |         return (x >= a.start) && (x <= a.end);
174 |     }
175 |     
176 |     bool intersectInterval(const Feature &a, const Feature &b)
177 |     {
178 |         return intersectPoint(a, b.start) || intersectPoint(a, b.end) || intersectPoint(b, a.start);
179 |     }
180 |     
181 |     int partialIntersect(const Feature &target, const Feature &query)
182 |     {
183 |         return intersectInterval(target, query) ? (
184 |                                                    1+std::min(target.end, query.end-1) - std::max(target.start, query.start)
185 |                                                    ) : 0;
186 |     }
187 | }
188 | 


--------------------------------------------------------------------------------
/test_data/Makefile.osx:
--------------------------------------------------------------------------------
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
#ZLIB_PATH and LZMA_PATH are not assigned in this file; supply them on the
#command line or through the environment
STATIC_LIBS=/usr/local/lib/libboost_filesystem.a /usr/local/lib/libboost_regex.a /usr/local/lib/libboost_system.a $(ZLIB_PATH) SeqLib/lib/libhts.a $(LZMA_PATH) /usr/local/opt/bzip2/lib/libbz2.a
#List of remaining libraries that will be dynamically linked
LIBS=

#Compiler, language standard and flags used for every object file
CC=g++
STDLIB=-std=c++14
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
#Source files (relative to SRCDIR) and the object files derived from them
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
#Flags handed to the SeqLib build as CXXFLAGS
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#The test recipes use <( ) process substitution, which requires bash
SHELL=/bin/bash
 20 | 
#Link the rnaseqc binary from the project objects and the SeqLib/htslib archives
rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)
 23 | 
#Compile one source file into its object file. Headers are not listed as
#prerequisites, so editing a header does not trigger a rebuild on its own.
%.o: %.cpp
	$(CC) $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<
 26 | 
#Configure, build and install the bundled SeqLib tree to produce both archives
SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install
 29 | 
 30 | .PHONY: clean
 31 | 
 32 | clean:
 33 | 	rm $(wildcard $(SRCDIR)/*.o)
 34 | 
 35 | # The rest of the makefile consists of test cases. Run "make test" to perform all tests
 36 | 
.PHONY: test

#Aggregate target: runs every test-* target, then prints a completion message
test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete
 41 | 
 42 | .PHONY: test-version
 43 | 
 44 | test-version: rnaseqc
 45 | 	[ ! -z "$(shell ./rnaseqc --version)" ]
 46 | 
.PHONY: test-single

#Run rnaseqc on the single_pair sample and compare the metrics and each count
#table against the stored references in test_data/single_pair.output, then
#remove the scratch output directory
test-single: rnaseqc
	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam .test_output
	python3 test_data/approx_diff.py .test_output/single_pair.bam.metrics.tsv test_data/single_pair.output/single_pair.bam.metrics.tsv -m metrics -c single_pair.bam single_pair.bam_ -t
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_reads.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_tpm.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_ -t
	python3 test_data/approx_diff.py .test_output/single_pair.bam.exon_reads.gct <(gzcat test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_fragments.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_ -t
	rm -rf .test_output
 57 | 
.PHONY: test-chr1

#Run rnaseqc with --coverage on the chr1 sample and compare the metrics, the
#count tables and the three coverage columns against test_data/chr1.output.
#The sed step rewrites "-nan" to "nan" in the coverage table before comparing.
test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam .test_output --coverage
	python3 test_data/approx_diff.py .test_output/chr1.bam.metrics.tsv test_data/chr1.output/chr1.bam.metrics.tsv -m metrics -c chr1.bam chr1.bam_ -t
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_reads.gct <(gzcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_tpm.gct <(gzcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_ -t
	python3 test_data/approx_diff.py .test_output/chr1.bam.exon_reads.gct <(gzcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_fragments.gct <(gzcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_ -t
	sed s/-nan/nan/g .test_output/chr1.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_ -t
	rm -rf .test_output
 72 | 
.PHONY: test-downsampled

#Run rnaseqc with --bed and --coverage on the downsampled sample and compare
#the metrics, count tables, coverage columns and fragment-size table against
#test_data/downsampled.output
test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/downsampled.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_ -t
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/downsampled.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_ -t
	rm -rf .test_output
 88 | 
.PHONY: test-crams

#Run rnaseqc on the CRAM version of the chr1 sample (with --fasta). Metrics are
#compared against the CRAM reference; the count tables and coverage columns
#are compared against the chr1 BAM references.
#NOTE(review): the touch presumably keeps the .fai index from looking older
#than the FASTA -- confirm
test-crams: rnaseqc
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram .test_output --coverage --fasta test_data/chr1.fasta
	python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.cram.metrics.tsv -m metrics -c chr1.cram chr1.cram_ -t
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_reads.gct <(gzcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_tpm.gct <(gzcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_ -t
	python3 test_data/approx_diff.py .test_output/chr1.cram.exon_reads.gct <(gzcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_fragments.gct <(gzcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_ -t
	sed s/-nan/nan/g .test_output/chr1.cram.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_ -t
	rm -rf .test_output
104 | 
.PHONY: test-legacy

#Run rnaseqc with --legacy on the downsampled sample and compare against
#test_data/legacy.output. The last three steps compare gene reads against the
#legacy.gene_reads table (column RNA-SeQC), run legacy_exon_remap.py on the
#exon table, and compare the result against the legacy.exon_reads table.
#NOTE(review): legacy_exon_remap.py presumably rewrites the exon table in
#place, since its stdout is discarded -- confirm
test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output --legacy
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/legacy.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/legacy.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_ -t
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_ -t
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/legacy.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_ -t
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	python3 python/rnaseqc/legacy_exon_remap.py .test_output/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf .test_output
123 | 
.PHONY: test-expected-failures

#Run rnaseqc on an input pair that is expected to be rejected; the test passes
#only when the exit status is exactly 11 ($$? is escaped for make)
test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam .test_output 2>/dev/null; test $$? -eq 11
	rm -rf .test_output
129 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
STATIC_LIBS=
#List of remaining libraries that will be dynamically linked
LIBS= -lboost_filesystem -lboost_regex -lboost_system -lz -llzma -lbz2 -lpthread

#Compiler, language standard and flags used for every object file
CC=g++
STDLIB=-std=c++14
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
#Source files (relative to SRCDIR) and the object files derived from them
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
#Flags handed to the SeqLib build as CXXFLAGS
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#The test recipes use <( ) process substitution, which requires bash
SHELL=/bin/bash
 20 | 
#Link the rnaseqc binary from the project objects and the SeqLib/htslib archives
rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)
 23 | 
.PHONY: lib

#Bundle the project object files into the static archive rnaseqc.a
lib: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file))
	ar -rcs rnaseqc.a $^
 28 | 
#Compile one source file into its object file. Headers are not listed as
#prerequisites, so editing a header does not trigger a rebuild on its own.
%.o: %.cpp
	$(CC) $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<
 31 | 
#Configure, build and install the bundled SeqLib tree to produce both archives
SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install
 34 | 
 35 | .PHONY: clean
 36 | 
 37 | clean:
 38 | 	rm $(wildcard $(SRCDIR)/*.o)
 39 | 
 40 | # The rest of the makefile consists of test cases. Run "make test" to perform all tests
 41 | 
.PHONY: test

#Aggregate target: runs every test-* target, then prints a completion message
test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete
 46 | 
 47 | .PHONY: test-version
 48 | 
 49 | test-version: rnaseqc
 50 | 	[ ! -z "$(shell ./rnaseqc --version)" ]
 51 | 
 52 | .PHONY: test-single
 53 | 
 54 | test-single: rnaseqc
 55 | 	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam .test_output
 56 | 	python3 test_data/approx_diff.py .test_output/single_pair.bam.metrics.tsv test_data/single_pair.output/single_pair.bam.metrics.tsv -m metrics -c single_pair.bam single_pair.bam_
 57 | 	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_reads.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
 58 | 	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_tpm.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
 59 | 	python3 test_data/approx_diff.py .test_output/single_pair.bam.exon_reads.gct <(gzcat test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
 60 | 	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_fragments.gct <(gzcat test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
 61 | 	rm -rf .test_output
 62 | 
# Run rnaseqc on the chr1 BAM with --coverage and compare the metrics, the
# expression tables and the per-gene coverage statistics with the references
# under test_data/chr1.output.
.PHONY: test-chr1

test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam .test_output --coverage
	python3 test_data/approx_diff.py .test_output/chr1.bam.metrics.tsv test_data/chr1.output/chr1.bam.metrics.tsv -m metrics -c chr1.bam chr1.bam_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_reads.gct <(gzcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_tpm.gct <(gzcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/chr1.bam.exon_reads.gct <(gzcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_fragments.gct <(gzcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/chr1.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	rm -rf .test_output
 77 | 
# Run rnaseqc on the downsampled BAM with --bed and --coverage, then compare the
# metrics, expression tables, coverage statistics and fragment size counts with
# the references under test_data/downsampled.output.
.PHONY: test-downsampled

test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/downsampled.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(gzcat test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/downsampled.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_
	rm -rf .test_output
 93 | 
# Same inputs as test-downsampled, but run with --legacy and compared against
# the references under test_data/legacy.output. The last three lines additionally
# compare the gene and exon read counts with the legacy.*.gct.gz reference tables
# (column "RNA-SeQC").
.PHONY: test-legacy

test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output --legacy
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/legacy.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/legacy.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(gzcat test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/legacy.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(gzcat test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC
	# NOTE(review): stdout is discarded, so legacy_exon_remap.py presumably rewrites
	# the exon_reads.gct file in place before the final comparison -- confirm.
	python3 python/rnaseqc/legacy_exon_remap.py .test_output/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(gzcat test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf .test_output
112 | 
# Run rnaseqc on the chr1 CRAM (with --fasta as the reference) and compare the
# outputs with the references under test_data/chr1.output. The expression tables
# and coverage statistics are compared against the chr1.bam references; the
# metrics and gc_content tables have CRAM-specific references.
.PHONY: test-crams

test-crams: rnaseqc
	# NOTE(review): presumably refreshes the index timestamp so it is not older
	# than the fasta -- confirm.
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram .test_output --coverage --fasta test_data/chr1.fasta
	python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.cram.metrics.tsv -m metrics -c chr1.cram chr1.cram_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_reads.gct <(gzcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_tpm.gct <(gzcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/chr1.cram.exon_reads.gct <(gzcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_fragments.gct <(gzcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gc_content.tsv test_data/chr1.output/chr1.cram.gc_content.tsv -m metrics -c Count Count_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/chr1.cram.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	rm -rf .test_output
129 | 
# Negative test: rnaseqc must exit with status 11 for this GTF/BAM combination.
# stderr is discarded, and "$$?" is escaped so the shell (not make) expands it.
.PHONY: test-expected-failures

test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam .test_output 2>/dev/null; test $$? -eq 11
	rm -rf .test_output
135 | 


--------------------------------------------------------------------------------
/test_data/Makefile.linux:
--------------------------------------------------------------------------------
#Set inclusion paths here (if boost, bamtools, or args are installed outside your path)
INCLUDE_DIRS=-ISeqLib -ISeqLib/htslib/
#Set library paths here (if boost or bamtools are installed outside your path)
LIBRARY_PATHS=
#Set to 0 if you encounter linker errors regarding strings from the bamtools library
ABI=1
#Provide full paths here to .a archives for libraries which should be statically linked
STATIC_LIBS=SeqLib/lib/libhts.a /usr/lib/x86_64-linux-gnu/libboost_filesystem.a /usr/lib/x86_64-linux-gnu/libboost_regex.a /usr/lib/x86_64-linux-gnu/libboost_system.a /usr/lib/x86_64-linux-gnu/libz.a /usr/lib/x86_64-linux-gnu/liblzma.a /usr/lib/x86_64-linux-gnu/libbz2.a /usr/lib/gcc/x86_64-linux-gnu/9*/libstdc++.a
#List of remaining libraries that will be dynamically linked
LIBS=-lpthread

#Compiler and C++ standard used for the rnaseqc sources and forwarded to the SeqLib build
CC=g++
STDLIB=-std=c++14
#Compile flags for the rnaseqc sources; ABI (above) sets _GLIBCXX_USE_CXX11_ABI
CFLAGS=-Wall $(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI) -O3
#Source files (relative to SRCDIR); OBJECTS is derived from SOURCES by replacing .cpp with .o
SOURCES=BED.cpp Expression.cpp GTF.cpp RNASeQC.cpp Metrics.cpp Fasta.cpp BamReader.cpp
SRCDIR=src
OBJECTS=$(SOURCES:.cpp=.o)
#Flags passed as CXXFLAGS to the SeqLib sub-build so it uses the same standard and ABI setting
SEQFLAGS=$(STDLIB) -D_GLIBCXX_USE_CXX11_ABI=$(ABI)
#bash is required: the test recipes below use process substitution, <(zcat ...)
SHELL=/bin/bash
 20 | 
# Link the static rnaseqc binary from the objects in $(SRCDIR) and the SeqLib archives.
# NOTE(review): SeqLib/lib/libhts.a is both a prerequisite ($^) and listed in
# STATIC_LIBS, so it is passed to the linker twice.
rnaseqc: $(foreach file,$(OBJECTS),$(SRCDIR)/$(file)) SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a
	$(CC) -static -static-libstdc++ -static-libgcc -O3 $(LIBRARY_PATHS) -o $@ $^ $(STATIC_LIBS) $(LIBS)
 23 | 
# Compile a single .cpp file into the matching .o file.
# NOTE(review): the -static* options are link-time flags and presumably have no
# effect together with -c -- confirm before removing.
%.o: %.cpp
	$(CC) -static -static-libstdc++ -static-libgcc $(CFLAGS) -I. $(INCLUDE_DIRS) -c -o $@ $<
 26 | 
# Configure, build and install the bundled SeqLib sources; this one recipe is
# registered for both archive targets. SEQFLAGS is passed as CXXFLAGS so SeqLib
# is compiled with the same C++ standard and ABI setting as rnaseqc.
SeqLib/lib/libseqlib.a SeqLib/lib/libhts.a:
	cd SeqLib && ./configure && make CXXFLAGS="$(SEQFLAGS)" && make install
 29 | 
 30 | .PHONY: clean
 31 | 
 32 | clean:
 33 | 	rm $(wildcard $(SRCDIR)/*.o)
 34 | 
 35 | # The rest of the makefile consists of test cases. Run "make test" to perform all tests
 36 | 
# Aggregate target: runs every test-* target listed as a prerequisite, then
# prints a completion message.
.PHONY: test

test: test-version test-single test-chr1 test-downsampled test-legacy test-crams test-expected-failures
	echo Tests Complete
 41 | 
 42 | .PHONY: test-version
 43 | 
 44 | test-version: rnaseqc
 45 | 	[ ! -z "$(shell ./rnaseqc --version)" ]
 46 | 
# Run rnaseqc on the single_pair inputs and compare each output (metrics plus the
# gene_reads, gene_tpm, exon_reads and gene_fragments tables) with the reference
# files under test_data/single_pair.output using test_data/approx_diff.py.
# make stops at the first failing line, so .test_output is only removed when
# every comparison succeeds.
.PHONY: test-single

test-single: rnaseqc
	./rnaseqc test_data/single_pair.gtf test_data/single_pair.bam .test_output
	python3 test_data/approx_diff.py .test_output/single_pair.bam.metrics.tsv test_data/single_pair.output/single_pair.bam.metrics.tsv -m metrics -c single_pair.bam single_pair.bam_
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_reads.gct <(zcat test_data/single_pair.output/single_pair.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_tpm.gct <(zcat test_data/single_pair.output/single_pair.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/single_pair.bam.exon_reads.gct <(zcat test_data/single_pair.output/single_pair.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/single_pair.bam.gene_fragments.gct <(zcat test_data/single_pair.output/single_pair.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	rm -rf .test_output
 57 | 
# Run rnaseqc on the chr1 BAM with --coverage and compare the metrics, the
# expression tables and the per-gene coverage statistics with the references
# under test_data/chr1.output.
.PHONY: test-chr1

test-chr1: rnaseqc
	./rnaseqc test_data/chr1.gtf test_data/chr1.bam .test_output --coverage
	python3 test_data/approx_diff.py .test_output/chr1.bam.metrics.tsv test_data/chr1.output/chr1.bam.metrics.tsv -m metrics -c chr1.bam chr1.bam_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_reads.gct <(zcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_tpm.gct <(zcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/chr1.bam.exon_reads.gct <(zcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.bam.gene_fragments.gct <(zcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/chr1.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	rm -rf .test_output
 72 | 
# Run rnaseqc on the downsampled BAM with --bed and --coverage, then compare the
# metrics, expression tables, coverage statistics and fragment size counts with
# the references under test_data/downsampled.output.
.PHONY: test-downsampled

test-downsampled: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/downsampled.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(zcat test_data/downsampled.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(zcat test_data/downsampled.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(zcat test_data/downsampled.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(zcat test_data/downsampled.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/downsampled.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/downsampled.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_
	rm -rf .test_output
 88 | 
# Run rnaseqc on the chr1 CRAM (with --fasta as the reference) and compare the
# outputs with the references under test_data/chr1.output. The expression tables
# and coverage statistics are compared against the chr1.bam references; the
# metrics table has a CRAM-specific reference.
# NOTE(review): test_data/Makefile.osx also compares chr1.cram.gc_content.tsv in
# this target; that check is absent here -- confirm whether this is intentional.
.PHONY: test-crams

test-crams: rnaseqc
	# NOTE(review): presumably refreshes the index timestamp so it is not older
	# than the fasta -- confirm.
	touch test_data/chr1.fasta.fai
	./rnaseqc test_data/chr1.gtf test_data/chr1.cram .test_output --coverage --fasta test_data/chr1.fasta
	python3 test_data/approx_diff.py .test_output/chr1.cram.metrics.tsv test_data/chr1.output/chr1.cram.metrics.tsv -m metrics -c chr1.cram chr1.cram_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_reads.gct <(zcat test_data/chr1.output/chr1.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_tpm.gct <(zcat test_data/chr1.output/chr1.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/chr1.cram.exon_reads.gct <(zcat test_data/chr1.output/chr1.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/chr1.cram.gene_fragments.gct <(zcat test_data/chr1.output/chr1.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/chr1.cram.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/chr1.output/chr1.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	rm -rf .test_output
104 | 
# Same inputs as test-downsampled, but run with --legacy and compared against
# the references under test_data/legacy.output. The last three lines additionally
# compare the gene and exon read counts with the legacy.*.gct.gz reference tables
# (column "RNA-SeQC").
.PHONY: test-legacy

test-legacy: rnaseqc
	./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .test_output --legacy
	python3 test_data/approx_diff.py .test_output/downsampled.bam.metrics.tsv test_data/legacy.output/downsampled.bam.metrics.tsv -m metrics -c downsampled.bam downsampled.bam_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(zcat test_data/legacy.output/downsampled.bam.gene_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_tpm.gct <(zcat test_data/legacy.output/downsampled.bam.gene_tpm.gct.gz) -m tables -c TPM TPM_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(zcat test_data/legacy.output/downsampled.bam.exon_reads.gct.gz) -m tables -c Counts Counts_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_fragments.gct <(zcat test_data/legacy.output/downsampled.bam.gene_fragments.gct.gz) -m tables -c Fragments Fragments_
	# Rewrite "-nan" as "nan" in the coverage table before comparing it
	sed s/-nan/nan/g .test_output/downsampled.bam.coverage.tsv > .test_output/coverage.tsv
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_mean coverage_mean_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_std coverage_std_
	python3 test_data/approx_diff.py .test_output/coverage.tsv test_data/legacy.output/downsampled.bam.coverage.tsv -m metrics -c coverage_CV coverage_CV_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.fragmentSizes.txt test_data/legacy.output/downsampled.bam.fragmentSizes.txt -m fragments -c Count Count_
	python3 test_data/approx_diff.py .test_output/downsampled.bam.gene_reads.gct <(zcat test_data/legacy.output/legacy.gene_reads.gct.gz) -m tables -c Counts RNA-SeQC
	# NOTE(review): stdout is discarded, so legacy_exon_remap.py presumably rewrites
	# the exon_reads.gct file in place before the final comparison -- confirm.
	python3 python/rnaseqc/legacy_exon_remap.py .test_output/downsampled.bam.exon_reads.gct test_data/downsampled.gtf > /dev/null
	python3 test_data/approx_diff.py .test_output/downsampled.bam.exon_reads.gct <(zcat test_data/legacy.output/legacy.exon_reads.gct.gz) -m tables -c Counts RNA-SeQC -t
	rm -rf .test_output
123 | 
# Negative test: rnaseqc must exit with status 11 for this GTF/BAM combination.
# stderr is discarded, and "$$?" is escaped so the shell (not make) expands it.
.PHONY: test-expected-failures

test-expected-failures: rnaseqc
	./rnaseqc test_data/gencode.v26.collapsed.gtf test_data/downsampled.bam .test_output 2>/dev/null; test $$? -eq 11
	rm -rf .test_output
129 | 


--------------------------------------------------------------------------------
/python/rnaseqc/report.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import argparse
  4 | import os
  5 | import sys
  6 | import matplotlib.pyplot as plt
  7 | import qtl.io
  8 | 
  9 | sys.path.insert(1, os.path.dirname(__file__))
 10 | from .plot import *
 11 | 
 12 | 
 13 | def plot_qc_figures(metrics_df, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None,
 14 |                     insertsize_df=None, gc_content_df=None, tpm_df=None,
 15 |                     thresholds=None, lims=None, outlier_method='threshold',
 16 |                     show_legend=True, legend_cols=5, lw=4, lh=1, ms=12, alpha=1, show_xticklabels=False,
 17 |                     highlight_ids=None, prefix=None, output_dir=None, dpi=300):
 18 |     """
 19 |     metrics_df: output from RNA-SeQC
 20 |     cohort_s: mapping of sample ID to cohort/cluster/etc.
 21 |     """
 22 |     if cohort_s is None:
 23 |         cohort_s = pd.Series('All samples', index=metrics_df.index)
 24 |     else:
 25 |         assert metrics_df.index.isin(cohort_s.index).all() and cohort_s.loc[metrics_df.index].notnull().all()
 26 | 
 27 |     if date_s is not None:
 28 |         assert metrics_df.index.isin(date_s.index).all() and date_s.loc[metrics_df.index].notnull().all()
 29 | 
 30 |     if output_dir is not None:
 31 |         assert prefix is not None
 32 | 
 33 |     cohorts = np.unique(cohort_s.loc[metrics_df.index])
 34 |     if cohort_colors is None:
 35 |         cohort_colors = get_cohort_colors(cohorts)
 36 | 
 37 |     metrics_args = {
 38 |         'cohort_s': cohort_s,
 39 |         'cohort_order': cohort_order,
 40 |         'cohort_colors': cohort_colors,
 41 |         'date_s': date_s,
 42 |         'show_xticklabels': show_xticklabels,
 43 |         'ms': ms,
 44 |         'alpha': alpha,
 45 |         'highlight_ids': highlight_ids,
 46 |         'aw': 6,
 47 |         'ah': 2,
 48 |     }
 49 | 
 50 |     metrics_list = [
 51 |         'Mapped Reads',  # Unique Mapping, Vendor QC Passed Reads that were mapped
 52 |         'Mapping Rate',
 53 |         'Duplicate Rate of Mapped',
 54 |         'Exonic Rate',
 55 |         'Intronic Rate',
 56 |         'Intergenic Rate',
 57 |         'Chimeric Alignment Rate',
 58 |         'rRNA Rate',
 59 |         # 'Mapped Unique Reads',  # Duplicate Rate of Mapped is more representative
 60 |         "Median 3' bias",
 61 |         'Median Exon CV',
 62 |         'Fragment GC Content Mean',
 63 |         'Average Fragment Length',
 64 |     ]
 65 | 
 66 |     threshold_dir_dict = {
 67 |         'Mapped Reads': 'lt',
 68 |         'Mapping Rate': 'lt',
 69 |         'Duplicate Rate of Mapped': 'gt',
 70 |         'Exonic Rate': 'lt',
 71 |         'Intronic Rate': 'gt',
 72 |         'Intergenic Rate': 'gt',
 73 |         'Chimeric Alignment Rate': 'gt',
 74 |         'rRNA Rate': 'gt',
 75 |         "Median 3' bias": 'gt',
 76 |         'Median Exon CV': 'gt',
 77 |         'Average Fragment Length': 'lt',
 78 |     }
 79 | 
 80 |     threshold_dict = {
 81 |         'Mapped Reads': 50e6,
 82 |         'Mapping Rate': 0.9,
 83 |         # 'Duplicate Rate of Mapped': 0.6,
 84 |         'Exonic Rate': 0.7,
 85 |         'Intronic Rate': 0.05,
 86 |         'Intergenic Rate': 0.1,
 87 |         'Chimeric Alignment Rate': 0.01,
 88 |         'rRNA Rate': 0.1,
 89 |     }
 90 |     if thresholds is not None:
 91 |         threshold_dict.update(thresholds)
 92 | 
 93 |     ylim_dict = {
 94 |         'Mapped Reads': None,
 95 |         'Mapping Rate': [0,1],
 96 |         'Duplicate Rate of Mapped': [0, 1],
 97 |         'Exonic Rate': [0, 1],
 98 |         'Intronic Rate': [0, 1],
 99 |         'Intergenic Rate': [0, 1],
100 |         'Chimeric Alignment Rate': [0, 0.1],
101 |         'rRNA Rate': [0, 1],
102 |         "Median 3' bias": [0, 1],
103 |         'Median Exon CV': None,
104 |         'Fragment GC Content Mean': [0, 1],
105 |         'Average Fragment Length': None,
106 |     }
107 | 
108 |     if lims is not None:
109 |         ylim_dict.update(lims)
110 | 
111 |     if cohort_order is None:
112 |         cohort_order = cohorts
113 | 
114 |     # plot cohort legend
115 |     if show_legend:
116 |         ax = qtl.plot.setup_figure(lw, lh, xspace=[0,0], yspace=[0,0])
117 |         for c in cohort_order:
118 |             ax.scatter(np.nan, np.nan, s=48, marker='s', color=cohort_colors[c], label=c)
119 |         ax.scatter(np.nan, np.nan, fc='w', ec='k', lw=1, s=30, label='Outliers')
120 |         ax.legend(loc='center left', handlelength=1, ncol=legend_cols)
121 |         plt.axis('off')
122 |         if output_dir is not None:
123 |             plt.savefig(os.path.join(output_dir, f'{prefix}.legend.pdf'), dpi=dpi)
124 | 
125 |     # distributions for selected/key metrics
126 |     for k,metric in enumerate(metrics_list, 1):
127 |         if metric in metrics_df and not (metrics_df[metric] == 0).all():
128 |             if metric == 'Duplicate Rate of Mapped' and 'Duplicate Rate of Mapped, excluding Globins' in metrics_df:
129 |                 metric_s = metrics_df['Duplicate Rate of Mapped, excluding Globins'].rename('Duplicate Rate of Mapped')
130 |             else:
131 |                 metric_s = metrics_df[metric]
132 |             metrics(metric_s, ylim=ylim_dict[metric],
133 |                          threshold=threshold_dict.get(metric, None),
134 |                          threshold_dir=threshold_dir_dict.get(metric, None),
135 |                          outlier_method=outlier_method,
136 |                          **metrics_args)
137 |             if output_dir is not None:
138 |                 plt.savefig(os.path.join(output_dir, '{}.{}.pdf'.format(prefix, metric.lower().replace("3'",'3prime').replace(' ','_'))), dpi=dpi)
139 | 
140 |     # genes detected vs bias and duplication rate
141 |     if "Median 3' bias" in metrics_df:
142 |         c = 'Duplicate Rate of Mapped, excluding Globins' if 'Duplicate Rate of Mapped, excluding Globins' in metrics_df else 'Duplicate Rate of Mapped'
143 |         detection_bias(metrics_df, bias_metric="Median 3' bias", c=c)
144 |         if output_dir is not None:
145 |             plt.savefig(os.path.join(output_dir, f'{prefix}.genes_detected_vs_median_3prime_bias.pdf'), dpi=dpi)
146 | 
147 |     # mismatch rates
148 |     if not metrics_df['End 1 Mismatch Rate'].isnull().all():
149 |         mismatch_rates(metrics_df, cohort_s=cohort_s, cohort_order=cohort_order, cohort_colors=cohort_colors,
150 |                        end1_threshold=threshold_dict.get('End 1 mismatch rate', None),
151 |                        end2_threshold=threshold_dict.get('End 2 mismatch rate', None))
152 |         if output_dir is not None:
153 |             plt.savefig(os.path.join(output_dir, f'{prefix}.end_mismatch_rates.pdf'), dpi=dpi)
154 | 
155 |     mapping_sense(metrics_df, cohort_s=cohort_s, cohort_order=cohort_order,
156 |                   cohort_colors=cohort_colors, date_s=date_s, width=1)
157 |     if output_dir is not None:
158 |         plt.savefig(os.path.join(output_dir, f'{prefix}.mapping_sense.pdf'), dpi=dpi)
159 | 
160 |     # insert size distributions (if supplied)
161 |     if insertsize_df is not None:
162 |         insert_sizes(insertsize_df, cohort_s=cohort_s, cohort_order=cohort_order,
163 |                      cohort_colors=cohort_colors, sort_order='cohort')
164 |         if output_dir is not None:
165 |             plt.savefig(os.path.join(output_dir, f'{prefix}.insert_sizes.pdf'), dpi=dpi)
166 | 
167 |     if gc_content_df is not None:
168 |         gc_content(gc_content_df, cohort_s=cohort_s, cohort_colors=cohort_colors,
169 |                    cohort_order=cohort_order, sort_order='cohort')
170 |         if output_dir is not None:
171 |             plt.savefig(os.path.join(output_dir, f'{prefix}.gc_content.pdf'), dpi=dpi)
172 | 
173 |     if tpm_df is not None:
174 |         cdf_df = calculate_expression_cdfs(tpm_df)
175 |         if tpm_df.shape[1] < 50:
176 |             mode = 'lines'
177 |         else:
178 |             mode = 'ci'
179 |         cumulative_expression(cdf_df, cohort_s=cohort_s, cohort_colors=cohort_colors, mode=mode)
180 |         if output_dir is not None:
181 |             plt.savefig(os.path.join(output_dir, f'{prefix}.cumulative_expression.pdf'), dpi=dpi)
182 | 
183 | 
def load_inputs(args):
    """
    Load every table referenced by the parsed command-line arguments.

    args: namespace with the attributes metrics, tpm, cohort, date and
        insert_size (file paths; all but metrics may be None)

    Returns (metrics_df, tpm_df, cohort_s, date_s, insertsize_df); entries
    whose argument is None are returned as None.

    Raises AssertionError if the cohort or date annotation does not contain
    every sample ID in the metrics index.
    """
    def _read_annotation(path):
        # Header-less TSV with sample IDs in the first column; squeezed to a
        # Series (expects a single value column).
        annotation_s = pd.read_csv(path, sep='\t', index_col=0, header=None).squeeze('columns')
        assert metrics_df.index.isin(annotation_s.index).all()
        return annotation_s

    # metrics table: parquet if the extension says so, TSV otherwise
    metrics_path = args.metrics
    if metrics_path.endswith('.parquet'):
        metrics_df = pd.read_parquet(metrics_path)
    else:
        metrics_df = pd.read_csv(metrics_path, sep='\t', index_col=0)

    tpm_df = None if args.tpm is None else qtl.io.read_gct(args.tpm, load_description=False)
    cohort_s = None if args.cohort is None else _read_annotation(args.cohort)
    date_s = None if args.date is None else _read_annotation(args.date)

    if args.insert_size is None:
        insertsize_df = None
    else:
        insertsize_df = pd.read_csv(args.insert_size, sep='\t', index_col=0)

    return metrics_df, tpm_df, cohort_s, date_s, insertsize_df
214 | 
215 | 
if __name__ == '__main__':
    # Command-line entry point: parse arguments, load the tables and write all
    # QC figures as PDFs into the output directory.

    parser = argparse.ArgumentParser(description='Generate QC report from RNA-SeQC metrics table.')
    parser.add_argument('metrics', help='Aggregated QC metrics from RNA-SeQC.')
    parser.add_argument('prefix', help='Name for output files.')
    parser.add_argument('--tpm', default=None, help='Aggregated TPM matrix from RNA-SeQC.')
    parser.add_argument('--insert-size', default=None, help='Aggregated insert sizes from RNA-SeQC.')
    parser.add_argument('--cohort', default=None, help='Cohort or batch annotation. TSV file mapping sample IDs to annotation.')
    parser.add_argument('--date', default=None, help='Date annotation. TSV file mapping sample IDs to dates.')
    # The default is '.', so figures are always saved; the help text says so.
    parser.add_argument('--output-dir', default='.', help='Directory where figures are saved (default: current directory).')
    parser.add_argument('--dpi', type=int, default=300, help='Figure resolution.')
    args = parser.parse_args()

    # plt.savefig does not create missing directories, so make sure it exists
    os.makedirs(args.output_dir, exist_ok=True)

    metrics_df, tpm_df, cohort_s, date_s, insertsize_df = load_inputs(args)

    plot_qc_figures(metrics_df, cohort_s=cohort_s, cohort_colors=None, date_s=date_s,
                    prefix=args.prefix, output_dir=args.output_dir, dpi=args.dpi, show_legend=True,
                    ms=12, alpha=1, show_xticklabels=False, highlight_ids=None,
                    thresholds=None, insertsize_df=insertsize_df, tpm_df=tpm_df)
235 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RNA-SeQC
  2 | 
  3 | [![Version](https://img.shields.io/github/release/getzlab/rnaseqc.svg?label=Version)](https://github.com/getzlab/rnaseqc/releases)
  4 | [![CI](https://github.com/getzlab/rnaseqc/workflows/CI/badge.svg)](https://github.com/getzlab/rnaseqc/actions)
  5 | 
  6 | RNA-SeQC 2 is described in [A. Graubert*, F. Aguet*, A. Ravi, K.G. Ardlie, Gad Getz, "RNA-SeQC 2: efficient RNA-seq quality control and quantification for large cohorts," *Bioinformatics*, 2021](https://doi.org/10.1093/bioinformatics/btab135).
  7 | 
  8 | ## Installing
  9 | 
 10 | The latest stable build of RNA-SeQC is available on the [GitHub Releases](https://github.com/getzlab/rnaseqc/releases) page, and contains static binaries for Linux and OSX.
 11 | 
 12 | RNA-SeQC is also available as a docker image: `gcr.io/broad-cga-aarong-gtex/rnaseqc:latest` which is automatically updated with any code change.
 13 | Older versions of the docker image are tagged using the full commit SHA of any commit which introduced a code change.
 14 | 
 15 | To check out the source of RNA-SeQC, run `git clone --recursive https://github.com/getzlab/rnaseqc.git`.
 16 | If you do not use the `--recursive` flag, you'll need to run `git submodule update --init --recursive` or you will be missing [SeqLib](https://github.com/walaj/SeqLib).
 17 | 
 18 | #### Unit Tests
 19 | 
 20 | Input data for RNA-SeQC's testing suite is not stored in the repository due to
 21 | size constraints. The current test data is available [here](https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz), and must be unpacked within the `test_data/` directory.
 22 | Please note that the location of the test data is subject to change.
 23 | The test resources use **~1.2 GB** of space.
 24 | 
 25 | You can download and unpack test data with:
 26 | 
 27 | ```
 28 | cd test_data
 29 | wget https://storage.googleapis.com/agraubert/broadinstitute/rnaseqc/test_inputs.tar.gz
 30 | tar xzf test_inputs.tar.gz
 31 | ```
 32 | 
 33 | You can run the unit tests with `make test`.
 34 | 
 35 | ## Usage
 36 | 
 37 | **NOTE**: This tool requires that the provided GTF be collapsed in such a way that there are no overlapping transcripts **on the same strand** and that each gene have a single transcript whose id matches the parent gene id. This is **not** a transcript-quantification method. Read counts and coverage are made towards exons and genes only if *all* aligned segments of a read fully align to exons of a gene, but keep in mind that coverage may be counted towards multiple transcripts (and their exons) if these criteria are met. Beyond this, no attempt will be made to disambiguate which transcript a read belongs to.
 38 | You can collapse an existing GTF using the [GTEx collapse annotation script](https://github.com/broadinstitute/gtex-pipeline/tree/master/gene_model).
 39 | 
 40 | ### Command Line Usage:
 41 | 
 42 | `rnaseqc [OPTIONS] gtf bam output`
 43 | 
 44 | Example: `./rnaseqc test_data/downsampled.gtf test_data/downsampled.bam --bed test_data/downsampled.bed --coverage .`
 45 | 
 46 | ###### OPTIONS:
 47 |       -h, --help                        Display this message and quit
 48 | 
 49 |       --version                         Display the version and quit
 50 | 
 51 |       gtf                               The input GTF file containing features
 52 |                                         to check the bam against
 53 | 
 54 |       bam                               The input SAM/BAM file containing reads
 55 |                                         to process
 56 | 
 57 |       output                            Output directory
 58 | 
 59 |       -s[sample], --sample=[sample]     The name of the current sample. Default:
 60 |                                         The bam's filename
 61 | 
 62 |       --bed=[BEDFILE]                   Optional input BED file containing
 63 |                                         non-overlapping exons used for fragment
 64 |                                         size calculations
 65 | 
 66 |       --fasta=[fasta]                   Optional input FASTA/FASTQ file
 67 |                                         containing the reference sequence used
 68 |                                         for parsing CRAM files
 69 | 
 70 |       --chimeric-distance=[DISTANCE]    Set the maximum accepted distance
 71 |                                         between read mates. Mates beyond this
 72 |                                         distance will be counted as chimeric
 73 |                                         pairs. Default: 2000000 [bp]
 74 | 
 75 |       --fragment-samples=[SAMPLES]      Set the number of samples to take when
 76 |                                         computing fragment sizes. Requires the
 77 |                                         --bed argument. Default: 1000000
 78 | 
 79 |       -q[QUALITY],
 80 |       --mapping-quality=[QUALITY]       Set the lower bound on read quality for
 81 |                                         exon coverage counting. Reads below this
 82 |                                         number are excluded from coverage
 83 |                                         metrics. Default: 255
 84 | 
 85 |       --base-mismatch=[MISMATCHES]      Set the maximum number of allowed
 86 |                                         mismatches between a read and the
 87 |                                         reference sequence. Reads with more than
 88 |                                         this number of mismatches are excluded
 89 |                                         from coverage metrics. Default: 6
 90 | 
 91 |       --offset=[OFFSET]                 Set the offset into the gene for the 3'
 92 |                                         and 5' windows in bias calculation. A
 93 |                                         positive value shifts the 3' and 5'
 94 |                                         windows towards each other, while a
 95 |                                         negative value shifts them apart.
 96 |                                         Default: 150 [bp]
 97 | 
 98 |       --window-size=[SIZE]              Set the size of the 3' and 5' windows in
 99 |                                         bias calculation. Default: 100 [bp]
100 | 
101 |       --gene-length=[LENGTH]            Set the minimum size of a gene for bias
102 |                                         calculation. Genes below this size are
103 |                                         ignored in the calculation. Default: 600
104 |                                         [bp]
105 | 
106 |       --legacy                          Use legacy counting rules. Gene and exon
107 |                                         counts match output of RNA-SeQC 1.1.9
108 | 
109 |       --stranded=[stranded]             Use strand-specific metrics. Only
110 |                                         features on the same strand of a read
111 |                                         will be considered. Allowed values are
112 |                                         'RF', 'rf', 'FR', and 'fr'
113 | 
114 |       -v, --verbose                     Give some feedback about what's going
115 |                                         on. Supply this argument twice for
116 |                                         progress updates while parsing the bam
117 | 
118 |       -t[TAG...], --tag=[TAG...]        Filter out reads with the specified tag.
119 | 
120 |       --chimeric-tag=[TAG]              Reads marked with the specified tag will
121 |                                         be labeled as Chimeric. Defaults to 'ch'
122 |                                         for STAR
123 | 
124 |       --exclude-chimeric                Exclude chimeric reads from the read
125 |                                         counts
126 | 
127 |       -u, --unpaired                    Allow unpaired reads to be quantified.
128 |                                         Required for single-end libraries
129 | 
130 |       --rpkm                            Output gene RPKM values instead of TPMs
131 | 
132 |       --coverage                        If this flag is provided, coverage
133 |                                         statistics for each transcript will be
134 |                                         written to a table. Otherwise, only
135 |                                         summary coverage statistics are
136 |                                         generated and added to the metrics table
137 | 
138 |       --coverage-mask=[SIZE]            Sets how many bases at both ends of a
139 |                                         transcript are masked out when computing
140 |                                         per-base exon coverage. Default: 500bp
141 | 
142 |       -d[threshold],
143 |       --detection-threshold=[threshold] Number of counts on a gene to consider
144 |                                         the gene 'detected'. Additionally, genes
145 |                                         below this limit are excluded from 3'
146 |                                         bias computation. Default: 5 reads
147 | 
148 |       "--" can be used to terminate flag options and force all following
149 |       arguments to be treated as positional options
150 | 
151 | ### Output files:
152 | The following output files are generated in the output directory you provide:
153 | * {sample}.metrics.tsv : A tab-delimited list of (Statistic, Value) pairs of all statistics and metrics recorded.
154 | * {sample}.exon_reads.gct : A tab-delimited GCT file with (Exon ID, Gene Name, coverage) tuples for all exons which had at least part of one read mapped.
155 | * {sample}.gene_reads.gct : A tab-delimited GCT file with (Gene ID, Gene Name, coverage) tuples for all genes which had at least one read map to at least one of its exons. This file contains the gene-level read counts used, e.g., for differential expression analyses.
156 | * {sample}.gene_tpm.gct : A tab-delimited GCT file with (Gene ID, Gene Name, TPM) tuples for all genes reported in the gene_reads.gct file, with expression values in transcripts per million (TPM) units. Note: this file is renamed to .gene_rpkm.gct if the **--rpkm** flag is present.
157 | * {sample}.fragmentSizes.txt : A list of fragment sizes recorded, if a BED file was provided
158 | * {sample}.coverage.tsv : A tab-delimited list of (Gene ID, Transcript ID, Mean Coverage, Coverage Std, Coverage CV) tuples for all transcripts encountered in the GTF.
159 | 
160 | #### Metrics reported:
161 | 
162 | See [Metrics.md](Metrics.md) for a description of all metrics reported in the `metrics.tsv`, `coverage.tsv`, and `fragmentSizes.txt` files.
163 | 
164 | ### Legacy mode differences
165 | 
166 | The **--legacy** flag enables compatibility with RNA-SeQC 1.1.9. This ensures that exon and gene read counts exactly match the counts which would have been produced by running that version. This also adds an extra condition to classify reads as chimeric (see "Chimeric Reads", above). Any metrics which existed in 1.1.9 will also match within Java's floating point precision.
167 | 


--------------------------------------------------------------------------------
/src/Metrics.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  Metrics.cpp
  3 | //  IntervalTree
  4 | //
  5 | //  Created by Aaron Graubert on 7/5/17.
  6 | //  Copyright © 2017 Aaron Graubert. All rights reserved.
  7 | //
  8 | 
  9 | #include "Metrics.h"
 10 | #include <iostream>
 11 | #include <math.h>
 12 | #include <cmath>
 13 | #include <unordered_set>
 14 | #include <algorithm>
 15 | #include <iterator>
 16 | 
 17 | namespace rnaseqc {
 18 | 
 19 | 
    // File-scope tallies. Nothing in this translation unit writes to them;
    // NOTE(review): presumably filled in by the read-counting code elsewhere — confirm.
    std::map<std::string, double> uniqueGeneCounts, geneCounts, exonCounts, geneFragmentCounts; //counters for read coverage of genes and exons

    std::map<std::string, std::unordered_set<std::string> > fragmentTracker; // tracks fragments encountered by each gene
    
    // Forward declaration: defined further down in this file and called from BaseCoverage::compute
    std::tuple<double, double, double> computeCoverage(Fasta&, std::ofstream&, const Feature&, const unsigned int, const std::map<std::string, std::vector<unsigned long> >&, std::map<std::string, ExonCoverage>&, BiasCounter&);

    // Forward declaration: defined further down in this file and called from BaseCoverage::commit
    void add_range(std::vector<unsigned long>&, coord, unsigned int);
 27 | 
 28 |     void Metrics::increment(std::string key)
 29 |     {
 30 |         this->counter[key]++;
 31 |     }
 32 | 
 33 |     void Metrics::increment(std::string key, int n)
 34 |     {
 35 |         this->counter[key] += n;
 36 |     }
 37 | 
 38 |     unsigned long Metrics::get(std::string key)
 39 |     {
 40 |         return this->counter[key];
 41 |     }
 42 | 
 43 |     double Metrics::frac(std::string a, std::string b)
 44 |     {
 45 |         return static_cast<double>(this->get(a)) / this->get(b);
 46 |     }
 47 | 
 48 |     // Add coverage to an exon
 49 |     void Collector::add(const std::string &gene_id, const std::string &exon_id, const double coverage)
 50 |     {
 51 |         if (coverage > 0)
 52 |         {
 53 |             this->data[gene_id].push_back(std::pair<std::string, double>(exon_id, coverage));
 54 |             this->dirty = true;
 55 |         }
 56 |     }
 57 | 
 58 |     //Commit all the exon coverage from this gene to the global exon coverage counter
 59 |     void Collector::collect(const std::string &gene_id)
 60 |     {
 61 |         for (auto entry = this->data[gene_id].begin(); entry != this->data[gene_id].end(); ++entry)
 62 |         {
 63 |             (*this->target)[entry->first] += entry->second;
 64 |             this->total += entry->second;
 65 |         }
 66 |     }
 67 | 
 68 |     //Legacy version of the above function. Ignores the actual coverage and reports a full read count
 69 |     void Collector::collectSingle(const std::string &gene_id)
 70 |     {
 71 |         for (auto entry = this->data[gene_id].begin(); entry != this->data[gene_id].end(); ++entry)
 72 |         {
 73 |             (*this->target)[entry->first] += 1.0;
 74 |         }
 75 |     }
 76 | 
 77 |     //Check if there is any coverage on any exon of this gene
 78 |     bool Collector::queryGene(const std::string &gene_id)
 79 |     {
 80 |         return static_cast<bool>(this->data[gene_id].size());
 81 |     }
 82 | 
 83 |     // Check if any coverage has been reported whatsoever
 84 |     bool Collector::isDirty()
 85 |     {
 86 |         return this->dirty;
 87 |     }
 88 | 
 89 |     //Get the sum of all coverage that was committed for this read (should always be <= 1)
 90 |     double Collector::sum()
 91 |     {
 92 |         return this->total;
 93 |     }
 94 | 
 95 |     //Adds coverage from one aligned segment of a read to this exon. Coverage feeds into cache until gene leaves search window
 96 |     void BaseCoverage::add(const Feature &exon, const coord start, const coord end)
 97 |     {
 98 |         CoverageEntry tmp;
 99 |         tmp.offset = start - exon.start;
100 |         tmp.length = end - start;
101 |         tmp.feature_id = exon.feature_id;
102 |         this->cache[exon.gene_id].push_back(tmp);
103 |     }
104 | 
105 |     //Commit the cached coverage to this gene after deciding to count the read towards the gene
106 |     void BaseCoverage::commit(const std::string &gene_id)
107 |     {
108 |         if (this->seen.count(gene_id))
109 |         {
110 |             std::cerr << "Gene encountered after computing coverage " << gene_id << std::endl;
111 |             return;
112 |         }
113 |         auto beg = this->cache[gene_id].begin();
114 |         auto end = this->cache[gene_id].end();
115 |         while (beg != end)
116 |         {
117 |             if (this->coverage.find(beg->feature_id) == this->coverage.end()) this->coverage[beg->feature_id] = std::vector<unsigned long>(exonLengths[beg->feature_id].length, 0ul);
118 |             //Add each coverage entry to the per-base coverage vector for the exon
119 |             //At this stage exons each have their own vectors.
120 |             //During the compute() step, exons get stiched together
121 |             add_range(this->coverage[beg->feature_id], beg->offset, beg->length);
122 |             ++beg;
123 |         }
124 |     }
125 | 
126 |     void BaseCoverage::reset() //Empties the cache
127 |     {
128 |         this->cache.clear();
129 |     }
130 | 
131 |     //computes per-base coverage of the gene
132 |     void BaseCoverage::compute(const Feature &gene)
133 |     {
134 |         //Coverage is stored in EID -> coverage vector
135 |         //First iterate over all exons of the gene and ensure they're filled
136 |         //That way, stiching the exons will result in a complete transcript even for exons which haven't been seen
137 |         for (auto exon_id = exonsForGene[gene.feature_id].begin(); exon_id != exonsForGene[gene.feature_id].end(); ++exon_id)
138 |             if (this->coverage.find(*exon_id) == this->coverage.end()) this->coverage[*exon_id] = std::vector<unsigned long>(exonLengths[*exon_id].length, 0ul);
139 |         //then compute coverage for the gene
140 |         std::tuple<double, double, double> results = computeCoverage(this->fastaReader, this->writer, gene, this->mask_size, this->coverage, this->exonCoverage, this->bias);
141 |         if (std::get<0>(results) != -1)
142 |         {
143 |             this->geneMeans.push_back(std::get<0>(results));
144 |             this->geneStds.push_back(std::get<1>(results));
145 |             this->geneCVs.push_back(std::get<2>(results));
146 |         }
147 |         //Now clean out the coverage map to save memory
148 |         for (auto exon_id = exonsForGene[gene.feature_id].begin(); exon_id != exonsForGene[gene.feature_id].end(); ++exon_id)
149 |             this->coverage.erase(*exon_id);
150 |         this->seen.insert(gene.feature_id);
151 |     }
152 | 
153 |     void BaseCoverage::close()
154 |     {
155 |         this->writer.flush();
156 |         this->writer.close();
157 |     }
158 | 
159 |     //Compute 3'/5' bias based on genes' per-base coverage
160 |     void BiasCounter::computeBias(const Feature &gene, std::vector<unsigned long> &coverage)
161 |     {
162 | 
163 |         if (coverage.size() < this->geneLength) return; //Must meet minimum length req
164 |         unsigned long peak = 0ul;
165 |         unsigned peak_pos = 0;
166 |         for (unsigned i = 0; i < coverage.size(); ++i) if (coverage[i] > peak)
167 |         {
168 |           peak_pos = i;
169 |           peak = coverage[i];
170 |         }
171 |         auto coverageMedianPos = coverage.begin() + peak_pos;
172 |         std::list<unsigned long> coveragePeakEntries;
173 |         //First scroll half a window to the right of the peak (stop if we reach the end)
174 |         for (int i = 0; i < this->windowSize/2 && coverageMedianPos != coverage.end(); ++i) ++coverageMedianPos;
175 |         //Then scroll back 1 full window, adding entries to the list
176 |         for (int i = 0; i < this->windowSize && coverageMedianPos != coverage.begin(); ++i) coveragePeakEntries.push_back(*(coverageMedianPos--));
177 |         coveragePeakEntries.sort();
178 |         double coveragePeakMedian = computeMedian(coveragePeakEntries.size(), coverageMedianPos);
179 |         
180 | 
181 |         if (coveragePeakMedian >= 100) {
182 |             std::vector<unsigned long> percentileContainer(coverage);
183 |             std::sort(percentileContainer.begin(), percentileContainer.end());
184 |             {
185 |                 auto xcursor = percentileContainer.begin();
186 |                 while (xcursor != percentileContainer.end() && (*xcursor) == 0ul) ++xcursor;
187 |                 percentileContainer.erase(percentileContainer.begin(), xcursor);
188 |             }
189 |             unsigned long lowerLimit = percentileContainer[percentileContainer.size()*0.05];
190 |             unsigned long trimmed_length = 0ul;
191 |             
192 |             {
193 |                 auto cursor = coverage.begin();
194 |                 while (cursor != coverage.end() && (*cursor) <= lowerLimit)
195 |                 {
196 |                     ++trimmed_length;
197 |                     ++cursor;
198 |                 }
199 |                 coverage.erase(coverage.begin(), cursor);
200 |             }
201 |             {
202 |                 while (coverage.size() > 0 && coverage.back() <= lowerLimit) {
203 |                     coverage.pop_back();
204 |                     ++trimmed_length;
205 |                 }
206 |             }
207 | 
208 |             if (coverage.size() >= this->geneLength)
209 |             {
210 |                 double windowSize = static_cast<double>(this->windowSize);
211 |                 std::vector<double> lcov, rcov;
212 |                 lcov.reserve(this->windowSize);
213 |                 rcov.reserve(this->windowSize);
214 |                 for (unsigned int i = this->offset; i < this->offset + this->windowSize && i < coverage.size(); ++i)
215 |                     lcov.push_back(static_cast<double>(coverage[i]));
216 |                 for (int i = coverage.size() - (this->windowSize + this->offset); i >= 0 && i < coverage.size() - this->offset; ++i)
217 |                     rcov.push_back(static_cast<double>(coverage[i]));
218 |                 std::sort(lcov.begin(), lcov.end());
219 |                 std::sort(rcov.begin(), rcov.end());
220 |                 if (gene.strand == Strand::Forward)
221 |                 {
222 |                     this->threeEnd[gene.feature_id] += computeMedian(rcov.size(), rcov.begin());
223 |                     this->fiveEnd[gene.feature_id] += computeMedian(lcov.size(), lcov.begin());
224 |                 } else
225 |                 {
226 |                     this->threeEnd[gene.feature_id] += computeMedian(lcov.size(), lcov.begin());
227 |                     this->fiveEnd[gene.feature_id] += computeMedian(rcov.size(), rcov.begin());
228 |                 }
229 |                 
230 |             }
231 |             
232 |         }
233 |         
234 | 
235 |     }
236 |     
237 | 
238 |     //Extract the bias for a gene
239 |     double BiasCounter::getBias(const std::string &geneID)
240 |     {
241 |         double cov5 = this->fiveEnd[geneID];
242 |         double cov3 = this->threeEnd[geneID];
243 |         if (cov5 + cov3 > 0.0)
244 |         {
245 |             this->countedGenes++;
246 |             return cov3 / (cov5 + cov3);
247 |         }
248 |         return -1.0;
249 |     }
250 |     
251 |     unsigned int BiasCounter::countGenes() const
252 |     {
253 |         return this->countedGenes;
254 |     }
255 | 
256 | 
257 |     void add_range(std::vector<unsigned long> &coverage, coord offset, unsigned int length)
258 |     {
259 |         const size_t size = coverage.size();
260 |         for (coord i = offset; i < offset + length && i < size; ++i) coverage[i] += 1ul;
261 |         if (offset + length > size) std::cerr << "Error: Attempted to write more coverage than present on exon. Coverage-based metrics may be inaccurate. This may be a sign of an invalid bam or gtf entry" << std::endl;
262 |     }
263 | 
264 |     //Compute exon coverage metrics, then stich exons together and compute gene coverage metrics
265 |     std::tuple<double, double, double> computeCoverage(Fasta& fastaReader, std::ofstream &writer, const Feature &gene, const unsigned int mask_size, const std::map<std::string, std::vector<unsigned long> > &coverage, std::map<std::string, ExonCoverage>& totalExonCV, BiasCounter &bias)
266 |     {
267 |         std::vector<std::vector<bool> > coverageMask;
268 |         std::vector<unsigned long> geneCoverage;
269 |         unsigned int maskRemainder = mask_size;
270 |         for (unsigned int i = 0; i < exonsForGene[gene.feature_id].size(); ++i)
271 |         {
272 |             coverageMask.push_back(std::vector<bool>(exonLengths[exonsForGene[gene.feature_id][i]].length, true)); //First store a pre-filled mask for the exon
273 |             for (unsigned int j = 0; j < coverageMask.back().size() && maskRemainder; ++j, --maskRemainder) //now, remove coverage from the front of the exon until either it, or the mask size is depleted
274 |                 coverageMask.back()[j] = false;
275 |         }
276 |         maskRemainder = mask_size; //reset the exon mask to mask out the end
277 |         for (int i = exonsForGene[gene.feature_id].size() - 1; i >= 0 && maskRemainder; --i) //repeat the process, masking out regions from the back until the mask size is depleted
278 |             for (int j = coverageMask[i].size() - 1; j >= 0 && maskRemainder; --j, --maskRemainder)
279 |                 coverageMask[i][j] = false;
280 |         for (unsigned int i = 0; i < exonsForGene[gene.feature_id].size(); ++i)
281 |         {
282 |             const std::vector<unsigned long> &exon_coverage = coverage.at(exonsForGene[gene.feature_id][i]); //get the coverage vector for the current exon
283 |             double exonMean = 0.0, exonStd = 0.0, exonSize = 0.0;
284 |             std::vector<bool> mask = coverageMask[i];
285 | 
286 |             for (unsigned int j = 0; j < mask.size(); ++j) if (mask[j]) exonSize += 1.0; //count the remaining unmasked length of the exon
287 |             if (exonSize > 0)
288 |             {
289 |                 auto maskIter = mask.begin();
290 |                 for (auto start = exon_coverage.begin(); start != exon_coverage.end(); ++start)
291 |                     if (*(maskIter++)) exonMean += static_cast<double>(*start) / exonSize;
292 |                 maskIter = mask.begin();
293 |                 for (auto start = exon_coverage.begin(); start != exon_coverage.end(); ++start)
294 |                     if (*(maskIter++)) exonStd += pow(static_cast<double>(*start) - exonMean, 2.0) / exonSize;
295 |                 exonStd = pow(exonStd, 0.5);
296 |                 exonStd /= exonMean; //now it's a CV
297 |                 
298 |                 if (!(std::isnan(exonStd) || std::isinf(exonStd))) {
299 |                     FeatureSpan exonPos = exonLengths[exonsForGene[gene.feature_id][i]];
300 |                     if (fastaReader.hasContig(exonPos.chromosome)) {
301 |                         std::string exonSeq = fastaReader.getSeq(exonPos.chromosome, exonPos.start, exonPos.start + exonPos.length);
302 |                         totalExonCV[exonsForGene[gene.feature_id][i]] = {exonStd, gc(exonSeq)};
303 |                     } else totalExonCV[exonsForGene[gene.feature_id][i]] = {exonStd, -1.0};
304 |                 }
305 |             }
306 |             // Reserve and append the exon vector to the growing gene vector
307 |             geneCoverage.reserve(geneCoverage.size() + exon_coverage.size());
308 |             geneCoverage.insert(geneCoverage.end(), exon_coverage.begin(), exon_coverage.end());
309 |         }
310 |         //at this point the gene coverage vector represents an UNMASKED, but complete transcript
311 |         bias.computeBias(gene, geneCoverage); //no masking in bias
312 |         double avg = 0.0, std = 0.0;
313 |         // apply the mask to the full gene vector
314 |         if (mask_size)
315 |         {
316 |             //to account for the mask, erase bases from the vector
317 |             //If the mask is larger than the gene, just erase all of it
318 |             //Otherwise, erase from (end-mask) -> end
319 |             geneCoverage.erase((mask_size > geneCoverage.size() ? geneCoverage.begin() : geneCoverage.end() - mask_size), geneCoverage.end());
320 |             // If there is still coverage area, erase from the front to either the end (if the mask is larger than remaining coverage) or to (front + mask)
321 |             if (geneCoverage.size()) geneCoverage.erase(geneCoverage.begin(), (mask_size > geneCoverage.size() ? geneCoverage.end() : geneCoverage.begin() + mask_size));
322 |         }
323 |         double size = static_cast<double>(geneCoverage.size());
324 |         writer << gene.feature_id << "\t";
325 |         if (size > 0) //If there's still any coverage after applying the mask
326 |         {
327 |             for (auto beg = geneCoverage.begin(); beg != geneCoverage.end(); ++beg)
328 |                 avg += static_cast<double>(*beg) / size;
329 |             for (auto base = geneCoverage.begin(); base != geneCoverage.end(); ++base)
330 |                 std += std::pow(static_cast<double>(*base) - avg, 2.0) / size;
331 |             std = std::pow(std, 0.5);
332 |             writer << avg << "\t" << std << "\t" << (std / avg) << std::endl;
333 |             return std::make_tuple(avg, std, (std / avg));
334 |         }
335 |         writer << "0\t0\tnan" << std::endl;
336 |         return std::make_tuple(-1, -1, -1);
337 |     }
338 | 
339 | 
340 | }
341 | 
342 | std::ofstream& operator<<(std::ofstream &stream, rnaseqc::Metrics &counter)
343 | {
344 |     std::vector<std::string> keys =  {
345 |         //"Alternative Alignments",
346 |         //"Chimeric Reads",
347 |         "End 1 Antisense",
348 |         "End 2 Antisense",
349 |         "End 1 Bases",
350 |         "End 2 Bases",
351 |         "End 1 Mapped Reads",
352 |         "End 2 Mapped Reads",
353 |         "End 1 Mismatches",
354 |         "End 2 Mismatches",
355 |         "End 1 Sense",
356 |         "End 2 Sense",
357 |         "Exonic Reads",
358 |         "Failed Vendor QC",
359 |         "High Quality Reads",
360 |         "Intergenic Reads",
361 |         "Intragenic Reads",
362 |         "Ambiguous Reads",
363 |         "Intronic Reads",
364 |         "Low Mapping Quality",
365 |         "Low Quality Reads",
366 |         "Mapped Duplicate Reads",
367 |         "Mapped Reads",
368 |         "Mapped Unique Reads",
369 |         "Mismatched Bases",
370 |         "Non-Globin Reads",
371 |         "Non-Globin Duplicate Reads",
372 |         "Reads used for Intron/Exon counts",
373 |         "rRNA Reads",
374 |         "Split Reads",
375 |         "Total Bases",
376 |         "Total Mapped Pairs",
377 |         // "Total Reads",
378 |         "Unique Mapping, Vendor QC Passed Reads",
379 |         "Unpaired Reads"
380 |     };
381 |     stream << "Total Alignments\t" << counter.get("Total Alignments") << std::endl;
382 |     stream << "Alternative Alignments\t" << counter.get("Alternative Alignments") << std::endl;
383 |     stream << "Supplementary Alignments\t" << counter.get("Supplementary Alignments") << std::endl;
384 |     stream << "Total Reads\t" << counter.get("Total Alignments") - counter.get("Alternative Alignments") - counter.get("Supplementary Alignments") << std::endl;
385 |     stream << "Chimeric Fragments\t";
386 |     if (counter.get("Chimeric Fragments_tag"))
387 |     {
388 |         stream << counter.get("Chimeric Fragments_tag") << std::endl;
389 |         stream << "Chimeric Alignment Rate\t" << counter.frac("Chimeric Fragments_tag", "Total Mapped Pairs") << std::endl;
390 |     }
391 |     else
392 |     {
393 |         stream << counter.get("Chimeric Fragments_auto") << std::endl;
394 |         stream << "Chimeric Alignment Rate\t" << counter.frac("Chimeric Fragments_auto", "Total Mapped Pairs") << std::endl;
395 | 
396 |     }
397 |     for (int i = 0; i < keys.size(); ++i)
398 |         if (keys[i] != "Split Reads" || counter.get("Split Reads"))
399 |             stream << keys[i] << "\t" << counter.get(keys[i]) << std::endl;
400 |     auto beg = counter.counter.begin();
401 |     auto end = counter.counter.end();
402 |     while (beg != end)
403 |     {
404 |         // Manually dump the counters for reads filtered by user supplied tags
405 |         if( beg->first.length() > 17 && beg->first.substr(0,17) == "Filtered by tag: ")
406 |         {
407 |             stream << beg->first << "\t" << beg->second << std::endl;
408 |         }
409 |         ++beg;
410 |     }
411 |     return stream;
412 | }
413 | 


--------------------------------------------------------------------------------
/python/rnaseqc/plot.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import matplotlib.ticker as ticker
  5 | from matplotlib.colors import LogNorm, ListedColormap, hsv_to_rgb
  6 | import seaborn as sns
  7 | import os
  8 | import qtl.plot
  9 | 
 10 | 
 11 | def get_cohort_colors(cohorts):
 12 |     nc = len(cohorts)
 13 |     if nc > 5:
 14 |         cohort_colors = {i:j for i,j in zip(cohorts, plt.cm.get_cmap('Spectral', nc)(np.random.permutation(np.arange(nc)))[:,:-1])}
 15 |     else:
 16 |         cohort_colors = {i:j for i,j in zip(cohorts, plt.cm.get_cmap('tab10', 10)(np.arange(nc))[:,:-1])}
 17 |     return cohort_colors
 18 | 
 19 | 
 20 | def sort_samples(sample_ix, cohort_s=None, cohort_order=None, date_s=None):
 21 |     """Sort samples by date and cohort label"""
 22 |     if cohort_s is None and date_s is None:
 23 |         return sample_ix
 24 | 
 25 |     if cohort_s is not None:
 26 |         assert sample_ix.isin(cohort_s.index).all()
 27 |         cohort_s = cohort_s[sample_ix]
 28 |     if date_s is not None:
 29 |         assert sample_ix.isin(cohort_s.index).all()
 30 |         date_s = date_s[sample_ix]
 31 | 
 32 |     if date_s is not None:
 33 |         if cohort_s is not None:  # sort samples by date and cohort
 34 |             sorted_ix = pd.concat([
 35 |                 pd.to_datetime(date_s).rename('date'), cohort_s.rename('cohort')], axis=1
 36 |             ).sort_values(['date', 'cohort'], na_position='first').index
 37 |         else:
 38 |             sorted_ix = pd.to_datetime(date_s).sort_values(na_position='first').index
 39 |     else:  # sort by cohort only
 40 |         if cohort_order is None:
 41 |             sorted_ix = cohort_s.sort_values(na_position='first').index
 42 |         else:
 43 |             sorted_ix = cohort_s.map({j:i for i,j in enumerate(cohort_order)}).sort_values(na_position='first').index
 44 | 
 45 |     return sorted_ix
 46 | 
 47 | 
 48 | def mismatch_rates(metrics_df, cohort_s=None, cohort_order=None, cohort_colors=None, ms=12, alpha=1, aw=2,
 49 |                    end1_threshold=None, end2_threshold=None,
 50 |                    end1_limit=0.01, end2_limit=0.025):
 51 |     """Plot base mismatch rates ('NM' tag) for read mate 1 vs read mate 2."""
 52 | 
 53 |     if cohort_s is not None:
 54 |         assert metrics_df.index.isin(cohort_s.index).all()
 55 |         cohort_s = cohort_s.loc[metrics_df.index]
 56 |     else:
 57 |         cohort_s = pd.Series('NA', index=metrics_df.index)
 58 | 
 59 |     ax = qtl.plot.setup_figure(aw, aw)
 60 | 
 61 |     x = metrics_df['End 1 Mismatch Rate'].copy()
 62 |     y = metrics_df['End 2 Mismatch Rate'].copy()
 63 |     x[x > end1_limit] = end1_limit
 64 |     y[y > end2_limit] = end2_limit
 65 | 
 66 |     sorted_ix = sort_samples(metrics_df.index, cohort_s=cohort_s, cohort_order=cohort_order)
 67 |     cohorts = cohort_s.loc[sorted_ix].unique()
 68 |     if cohort_colors is None:
 69 |         cohort_colors = get_cohort_colors(cohorts)
 70 | 
 71 |     for t in cohorts:
 72 |         ix = cohort_s[cohort_s == t].index
 73 |         ax.scatter(x[ix], y[ix], s=ms, edgecolor='none', label=t,
 74 |             c=[cohort_colors[t]], alpha=alpha, clip_on=False, rasterized=True)
 75 | 
 76 |     if end1_threshold is not None:
 77 |         ax.plot(2*[end1_threshold], [0,0.2], '--',  color=[0.6]*3, zorder=0, lw=1, alpha=0.8)
 78 |     if end2_threshold is not None:
 79 |         ax.plot([0,0.02], 2*[end2_threshold], '--', color=[0.6]*3, zorder=0, lw=1, alpha=0.8)
 80 |     if end1_threshold is not None or end2_threshold is not None:
 81 |         ix = (x > end1_threshold) | (y > end2_threshold)
 82 |         if any(ix):
 83 |             ax.scatter(x[ix], y[ix], c='none', edgecolor='k', s=ms, lw=1, label=None, clip_on=False, rasterized=True)
 84 | 
 85 |     qtl.plot.format_plot(ax, fontsize=10)
 86 |     ax.set_xlim([0, end1_limit])
 87 |     ax.set_ylim([0, end2_limit])
 88 | 
 89 |     ax.spines['left'].set_position(('outward', 6))
 90 |     ax.spines['bottom'].set_position(('outward', 6))
 91 |     ax.plot([0, end1_limit], [0, end1_limit], '--', c=[0.6]*3, lw=1, zorder=0)
 92 | 
 93 |     ax.set_xlabel('End 1 mismatch rate', fontsize=12)
 94 |     ax.set_ylabel('End 2 mismatch rate', fontsize=12)
 95 | 
 96 | 
 97 | def metrics(metric_s, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None,
 98 |             threshold=None, threshold_dir=None, outlier_method='threshold', plot_density=True, show_legend=False,
 99 |             ms=12, alpha=1, ylim=None, ylabel=None,
100 |             show_xticklabels=False, highlight_ids=None,
101 |             dl=0.85, aw=6, ds=0.2, daw=0.5, dr=0.25,
102 |             db=0.75, ah=2, dt=0.25, fontsize=10, rasterized=True):
103 |     """Plot a single QC metric sorted by cohort and/or date"""
104 | 
105 |     if ylabel is None:
106 |         ylabel = metric_s.name
107 | 
108 |     if metric_s.median() > 1e5:
109 |         metric_s = metric_s.copy() / 1e6
110 |         if threshold is not None:
111 |             threshold = threshold / 1e6
112 |         ylabel += ' (millions)'
113 | 
114 |     if cohort_s is not None:
115 |         assert metric_s.index.isin(cohort_s.index).all()
116 |         cohort_s = cohort_s.loc[metric_s.index]
117 |     else:
118 |         cohort_s = pd.Series('NA', index=metric_s.index)
119 | 
120 |     if show_xticklabels:
121 |         db += 0.75
122 | 
123 |     if plot_density:
124 |         fw = dl + aw + ds + daw + dr
125 |     else:
126 |         fw = dl + aw + dr
127 |     fh = db + ah + dt
128 |     fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
129 |     ax = fig.add_axes([dl/fw, db/fh, aw/fw, ah/fh])
130 |     if plot_density:
131 |         dax = fig.add_axes([(dl+aw+ds)/fw, db/fh, daw/fw, ah/fh], sharey=ax)
132 | 
133 |     if date_s is not None:
134 |         xlabel = 'Samples, ordered by date'
135 |     else:
136 |         xlabel = 'Samples'
137 | 
138 |     sorted_ix = sort_samples(metric_s.index, cohort_s=cohort_s, cohort_order=cohort_order, date_s=date_s)
139 |     cohorts = cohort_s.loc[sorted_ix].unique()
140 |     if cohort_colors is None:
141 |         cohort_colors = get_cohort_colors(cohorts)
142 | 
143 |     ns = len(metric_s)
144 |     xpos = pd.Series(np.arange(1,ns+1), index=sorted_ix)
145 | 
146 |     # plot
147 |     for t in cohorts:
148 |         ix = cohort_s[cohort_s==t].index
149 |         ax.scatter(xpos[ix], metric_s[ix], s=ms, edgecolor='none', label=t,
150 |                    c=[cohort_colors[t]], alpha=alpha, clip_on=False, rasterized=rasterized)
151 | 
152 |     if highlight_ids is not None:
153 |         ax.scatter(xpos[highlight_ids], metric_s[highlight_ids], marker='s',
154 |                    edgecolor='k', facecolor='none', clip_on=False, rasterized=rasterized)
155 | 
156 |     if threshold is not None:
157 |         ax.plot([-0.02*ns, 1.02*ns], 2*[threshold], '--', color=[0.6,0.6,0.6], lw=1, alpha=0.8)
158 | 
159 |     if outlier_method.lower() == 'iqr':
160 |         p = np.percentile(metric_s, [25, 75])
161 |         if threshold_dir == 'gt':
162 |             ix = metric_s[metric_s > p[1] + 1.5*(p[1]-p[0])].index
163 |         elif threshold_dir == 'lt':
164 |             ix = metric_s[metric_s < p[0] - 1.5*(p[1]-p[0])].index
165 |         if any(ix):
166 |             ax.scatter(xpos[ix], metric_s[ix], c='none', edgecolor='k', s=ms, lw=1, label=None, clip_on=False, rasterized=rasterized)
167 |     elif outlier_method.lower() == 'threshold' and threshold is not None:
168 |         if threshold_dir == 'gt':
169 |             ix = metric_s[metric_s > threshold].index
170 |         elif threshold_dir == 'lt':
171 |             ix = metric_s[metric_s < threshold].index
172 |         if any(ix):
173 |             ax.scatter(xpos[ix], metric_s[ix], c='none', edgecolor='k', s=ms, lw=1, label=None, clip_on=False, rasterized=rasterized)
174 | 
175 |     # plot density
176 |     if plot_density:
177 |         sns.kdeplot(y=metric_s, ax=dax, legend=False, fill=True, lw=1.5)
178 |         dax.set_ylabel(None)
179 |         qtl.plot.format_plot(dax, fontsize=fontsize, hide=['top', 'right', 'bottom'])
180 |         plt.setp(dax.get_yticklabels(), visible=False)
181 |         dax.set_xticks([])
182 |         dax.set_xlabel('Freq.', ha='left', x=0, fontsize=fontsize, labelpad=7)
183 | 
184 |     qtl.plot.format_plot(ax, fontsize=fontsize)
185 |     ax.spines['left'].set_position(('outward', 8))
186 | 
187 |     ax.set_xlim([1, ns])
188 |     if ylim is None:
189 |         ax.set_ylim([0, ax.get_ylim()[1]])
190 |     else:
191 |         ax.set_ylim(ylim)
192 | 
193 |     if show_xticklabels:
194 |         ax.set_xticks(xpos)
195 |         ax.set_xticklabels(sorted_ix, rotation=45, ha='right', va='top')
196 |     else:
197 |         ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
198 | 
199 |     ax.set_ylabel(ylabel, fontsize=fontsize+2)
200 |     ax.set_xlabel(xlabel, fontsize=fontsize+2)
201 | 
202 |     if show_legend:
203 |         ax.legend(fontsize=9, handlelength=1, labelspacing=0.5, title=cohort_s.name)
204 | 
205 |     if plot_density:
206 |         return ax, dax
207 |     else:
208 |         return ax
209 | 
210 | 
def detection_bias(metrics_df, bias_metric="Median 3' bias", c='Duplicate Rate of Mapped',
                   ah=2, aw=2, ct=0, rasterized=False):
    """Plot genes detected vs a bias metric (e.g., Median Exon CV).

    Parameters
    ----------
    metrics_df : pd.DataFrame
        Samples x metrics; must contain 'Genes Detected', `bias_metric` and `c`.
    bias_metric : str
        Column plotted on the y-axis.
    c : str
        Column used to color the points (color scale fixed to [0, 1]).
        Points are drawn in increasing order of this column.
    ah, aw : axes height and width.
    ct : forwarded to `qtl.plot.setup_figure`.
    rasterized : bool
        Rasterize the scatter points.

    Returns
    -------
    ax, cax : main axes and colorbar axes.
    """

    # NOTE(review): width is passed first, matching the other setup_figure
    # calls in this module (e.g. setup_figure(4, 2.5)) -- confirm against qtl.plot
    ax, cax = qtl.plot.setup_figure(aw, ah, xspace=[0.75, 0.75],
                                    colorbar=True, ds=0.05, cw=0.1, ct=ct)

    ix = metrics_df[c].sort_values().index
    h = ax.scatter(metrics_df.loc[ix, 'Genes Detected'], metrics_df.loc[ix, bias_metric],
                   c=metrics_df.loc[ix, c], cmap=plt.cm.GnBu,
                   clip_on=False, s=36, edgecolor='k', lw=0.5,
                   vmin=0, vmax=1, rasterized=rasterized)

    ax.set_xlabel('Genes detected', fontsize=12)
    ax.set_ylabel(bias_metric, fontsize=12)
    qtl.plot.format_plot(ax, fontsize=10)
    ax.autoscale(True)
    ax.spines['left'].set_position(('outward', 6))
    ax.spines['bottom'].set_position(('outward', 6))
    hc = plt.colorbar(h, cax=cax)
    # keep the short label for the default metric; otherwise label with the column actually shown
    cbar_label = 'Duplicate Rate' if c == 'Duplicate Rate of Mapped' else c
    hc.set_label(cbar_label, fontsize=12, labelpad=6)
    return ax, cax
233 | 
234 | 
def mapping_sense(metrics_df, cohort_s=None, cohort_order=None, cohort_colors=None, date_s=None, width=0.8,
                  dl=0.75, aw=4, dr=1.5, db=0.5, ah=2, dt=0.25, ds=0.066, dc=0.1):
    """Stacked bar chart of sense/antisense alignment proportions per sample.

    The four columns 'End 1 Sense', 'End 1 Antisense', 'End 2 Sense' and
    'End 2 Antisense' are normalized to sum to 1 for each sample.

    For stranded protocols, most reads should be 'End 1 Antisense' and 'End 2 Sense',
    or vice versa, depending on protocol.
    For unstranded protocols, the 4 categories are expected to be of equal proportion (~0.25).

    When `cohort_s` is given, a strip of cohort colors is drawn above the bars.
    """
    strand_columns = ['End 1 Sense', 'End 1 Antisense', 'End 2 Sense', 'End 2 Antisense']
    bar_colors = hsv_to_rgb([
        [0.1, 0.6, 1],
        [0.4, 0.7, 0.75],
        [0.25, 0.4, 0.85],
        [0.15, 0.55, 1],
    ])

    sample_order = sort_samples(metrics_df.index, cohort_s=cohort_s,
                                cohort_order=cohort_order, date_s=date_s)
    df = metrics_df.loc[sample_order, strand_columns]
    row_totals = np.sum(df.values, axis=1, keepdims=True)
    df = df / row_totals  # per-sample proportions

    # figure dimensions are assembled from margins + axes size
    fig_w = dl + aw + dr
    fig_h = db + ah + dt
    fig = plt.figure(facecolor=(1,1,1), figsize=(fig_w,fig_h))
    ax = fig.add_axes([dl/fig_w, db/fig_h, aw/fig_w, ah/fig_h])

    # bars are positioned 0..n-1, hence the positional (reset) index
    df.reset_index(drop=True).plot(kind='bar', width=width, stacked=True,
                                   xticks=[], ax=ax, color=bar_colors)
    ax.set_ylim([0,1])
    ax.legend(loc='upper left', handlelength=0.66, bbox_to_anchor=(1,1))
    ax.set_ylabel('Proportion of mapped reads', fontsize=12)
    ax.set_xlabel('Samples', fontsize=12)
    ax.set_xlim([-width/2, metrics_df.shape[0]-width/2])

    if cohort_s is None:
        return

    # cohort color strip above the bar chart
    cax = fig.add_axes([dl/fig_w, (db+ah+ds)/fig_h, aw/fig_w, dc/fig_h], sharex=ax)
    cax.set_yticks([])
    _plot_cohort_labels(cax, cohort_s.loc[df.index], cohort_colors=cohort_colors,
                        lax=ax, legend=False, orientation='horizontal')
272 | 
273 | 
def calculate_expression_cdfs(tpm_df):
    """Compute, per sample, the cumulative sum of descending-sorted values / 1e6.

    The row index is discarded (rows become ranks 0..n-1) and a
    'Description' column, if present, is removed. The input is not modified.
    """
    cdf_df = tpm_df.reset_index(drop=True).copy()
    if 'Description' in cdf_df:
        cdf_df = cdf_df.drop(columns='Description')
    for sample_id in cdf_df.columns:
        # each sample is ranked independently
        descending = cdf_df[sample_id].sort_values(ascending=False).to_numpy()
        cdf_df[sample_id] = descending.cumsum() / 1e6
    return cdf_df
282 | 
283 | 
def cumulative_expression(cdf_df, cohort_s=None, cohort_colors=None, ax=None, cmap=plt.cm.Spectral_r, c=[0.6,0.6,0.6],
                          reference_df=None, reference_name=None, alpha=0.5, mode='lines', lw=1, legend=False, rasterized=False):
    """
    Plot cumulative gene expression for each sample.
    This enables identification of samples with dominant expression of few genes.

    With mode='ci', median and confidence intervals are shown instead of individual samples.

    Parameters
    ----------
    cdf_df : pd.DataFrame
        Gene rank x samples cumulative expression (e.g., the output of
        `calculate_expression_cdfs`).
    cohort_s : pd.Series, optional
        Cohort label per sample (indexed by the columns of `cdf_df`).
    cohort_colors : mapping, optional
        Cohort label -> color. By default a single cohort uses `c`, and
        multiple cohorts are assigned colors sampled from `cmap`.
    ax : matplotlib axes, optional
        Axes to draw into; a new figure is created if None.
    reference_df, reference_name : optional
        Reference CDFs (rank x samples), drawn as median +/- 1.96 SD, and
        the name used in its legend label.
    alpha, lw, rasterized : line styling.
    mode : {'lines', 'ci'}
    legend : bool
        Show a legend (skipped when no cohorts were provided).

    Returns
    -------
    ax
    """
    if cohort_s is None:
        cohort_s = pd.Series('_NA', index=cdf_df.columns)
    cohorts = cohort_s.unique()

    if cohort_colors is None:
        nc = len(cohorts)
        if nc == 1:
            cohort_colors = {cohorts[0]: c}
        else:
            # plt.get_cmap(name, lut) replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
            cohort_colors = dict(zip(cohorts, plt.get_cmap(cmap.name, nc)(np.arange(nc))))

    if ax is None:
        ax = qtl.plot.setup_figure(4, 2.5)
    ax.set_xscale('log')

    if reference_df is not None:  # plot reference distribution
        ref_x = np.arange(1, reference_df.shape[0]+1)
        s = reference_df.std(axis=1)
        mu = reference_df.median(axis=1)
        if mode == 'ci':
            ax.fill_between(ref_x, mu-1.96*s, mu+1.96*s, facecolor='k', edgecolor='k', alpha=0.2, label=None, zorder=20)
            # plot against the same 1-based ranks as the band
            ax.plot(ref_x, mu, 'k', lw=2, alpha=0.8, rasterized=rasterized, label=reference_name, zorder=30)
        else:
            ax.fill_between(ref_x, mu-1.96*s, mu+1.96*s, facecolor='k', edgecolor='none', alpha=0.2, label=f'{reference_name} 95% CI', zorder=20)
            # the center line is the median, so label it as such
            ax.plot(ref_x, mu, 'k', lw=1.5, alpha=0.8, rasterized=rasterized, label=f'{reference_name} median', zorder=30)

    x = np.arange(1, cdf_df.shape[0]+1)  # 1-based gene ranks (required for the log axis)
    for cohort in cohorts:
        ix = cohort_s[cohort_s == cohort].index
        color = cohort_colors[cohort]
        if mode == 'ci':  # plot confidence intervals
            mu = cdf_df[ix].median(axis=1)
            s = cdf_df[ix].std(axis=1)  # replace with MAD?
            ax.fill_between(x, mu-1.96*s, mu+1.96*s, facecolor=color, edgecolor=color, alpha=0.2, label=None, zorder=20, lw=1)
            ax.plot(x, mu, '-', color=color, lw=2, alpha=0.8, rasterized=rasterized, label=cohort, zorder=30)
        else:
            # only the first sample of each cohort carries the legend label
            ax.plot(x, cdf_df[ix[0]], color=color, alpha=alpha, lw=lw, rasterized=rasterized, label=cohort)
            if len(ix) > 1:
                ax.plot(x, cdf_df[ix[1:]], color=color, alpha=alpha, lw=lw, rasterized=rasterized)

    ax.set_ylim([0,1])
    ax.set_xlim([1,10000])
    qtl.plot.format_plot(ax, fontsize=10)
    ax.set_xlabel('Number of genes', fontsize=12)
    ax.set_ylabel('Cumulative transcriptional output', fontsize=12)
    ax.spines['left'].set_position(('outward', 6))

    if legend and not (cohort_s == '_NA').all():
        leg = ax.legend(loc=4, handlelength=1, fontsize=10)
        handles = getattr(leg, 'legend_handles', None)
        if handles is None:  # Matplotlib < 3.7
            handles = leg.legendHandles
        for lh in handles:
            lh.set_alpha(1)

    return ax
346 | 
347 | 
def _plot_cohort_labels(ax, cohort_s, cohort_colors=None, lax=None, legend=True, orientation='vertical'):
    """Internal function for adding a cohort color legend to a figure (in a separate axis)

    Parameters
    ----------
    ax : axes in which the cohort color strip is drawn.
    cohort_s : pd.Series
        Cohort label per sample, in display order.
    cohort_colors : mapping, optional
        Cohort label -> color; must cover every label in `cohort_s`.
        Defaults to colors sampled from the 'Spectral' colormap.
    lax : axes, optional
        Axes that receives the legend handles (defaults to `ax`).
    legend : bool
        Draw the legend on `lax`.
    orientation : {'vertical', 'horizontal'}
        Draw the strip as a column or as a row.
    """

    # sorted unique labels -> consecutive integer codes
    cohort_index_dict = {i:k for k,i in enumerate(np.unique(cohort_s))}
    if cohort_colors is None:
        n = len(cohort_index_dict)
        # plt.get_cmap(name, lut) replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9
        cmap = ListedColormap(plt.get_cmap('Spectral', n)(np.arange(n)), 'indexed')
    else:
        # dict order follows the integer codes, so colors line up with the codes
        cmap = ListedColormap([cohort_colors[label] for label in cohort_index_dict])

    codes = cohort_s.map(cohort_index_dict).values
    if orientation == 'vertical':
        ax.imshow(codes.reshape(-1,1), aspect='auto', cmap=cmap)
    else:
        ax.imshow(codes.reshape(1,-1), aspect='auto', cmap=cmap)

    if lax is None:
        lax = ax
    # invisible (NaN) points provide one legend handle per cohort
    for k,i in cohort_index_dict.items():
        lax.scatter(np.nan, np.nan, marker='s', c=[cmap(i)], label=f'{k}')
    if legend:
        lax.legend(loc='upper left', borderaxespad=None, bbox_to_anchor=(1,1), handlelength=1, title='Cohort')
369 | 
370 | 
def insert_sizes(insertsize_df, cohort_s=None, cohort_colors=None,
                 cohort_order=None, sort_order='mean', max_size=1000,
                 legend=False, dl=0.75, aw=3, dr=0.5, db=0.5, ah=2, dt=0.25):
    """Plot heat map of insert size distributions

    Parameters
    ----------
    insertsize_df : pd.DataFrame
        Insert size (index) x samples (columns) table of counts.
    cohort_s : pd.Series, optional
        Cohort label per sample; adds a cohort color strip next to the heat map.
    cohort_colors : mapping, optional
        Cohort label -> color.
    cohort_order : sequence, optional
        Cohort ordering for sort_order='cohort' (default: by decreasing cohort size).
    sort_order : {'mean', 'cohort'}
        'cohort' groups samples by cohort, preserving the mean-based order
        within each cohort.
    max_size : int
        Distributions are expanded/truncated to 1..max_size bp.
    legend : bool
        Show the cohort legend.
    dl, aw, dr, db, ah, dt : figure layout dimensions.

    Returns
    -------
    ax : heat map axes.
    """

    # expand to 'max_size' bp
    df = insertsize_df.reindex(np.arange(1,max_size+1)).fillna(0).astype(np.int32).T

    # samples with < 100000 reads come first (by read count); the remainder
    # are sorted by mean insert size, i.e. sum(size * count) / sum(count)
    n = df.sum(1).sort_values()
    low_ix = n[n<100000].index
    high_ix = n[n>=100000].index
    mu = df.loc[high_ix].mul(df.columns.values, axis=1).sum(1) / n[high_ix]
    si = low_ix.tolist() + mu.sort_values().index.tolist()

    if cohort_s is not None and sort_order == 'cohort':  # sort within each cohort
        if cohort_order is None:
            cohort_order = cohort_s.value_counts().index
        sort_s = pd.Series(cohort_s[si], index=si)
        si = []
        for c in cohort_order:
            si.extend(sort_s[sort_s==c].index)

    # set up figure
    if cohort_s is not None:
        cw = 0.1
        ds = 0.05
    else:
        cw = 0
        ds = 0
    fw = dl + cw + ds + aw + dr
    fh = db + ah + dt
    fig = plt.figure(facecolor=(1,1,1), figsize=(fw,fh))
    ax = fig.add_axes([(dl+cw+ds)/fw, db/fh, aw/fw, ah/fh])

    # add cohort information and legend
    if cohort_s is not None:
        # set up axes
        cax = fig.add_axes([dl/fw, db/fh, cw/fw, ah/fh], sharey=ax)
        plt.setp(ax.get_yticklabels(), visible=False)
        for line in ax.yaxis.get_ticklines():
            line.set_markersize(0)
            line.set_markeredgewidth(0)
        cax.set_xticks([])
        cax.set_ylabel('Sample', fontsize=12)

        # plot labels
        _plot_cohort_labels(cax, cohort_s[si], cohort_colors=cohort_colors, lax=ax, legend=legend)

    ax.imshow(df.loc[si], interpolation='none', aspect='auto', norm=LogNorm())
    ax.set_xlabel('Insert size (bp)', fontsize=12)
    ax.set_xlim([1, max_size])
    return ax
422 | 
423 | 
def gc_content(gc_content_df, cohort_s=None, cohort_colors=None,
               cohort_order=None, sort_order='mean', legend=False,
               dl=0.75, aw=3, dr=0.5, db=0.5, ah=2, dt=0.25):
    """Draw a heat map of per-sample GC content distributions.

    Columns of `gc_content_df` are samples and rows are GC bins. Samples are
    ordered by the bin-weighted column sum; with sort_order='cohort' they
    are additionally grouped by cohort (keeping that order within cohorts).
    When `cohort_s` is given, a cohort color strip is drawn above the heat map.

    Returns the heat map axes.
    """
    has_cohorts = cohort_s is not None

    # order samples by their weighted sum over the GC bins
    gc_bins = gc_content_df.index.values
    weighted_s = (gc_content_df * gc_bins.reshape(-1,1)).sum()
    sample_order = weighted_s.sort_values().index

    if has_cohorts and sort_order == 'cohort':  # group by cohort, keeping the order within each
        if cohort_order is None:
            cohort_order = cohort_s.value_counts().index
        ordered_cohort_s = pd.Series(cohort_s[sample_order], index=sample_order)
        sample_order = [
            sample_id
            for cohort in cohort_order
            for sample_id in ordered_cohort_s[ordered_cohort_s==cohort].index
        ]

    # figure layout: reserve room for the cohort strip only when needed
    ch, ds = (0.1, 0.05) if has_cohorts else (0, 0)
    fig_w = dl + aw + dr
    fig_h = db + ah + ch + ds + dt
    fig = plt.figure(facecolor=(1,1,1), figsize=(fig_w,fig_h))
    ax = fig.add_axes([dl/fig_w, db/fig_h, aw/fig_w, ah/fig_h])

    if has_cohorts:
        # cohort strip shares the sample axis with the heat map
        cax = fig.add_axes([dl/fig_w, (db+ah+ds)/fig_h, aw/fig_w, ch/fig_h], sharex=ax)
        plt.setp(cax.get_xticklabels(), visible=False)
        for tick_line in cax.xaxis.get_ticklines():
            tick_line.set_markersize(0)
            tick_line.set_markeredgewidth(0)
        cax.set_yticks([])

        _plot_cohort_labels(cax, cohort_s[sample_order], orientation='horizontal',
                            cohort_colors=cohort_colors, lax=ax, legend=legend)

    ax.imshow(gc_content_df[sample_order], origin='lower', interpolation='none',
              aspect='auto', norm=LogNorm())
    ax.set_xlabel('Samples', fontsize=12)
    ax.set_ylabel('Fragment GC Content', fontsize=12)
    tick_positions = np.arange(0, 120, 20)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_positions/100)
    return ax
475 | 
476 | 
def xy_expression(tpm_df, sex_s=None, flag_klinefelter=True, highlight_ids=None,
                  x_threshold=5, y_threshold=30, s=24, verbose=True, rasterized=False, **kwargs):
    """Expression of sex-specific genes (XIST and RPS4Y1) to identify sample swaps.

    tpm_df: genes x samples expression table; the index must contain IDs
            starting with 'ENSG00000229807' (XIST) and 'ENSG00000129824'
            (RPS4Y1), otherwise an IndexError is raised.
    sex_s: pd.Series annotating the sex of each sample, as Male/Female.
           Missing (null) annotations are inferred from the thresholds.
           Must cover all columns of tpm_df.
    flag_klinefelter: if True, annotated samples with both genes above
           their thresholds are reported as 'Possible Klinefelter (XXY)'
           instead of as a swap.
    highlight_ids: sample IDs to mark with an additional highlighted point.
    x_threshold, y_threshold: TPM cutoffs for XIST and RPS4Y1, respectively.
    s: marker size.
    verbose: print the IDs of flagged samples.
    rasterized: rasterize the scatter points drawn when sex_s is provided.
    **kwargs: overrides for the scatter arguments.

    Returns ax if sex_s is None, otherwise (ax, res_s), where res_s is a
    pd.Series ('inferred_sex') with the call for each sample.
    """

    # look up the full gene IDs (prefix match on the index)
    x_id = tpm_df.index[tpm_df.index.str.startswith('ENSG00000229807')][0]  # XIST
    y_id = tpm_df.index[tpm_df.index.str.startswith('ENSG00000129824')][0]  # RPS4Y1
    x_s = tpm_df.loc[x_id].rename('XIST')
    y_s = tpm_df.loc[y_id].rename('RPS4Y1')

    ax = qtl.plot.setup_figure(3, 3, xspace=[0.75, 1.75])
    ax.set_xscale('symlog')
    ax.set_yscale('symlog')

    if sex_s is not None:  # flag potential swaps based on thresholds
        assert tpm_df.columns.isin(sex_s.index).all()
        # result defaults to 'NA' for samples not assigned by any rule below
        res_s = pd.Series('NA', index=sex_s.index[sex_s.index.isin(tpm_df.columns)], name='inferred_sex')

        # args: style for concordant/inferred samples; args2: outlined style for flagged samples
        args =  {'ec':'none', 'lw':0, 'rasterized':rasterized, 'clip_on':False, 's':s, 'alpha':0.33}
        args.update(kwargs)
        args2 = {**args, 'ec':'k', 'lw':1, 's':s+6, 'alpha':1}

        # infer missing labels based on thresholds
        ix = sex_s[sex_s.isnull() & (x_s <= x_threshold) & (y_s > y_threshold)].index
        if len(ix) > 0:
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,0.8,0.7]).reshape(1,-1), **args, label=f"Male* ({len(ix)})")
            res_s[ix] = 'Male'
        ix = sex_s[sex_s.isnull() & (x_s > x_threshold) & (y_s <= y_threshold)].index
        if len(ix) > 0:
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,0.8,0.7]).reshape(1,-1), **args, label=f"Female* ({len(ix)})")
            res_s[ix] = 'Female'
        ix = sex_s[sex_s.isnull() & (x_s > x_threshold) & (y_s > y_threshold)].index
        if len(ix) > 0:
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.75,0.8,0.7]).reshape(1,-1), **args, label=f"XXY* ({len(ix)})")
            res_s[ix] = 'Klinefelter (XXY)'
        # neither gene above threshold: sex cannot be inferred
        ix = sex_s[sex_s.isnull() & (x_s <= x_threshold) & (y_s <= y_threshold)].index
        if len(ix) > 0:
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.0,0,0.7]).reshape(1,-1), **args, label=f"? ({len(ix)})")
            res_s[ix] = np.nan

        # matching samples
        ix = sex_s[(sex_s == 'Male') & (x_s <= x_threshold)].index
        if len(ix) > 0:
            res_s[ix] = 'Male'
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,0.8,0.7]).reshape(1,-1), label=f"Male ({len(ix)})", **args)
        ix = sex_s[(sex_s == 'Female') & (y_s <= y_threshold)].index
        if len(ix) > 0:
            res_s[ix] = 'Female'
            ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,0.8,0.7]).reshape(1,-1), label=f"Female ({len(ix)})", **args)

        # mismatches
        if flag_klinefelter:
            # swaps: only the gene of the opposite sex is above its threshold
            ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold) & (y_s <= y_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,1,0.9]).reshape(1,-1), label=f'M > F swap ({len(ix)})', **args2)
                if verbose:
                    print(f'F mislabeled as M:\n{ix.tolist()}')
                res_s[ix] = 'Female'
            ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold) & (x_s <= x_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=[[0.9, 0, 0, 1]], label=f'F > M swap ({len(ix)})', **args2)
                if verbose:
                    print(f'M mislabeled as F:\n{ix.tolist()}')
                res_s[ix] = 'Male'

            # Klinefelter
            # both genes above threshold, for either annotated sex
            ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold) & (y_s > y_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.75,1,0.9]).reshape(1,-1), label=f'XXY? ({len(ix)})', **args2)
                if verbose:
                    print(f'Possible Klinefelter (XXY): {ix.tolist()}')
                res_s[ix] = 'Possible Klinefelter (XXY)'
            ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold) & (x_s > x_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,1,0.9]).reshape(1,-1), label=f'XXY? ({len(ix)})', **args2)
                if verbose:
                    print(f'Possible Klinefelter (XXY): {ix.tolist()}')
                res_s[ix] = 'Possible Klinefelter (XXY)'

        else:
            # without the Klinefelter category, any annotated sample whose
            # opposite-sex gene exceeds its threshold is reported as a swap
            ix = sex_s[(sex_s == 'Male') & (x_s > x_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0.6,1,0.9]).reshape(1,-1), label=f'M > F swap ({len(ix)})', **args2)
                if verbose:
                    print(f'F mislabeled as M:\n{ix.tolist()}')
                res_s[ix] = 'Female'
            ix = sex_s[(sex_s == 'Female') & (y_s > y_threshold)].index
            if len(ix) > 0:
                ax.scatter(x_s[ix], y_s[ix], c=hsv_to_rgb([0,1,0.9]).reshape(1,-1), label=f'F > M swap ({len(ix)})', **args2)
                if verbose:
                    print(f'M mislabeled as F:\n{ix.tolist()}')
                res_s[ix] = 'Male'
    else:
        # NOTE(review): rasterized is hard-coded to True here and ignores the
        # `rasterized` parameter (and **kwargs) -- confirm whether intentional
        ax.scatter(x_s, y_s, s=s, alpha=0.5, edgecolors='none', lw=0.5, rasterized=True, clip_on=False)

    if highlight_ids is not None:  # highlight selected samples
        ax.scatter(x_s[highlight_ids], y_s[highlight_ids], c=[hsv_to_rgb([0.075,1,1])], s=s+12, alpha=1, edgecolors='k', lw=1, zorder=50, rasterized=False, clip_on=False, label=None)

    qtl.plot.format_plot(ax, fontsize=12)
    ax.spines['left'].set_position(('outward', 6))
    ax.spines['bottom'].set_position(('outward', 6))
    ax.set_xlabel('XIST expression (TPM)', fontsize=14)
    ax.set_ylabel('RPS4Y1 expression (TPM)', fontsize=14)

    # anchor both axes at 0 and draw the threshold lines across the full range
    xlim = list(ax.get_xlim())
    ylim = list(ax.get_ylim())
    xlim[0] = 0
    ylim[0] = 0
    ax.plot(2*[x_threshold], ylim, '--', c=[0.75]*3)
    ax.plot(xlim, 2*[y_threshold], '--', c=[0.75]*3)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    if sex_s is not None:
        leg = ax.legend(loc='upper left', fontsize=12, handlelength=0.5, labelspacing=0.2, bbox_to_anchor=(1,1))
        # legend markers are drawn opaque regardless of the point alpha
        for lh in leg.legend_handles:
            lh.set_alpha(1)
        return ax, res_s
    else:
        return ax
599 | 


--------------------------------------------------------------------------------