├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.py └── src ├── scripts └── SEQC └── seqc ├── __init__.py ├── alignment ├── __init__.py ├── sam.py └── star.py ├── barcode_correction.py ├── core ├── __init__.py ├── download.py ├── index.py ├── instances.py ├── main.py ├── notebook.py ├── parser.py ├── progress.py ├── run.py ├── start.py ├── terminate.py └── verify.py ├── distance.py ├── ec2.py ├── email_.py ├── exceptions.py ├── filter.py ├── gene_info.py ├── h5.py ├── io.py ├── log.py ├── multialignment.py ├── notebooks ├── __init__.py ├── analysis_template.json ├── notebooks.py └── test_notebooks.py ├── platforms.py ├── plot.py ├── read_array.py ├── reader.py ├── rmt_correction.py ├── run_mast.R ├── sequence ├── __init__.py ├── barcodes.py ├── encodings.py ├── fastq.py ├── gtf.py └── index.py ├── sparse_frame.py ├── stats ├── __init__.py ├── anova.py ├── correlation.py ├── experimental_yield.py ├── g_test.py ├── graph_diffusion.py ├── gsea.py ├── mast.py ├── pca.py ├── resampled_nonparametric.py ├── smoothing.py ├── tree.py ├── tsne.py └── ttest.py ├── summary ├── __init__.py ├── css │ ├── bootstrap.css │ ├── bootstrap.min.css │ └── simple-sidebar.css ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 ├── html_ │ └── __init__.py ├── img │ └── __init__.py ├── js │ ├── bootstrap.js │ ├── bootstrap.min.js │ └── jquery.js ├── static │ └── __init__.py ├── summary.py ├── templates │ ├── mini_summary_base.html │ ├── section_base.html │ └── section_content.html └── test.py ├── test.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg* 2 | *.idea* 3 | *__pycache__* 4 | .idea* 5 | testfiles* 6 | *.DS_Store* 7 | *seqc.log 8 | build/* 9 | dist/* 10 | .project 11 | .pydevproject -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/seqc/summary/*/*.css 2 | include src/seqc/summary/fonts/* 3 | include src/seqc/summary/*/*.py 4 | include src/seqc/summary/*/*.js 5 | include src/seqc/summary/*/*.html 6 | include src/seqc/notebooks/*.json 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SEquence Quality Control (SEQC -- /sek-si:/) 2 | 3 | **NOTE:** This repository is no longer actively maintained. If you want to get the latest update or have any inquiries, please refer to https://github.com/dpeerlab/seqc instead. 4 | 5 | ---- 6 | 7 | ## Overview: 8 | 9 | SEQC is a Python package that processes single-cell sequencing data in the cloud and analyzes it interactively on your local machine. 10 | 11 | To facilitate easy installation and use, we have made available Amazon Machine Images (AMIs) that come with all of SEQC's dependencies pre-installed. In addition, we have uploaded common genome indices (-i/--index parameter) and barcode data (--barcode-files) to public Amazon S3 repositories. These links can be provided to SEQC, and it will automatically fetch them prior to initiating an analysis run. Finally, it can fetch input data directly from BaseSpace or Amazon S3 for analysis.
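For example, a remote run that pulls the index and barcode files from S3 might look like the following (the s3 paths, key path, and email address below are placeholders; see `SEQC run -h` for the full list of options):

    SEQC run in_drop_v2 \
        -i s3://your-bucket/genomes/homo_sapiens/ \
        --barcode-files s3://your-bucket/barcodes/in_drop_v2/ \
        -b s3://your-bucket/my_experiment/barcode_fastq/ \
        -g s3://your-bucket/my_experiment/genomic_fastq/ \
        -o my_experiment \
        -u s3://your-bucket/seqc-results/ \
        -e you@example.com \
        -k ~/.ssh/seqc.pem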
12 | 13 | For users with access to in-house compute clusters, SEQC can be installed on those systems and run using the --local parameter. 14 | 15 | ### Dependencies: 16 | 17 | 18 | #### Python3 19 | Python 3 must be installed on your local machine to run SEQC. We recommend installing Python 3 through your Unix operating system's package manager. For macOS users we recommend Homebrew. Typical installation commands would be: 20 | 21 | brew install python3 # mac 22 | apt-get install python3 # debian 23 | yum install python3 # rpm-based 24 | 25 | #### Python3 Libraries 26 | 27 | These Python libraries must be installed before installing SEQC: 28 | 29 | pip3 install Cython 30 | pip3 install numpy 31 | pip3 install bhtsne 32 | 33 | #### STAR, Samtools, and HDF5 34 | To process data locally using SEQC, you must install the STAR aligner, Samtools, and HDF5. If you only intend to use SEQC to trigger remote processing on AWS, these dependencies are optional. We recommend installing Samtools and HDF5 through your package manager, if possible. 35 | 36 | #### Hardware Requirements: 37 | For processing a single lane (~200M reads) against human- and mouse-scale genomes, SEQC requires 30GB of RAM and approximately 200GB of free hard drive space; processing speed scales linearly with additional compute cores. If running on AWS (see below), jobs are automatically scaled up or down according to the size of the input. There are no hardware requirements for the computer used to launch remote instances. 38 | 39 | 40 | #### Amazon Web Services: 41 | SEQC can be run on any Unix-based operating system; however, it also features the ability to automatically spawn Amazon Web Services (AWS) instances to process your data. If you wish to take advantage of AWS, you will need to follow their instructions to: 42 | 43 | 1. Set up an AWS account 44 | 2. Install and configure AWS CLI 45 | 3. Create and upload an RSA key for AWS 46 | 47 | 48 | ### SEQC Installation: 49 | 50 | Once all dependencies have been installed, SEQC can be installed on any machine by typing: 51 | 52 | $> git clone https://github.com/ambrosejcarr/seqc.git 53 | $> cd seqc && python3 setup.py install 54 | 55 | Please note that, to avoid passing the -k/--rsa-key argument each time you execute a SEQC run, you can set the environment variable `AWS_RSA_KEY` to the path of your newly created key. 56 | 57 | ### Testing SEQC: 58 | 59 | All of the unit tests in the `TestSEQC` class in `test.py` have been verified. Currently, only two platforms, `ten_x_v2` and `in_drop_v2`, have been tested. Old unit tests for these two platforms, together with those for other platforms, are stored at `s3://dp-lab-data/seqc-old-unit-test/`. 60 | 61 | ### Running SEQC: 62 | 63 | After SEQC is installed, its help can be displayed: 64 | 65 | SEQC [-h] [-v] {run,progress,terminate,instances,start,index} ... 66 | 67 | Processing Tools for scRNA-seq Experiments 68 | 69 | positional arguments: 70 | {run,progress,terminate,instances,start,index} 71 | run initiate SEQC runs 72 | progress check SEQC run progress 73 | terminate terminate SEQC runs 74 | instances list all running instances 75 | start initialize a seqc-ready instance 76 | index create a SEQC index 77 | 78 | optional arguments: 79 | -h, --help show this help message and exit 80 | -v, --version show program's version number and exit 81 | 82 | In addition to processing sequencing experiments, SEQC provides convenience tools to create indices for use with SEQC and STAR, as well as tools to check the progress of remote runs, list running instances, start instances, and terminate them.
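For example, to check on and clean up remote runs (the instance ID and key path below are placeholders):

    SEQC progress -i i-0123456789abcdef0 -k ~/.ssh/seqc.pem
    SEQC instances -k ~/.ssh/seqc.pem
    SEQC terminate -i i-0123456789abcdef0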
83 | 84 | To seamlessly start an AWS instance with automatic installation of SEQC from your local machine you can run: 85 | 86 | SEQC start 87 | 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from subprocess import call 5 | from setuptools import setup 6 | from warnings import warn 7 | import py_compile 8 | 9 | 10 | # Replace py_compile.compile with a function that calls it with doraise=True 11 | # so stop when there is a syntax error 12 | orig_py_compile = py_compile.compile 13 | 14 | 15 | def doraise_py_compile(file, cfile=None, dfile=None, doraise=False): 16 | orig_py_compile(file, cfile=cfile, dfile=dfile, doraise=True) 17 | 18 | 19 | py_compile.compile = doraise_py_compile 20 | 21 | if sys.version_info.major != 3: 22 | raise RuntimeError('SEQC requires Python 3') 23 | if sys.version_info.minor < 5: 24 | warn('Multiprocessing analysis methods may not function on Python versions < 3.5') 25 | 26 | # install phenograph if pip3 is installed 27 | if shutil.which('pip3'): 28 | call(['pip3', 'install', 'git+https://github.com/jacoblevine/phenograph.git']) 29 | call(['pip3', 'install', 'git+https://github.com/pkathail/magic.git']) 30 | 31 | # get version 32 | with open('src/seqc/version.py') as f: 33 | exec(f.read()) 34 | 35 | setup( 36 | name='seqc', 37 | version=__version__, # read in from the exec of version.py; ignore error 38 | description='Single Cell Sequencing Processing and QC Suite', 39 | author='Ambrose J. Carr', 40 | author_email='mail@ambrosejcarr.com', 41 | package_dir={'': 'src'}, 42 | package_data={'': ['*.r', '*.R']}, 43 | packages=['seqc', 'seqc.sequence', 'seqc.alignment', 'seqc.core', 'seqc.stats', 44 | 'seqc.summary', 'seqc.notebooks'], 45 | install_requires=[ 46 | 'numpy>=1.10.0', 47 | 'bhtsne', 48 | 'wikipedia', 49 | 'awscli', 50 | 'Cython>0.14', 51 | 'numexpr>=2.4', 52 | 'pandas>=0.18.1', 53 | 'paramiko>=2.0.2', 54 | 'regex', 55 | 'requests', 56 | 'nose2', 57 | 'scipy>=0.14.0', 58 | 'boto3', 59 | 'intervaltree', 60 | 'matplotlib', 61 | 'tinydb', 62 | 'tables', 63 | 'fastcluster', 64 | 'statsmodels', 65 | 'ecdsa', 66 | 'dill', 67 | 'jupyter', 68 | 'multiprocessing_on_dill', 69 | 'jinja2', 70 | 'pycrypto', 71 | 'cairocffi>=0.8.0', 72 | 'weasyprint', 73 | 'scikit_learn>=0.17'], 74 | scripts=['src/scripts/SEQC'], 75 | extras_require={ 76 | 'GSEA_XML': ['html5lib', 'lxml', 'BeautifulSoup4'], 77 | }, 78 | include_package_data=True 79 | ) 80 | 81 | # look for star 82 | if not shutil.which('STAR'): 83 | warn('SEQC: STAR is not installed. SEQC will not be able to align files.') 84 | 85 | # get location of setup.py 86 | setup_dir = os.path.dirname(os.path.realpath(__file__)) 87 | seqc_dir = os.path.expanduser('~/.seqc/seqc') 88 | 89 | print('setup_dir: {}'.format(setup_dir)) 90 | print('seqc_dir: {}'.format(seqc_dir)) 91 | 92 | if os.path.isdir(seqc_dir): 93 | shutil.rmtree(seqc_dir) 94 | 95 | 96 | def ignore_test_and_tools(dir_, files): 97 | """Filter files to be moved by shutil.copytree. Ignore any hidden file and the 98 | test and tools directories, which are not needed by the remote instance. 99 | :param dir_: dummy variable, must be present to be passed to shutil.copytree() 100 | :param files: output of os.listdir(), files to be subjected to filtering 101 | :return list: list of files that should be filtered, and not copied. 
102 | """ 103 | return [f for f in files if (f == 'test' or f.startswith('.'))] 104 | 105 | 106 | # install tools and a local copy of seqc. 107 | shutil.copytree(setup_dir, seqc_dir, ignore=ignore_test_and_tools) # copy seqc repository 108 | shutil.make_archive(base_name=seqc_dir, format='gztar', root_dir=seqc_dir) 109 | -------------------------------------------------------------------------------- /src/scripts/SEQC: -------------------------------------------------------------------------------- 1 | #!/usr/local/python3 2 | 3 | import sys 4 | from seqc.core.main import main 5 | 6 | if __name__ == "__main__": 7 | main(sys.argv[1:]) 8 | -------------------------------------------------------------------------------- /src/seqc/__init__.py: -------------------------------------------------------------------------------- 1 | from .h5 import H5 2 | from .version import __version__ 3 | from . import stats 4 | # from . import plot 5 | -------------------------------------------------------------------------------- /src/seqc/alignment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/alignment/__init__.py -------------------------------------------------------------------------------- /src/seqc/alignment/sam.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from subprocess import Popen, PIPE 3 | import shutil 4 | import gzip 5 | 6 | 7 | class SamRecord: 8 | """Simple record object allowing access to Sam record properties""" 9 | 10 | __slots__ = ['_record', '_parsed_name_field'] 11 | 12 | NameField = namedtuple('NameField', ['pool', 'cell', 'rmt', 'poly_t', 'name']) 13 | 14 | def __init__(self, record): 15 | self._record = record 16 | self._parsed_name_field = None 17 | 18 | def __repr__(self): 19 | return ''.format('\t'.join(self._record)) 20 | 21 | def __bytes__(self): 22 | return '\t'.join(self._record) + '\n' 23 | 24 | @property 25 | def qname(self) -> str: 26 | return self._record[0] 27 | 28 | @property 29 | def flag(self) -> int: 30 | return int(self._record[1]) 31 | 32 | @property 33 | def rname(self) -> str: 34 | return self._record[2] 35 | 36 | @property 37 | def pos(self) -> int: 38 | return int(self._record[3]) 39 | 40 | @property 41 | def mapq(self) -> int: 42 | return int(self._record[4]) 43 | 44 | @property 45 | def cigar(self) -> str: 46 | return self._record[5] 47 | 48 | @property 49 | def rnext(self) -> str: 50 | return self._record[6] 51 | 52 | @property 53 | def pnext(self) -> int: 54 | return int(self._record[7]) 55 | 56 | @property 57 | def tlen(self) -> int: 58 | return int(self._record[8]) 59 | 60 | @property 61 | def seq(self) -> str: 62 | return self._record[9] 63 | 64 | @property 65 | def qual(self) -> str: 66 | return self._record[10] 67 | 68 | @property 69 | def optional_fields(self): 70 | flags_ = {} 71 | for f in self._record[11:]: 72 | k, _, v = f.split(':') 73 | flags_[k] = int(v) 74 | return flags_ 75 | 76 | def _parse_name_field(self): 77 | fields, name = self.qname.split(';') 78 | processed_fields = fields.split(':') 79 | processed_fields.append(name) 80 | self._parsed_name_field = self.NameField(*processed_fields) 81 | 82 | @property 83 | def pool(self) -> str: 84 | try: 85 | return self._parsed_name_field.pool 86 | except AttributeError: 87 | self._parse_name_field() 88 | return self._parsed_name_field.pool 89 | 90 | @property 91 | def 
rmt(self) -> str: 92 | try: 93 | return self._parsed_name_field.rmt 94 | except AttributeError: 95 | self._parse_name_field() 96 | return self._parsed_name_field.rmt 97 | 98 | @property 99 | def cell(self) -> str: 100 | try: 101 | return self._parsed_name_field.cell 102 | except AttributeError: 103 | self._parse_name_field() 104 | return self._parsed_name_field.cell 105 | 106 | @property 107 | def poly_t(self) -> str: 108 | try: 109 | return self._parsed_name_field.poly_t 110 | except AttributeError: 111 | self._parse_name_field() 112 | return self._parsed_name_field.poly_t 113 | 114 | @property 115 | def name(self): 116 | try: 117 | return self._parsed_name_field.name 118 | except AttributeError: 119 | self._parse_name_field() 120 | return self._parsed_name_field.name 121 | 122 | @property 123 | def is_mapped(self): 124 | return False if (int(self.flag) & 4) else True 125 | 126 | @property 127 | def is_unmapped(self): 128 | return not self.is_mapped 129 | 130 | @property 131 | def is_multimapped(self): 132 | return True if self.optional_fields['NH'] > 1 else False 133 | 134 | @property 135 | def is_uniquely_mapped(self): 136 | return True if self.optional_fields['NH'] == 1 else False 137 | 138 | @property 139 | def strand(self): 140 | minus_strand = int(self.flag) & 16 141 | return '-' if minus_strand else '+' 142 | 143 | # # todo this takes up 66% of the processing time for parsing the sam record 144 | # @property 145 | # def dust_low_complexity_score(self) -> int: 146 | # 147 | # # Counts of 3-mers in the sequence 148 | # counts = {} 149 | # for i in range(len(self.seq) - 2): 150 | # kmer = self.seq[i:i + 3] 151 | # counts[kmer] = counts.get(kmer, 0) + 1 152 | # 153 | # # Calculate dust score # todo this is 30% faster when vectorized 154 | # score = sum([i * (i - 1) / 2 for i in counts.values()]) / (len(self.seq) - 3) 155 | # 156 | # # Scale score (Max score possible is no. of 3mers/2) 157 | # score = int(score / ((len(self.seq) - 2) / 2) * 100) 158 | # 159 | # return score 160 | 161 | 162 | class Reader: 163 | """Simple sam reader, optimized for utility rather than speed""" 164 | 165 | def __init__(self, samfile: str): 166 | """ 167 | :param samfile: str, location of a .sam file 168 | 169 | usage: 170 | if rd = Reader(samfile) 171 | :method __iter__: iterate over the .sam file's records (also usable in for loop) 172 | :method __len__: return the number of alignments in the file 173 | :method itermultialignments: return tuples of multiple alignments, all from the 174 | same fastq record 175 | """ 176 | 177 | self._samfile = samfile 178 | try: 179 | samfile_iterator = iter(self) 180 | next(samfile_iterator) 181 | except: 182 | raise ValueError('%s is an invalid samfile. Please check file formatting.' 
% 183 | samfile) 184 | 185 | @property 186 | def samfile(self): 187 | return self._samfile 188 | 189 | def _open(self): 190 | """ 191 | seamlessly open self._samfile, whether gzipped or uncompressed 192 | :returns: open file object 193 | """ 194 | if self.samfile.endswith('.gz'): 195 | fobj = gzip.open(self.samfile, 'rb') 196 | elif self.samfile.endswith('.bam'): 197 | if not shutil.which('samtools'): 198 | raise RuntimeError('samtools utility must be installed to run bamfiles') 199 | p = Popen(['samtools', 'view', self.samfile], stdout=PIPE) 200 | fobj = p.stdout 201 | else: 202 | fobj = open(self.samfile, 'rb') 203 | return fobj 204 | 205 | def __len__(self): 206 | return sum(1 for _ in self) 207 | 208 | def __iter__(self): 209 | """return an iterator over all non-header records in samfile""" 210 | fobj = self._open() 211 | try: 212 | for line in fobj: 213 | line = line.decode() 214 | # todo move this if statement to execute only until header is exhausted 215 | if line.startswith('@'): 216 | continue 217 | yield SamRecord(line.strip().split('\t')) 218 | finally: 219 | fobj.close() 220 | 221 | def iter_multialignments(self): 222 | """yields tuples of all alignments for each fastq record""" 223 | sam_iter = iter(self) 224 | fq = [next(sam_iter)] 225 | for record in sam_iter: 226 | if record.qname == fq[0].qname: 227 | fq.append(record) 228 | else: 229 | yield tuple(fq) 230 | fq = [record] 231 | yield tuple(fq) 232 | -------------------------------------------------------------------------------- /src/seqc/alignment/star.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from multiprocessing import cpu_count 3 | from os import makedirs 4 | import shlex 5 | 6 | 7 | def default_alignment_args( 8 | fastq_records: str, n_threads: int or str, index: str, output_dir: str) -> dict: 9 | """default arguments for STAR alignment 10 | 11 | To report unaligned reads, add '--outSAMunmapped': 'Within', 12 | 13 | :param fastq_records: str, name of fastq file 14 | :param n_threads: int or str, number of threads to allocate when calling STAR 15 | :param index: str, location of the STAR index 16 | :param output_dir: str, prefix for output files 17 | :return: dict, default alignment arguments 18 | """ 19 | default_align_args = { 20 | '--runMode': 'alignReads', 21 | '--runThreadN': str(n_threads), 22 | '--genomeDir': index, 23 | '--outFilterType': 'BySJout', 24 | '--outFilterMultimapNmax': '10', # require unique alignments 25 | '--limitOutSJcollapsed': '2000000', # deal with many splice variants 26 | '--alignSJDBoverhangMin': '8', 27 | '--outFilterMismatchNoverLmax': '0.04', 28 | '--alignIntronMin': '20', 29 | '--alignIntronMax': '1000000', 30 | '--readFilesIn': fastq_records, 31 | '--outSAMprimaryFlag': 'AllBestScore', # all equal-scoring reads are primary 32 | '--outSAMtype': 'BAM Unsorted', 33 | '--outFileNamePrefix': output_dir, 34 | } 35 | if fastq_records.endswith('.gz'): 36 | default_align_args['--readFilesCommand'] = 'gunzip -c' 37 | if fastq_records.endswith('.bz2'): 38 | default_align_args['--readFilesCommand'] = 'bunzip2 -c' 39 | return default_align_args 40 | 41 | 42 | def align(fastq_file: str, index: str, n_threads: int, alignment_dir: str, 43 | reverse_fastq_file: str or bool=None, **kwargs) -> str: 44 | """align a fastq file, or a paired set of fastq files 45 | 46 | :param fastq_file: str, location of a fastq file 47 | :param index: str, folder containing the STAR index 48 | :param n_threads: int, number of parallel 
alignment processes to spawn 49 | :param alignment_dir: str, directory for output data 50 | :param reverse_fastq_file: optional, location of reverse paired-end fastq file 51 | :param kwargs: additional kwargs for STAR, passed without the leading '--' 52 | :return: str, .sam file location 53 | """ 54 | 55 | runtime_args = default_alignment_args( 56 | fastq_file, n_threads, index, alignment_dir) 57 | 58 | for k, v in kwargs.items(): # overwrite or add any arguments passed from cmdline 59 | if not isinstance(k, str): 60 | try: 61 | k = str(k) 62 | except ValueError: 63 | raise ValueError('arguments passed to STAR must be strings') 64 | if not isinstance(v, str): 65 | try: 66 | v = str(v) 67 | except ValueError: 68 | raise ValueError('arguments passed to STAR must be strings') 69 | runtime_args['--' + k] = v 70 | 71 | # construct command line arguments for STAR 72 | cmd = ['STAR'] 73 | if reverse_fastq_file: 74 | for key, value in runtime_args.items(): 75 | if key == '--readFilesIn': 76 | cmd.extend((key, value)) 77 | cmd.append(reverse_fastq_file) 78 | else: 79 | cmd.extend((key, value)) 80 | else: 81 | for pair in runtime_args.items(): 82 | cmd.extend(pair) 83 | 84 | cmd = shlex.split(' '.join(cmd)) 85 | aln = Popen(cmd, stderr=PIPE, stdout=PIPE) 86 | out, err = aln.communicate() 87 | if err: 88 | raise ChildProcessError(err) 89 | 90 | return alignment_dir + 'Aligned.out.bam' 91 | 92 | 93 | def create_index( 94 | fasta: str, 95 | gtf: str, 96 | genome_dir: str, 97 | read_length: int=75, **kwargs) -> None: 98 | """Create a new STAR index 99 | 100 | :param fasta: complete filepath to fasta file 101 | :param gtf: complete filepath to gtf file 102 | :param genome_dir: directory in which new index should be constructed 103 | :param read_length: length of reads that will be aligned against this index 104 | :param kwargs: additional keyword arguments to pass to the genome construction call. 105 | to pass --sjdbFileChrStartEnd filename, pass sjdbFileChrStartEnd=filename (no --) 106 | :return: None 107 | """ 108 | ncpu = str(cpu_count()) 109 | makedirs(genome_dir, exist_ok=True) 110 | overhang = str(read_length - 1) 111 | 112 | cmd = ( 113 | 'STAR ' 114 | '--runMode genomeGenerate ' 115 | '--runThreadN {ncpu} ' 116 | '--genomeDir {genome_dir} ' 117 | '--genomeFastaFiles {fasta} ' 118 | '--sjdbGTFfile {gtf} ' 119 | '--sjdbOverhang {overhang} '.format( 120 | ncpu=ncpu, genome_dir=genome_dir, fasta=fasta, gtf=gtf, overhang=overhang) 121 | ) 122 | 123 | for k, v in kwargs.items(): 124 | cmd += '--{k} {v} '.format(k=k, v=v) 125 | 126 | p = Popen(cmd, stderr=PIPE, stdout=PIPE) 127 | out, err = p.communicate() 128 | if err: 129 | raise ChildProcessError(err) 130 | -------------------------------------------------------------------------------- /src/seqc/barcode_correction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from seqc.sequence.encodings import DNA3Bit 3 | import pandas as pd 4 | from itertools import permutations 5 | import seqc.sequence.barcodes 6 | from seqc import log 7 | 8 | 9 | def ten_x_barcode_correction(ra, platform, barcode_files, max_ed=2, 10 | default_error_rate=0.02): 11 | ''' 12 | Correct reads with incorrect barcodes according to the correct barcodes files. 13 | Reads with barcodes that have too many errors are filtered out. 
14 | :param ra: seqc.read_array.ReadArray object 15 | :param platform: the platform object 16 | :param barcode_files: the list of the paths of barcode files 17 | :param max_ed: maximum allowed Hamming distance from known cell barcodes 18 | :param default_error_rate: assumed sequencing error rate 19 | :return: 20 | ''' 21 | 22 | # Read the valid barcodes into a set, accumulating across all barcode files 23 | valid_barcodes = set() 24 | for barcode_file in barcode_files: 25 | with open(barcode_file, 'r') as f: 26 | valid_barcodes |= set([DNA3Bit.encode(line.strip()) for line in 27 | f.readlines()]) 28 | 29 | # Group reads by cells 30 | indices_grouped_by_cells = ra.group_indices_by_cell(multimapping=True) 31 | 32 | # Find all valid barcodes and their counts 33 | valid_barcode_count = dict() 34 | for inds in indices_grouped_by_cells: 35 | # Extract barcodes for one of the reads 36 | barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0] 37 | if barcode in valid_barcodes: 38 | valid_barcode_count[barcode] = len(inds) 39 | 40 | # Find the set of invalid barcodes and check whether they can be corrected 41 | for inds in indices_grouped_by_cells: 42 | 43 | # Extract barcodes for one of the reads 44 | barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0] 45 | if barcode not in valid_barcode_count: 46 | # Identify the correct barcode as the one Hamming distance away with the most reads 47 | hamming_dist_1_barcodes = seqc.sequence.barcodes.generate_hamming_dist_1(barcode) 48 | fat_bc = -1 49 | fat_bc_count = 0 50 | for bc in hamming_dist_1_barcodes: 51 | if (bc in valid_barcode_count) and (valid_barcode_count[bc] > fat_bc_count): 52 | fat_bc = bc 53 | fat_bc_count = valid_barcode_count[bc] 54 | 55 | if fat_bc < 0: 56 | ra.data['status'][inds] |= ra.filter_codes['cell_error'] 57 | else: 58 | # Update the read array with the correct barcode 59 | ra.data['cell'][inds] = fat_bc 60 | 61 | 62 | 63 | def in_drop(ra, platform, barcode_files, max_ed=2, 64 | default_error_rate=0.02): 65 | """ 66 | Correct reads with incorrect barcodes according to the provided files of valid barcodes. 67 | Reads with barcodes that have too many errors are filtered out.
68 | :param ra: seqc.read_array.ReadArray object 69 | :param platform: the platform object 70 | :param barcode_files: the list of the paths of barcode files 71 | :param max_ed: maximum allowed Hamming distance from known cell barcodes 72 | :param default_error_rate: assumed sequencing error rate 73 | :return: 74 | """ 75 | 76 | # Read the barcodes into lists 77 | valid_barcodes = [] 78 | for barcode_file in barcode_files: 79 | with open(barcode_file, 'r') as f: 80 | valid_barcodes.append(set(DNA3Bit.encode(line.strip()) for line in 81 | f.readlines())) 82 | 83 | # Containers 84 | num_barcodes = platform.num_barcodes 85 | correct = [None] * num_barcodes 86 | edit_dist = [None] * num_barcodes 87 | 88 | # Error table container 89 | errors = [p for p in permutations(DNA3Bit.bin2strdict.keys(), r=2)] 90 | error_table = dict(zip(errors, np.zeros(len(errors)))) 91 | cor_instance_table = dict(zip(DNA3Bit.bin2strdict.keys(), 92 | np.zeros(len(DNA3Bit.bin2strdict)))) 93 | 94 | # Check if the barcode has to be an exact match 95 | exact_match = False 96 | if max_ed == 0: 97 | exact_match = True 98 | 99 | # Group reads by cells 100 | indices_grouped_by_cells = ra.group_indices_by_cell(multimapping=True) 101 | 102 | for inds in indices_grouped_by_cells: 103 | 104 | # Extract barcodes for one of the reads 105 | barcodes = platform.extract_barcodes(ra.data['cell'][inds[0]]) 106 | 107 | # Identify correct barcode 108 | for i in range(num_barcodes): 109 | correct[i], edit_dist[i] = seqc.sequence.barcodes.find_correct_barcode( 110 | barcodes[i], valid_barcodes[i], exact_match) 111 | 112 | # 1. If all edit distances are 0, barcodes are correct, 113 | # update the correct instance table 114 | # 2. Correct any barcodes within permissible edit distance, 115 | # update the correct instance table for non-errored bases, 116 | # update error table for the errored bases 117 | # 3. Mark the uncorrectable barcodes as cell errors 118 | 119 | if all(np.array(edit_dist) == 0): 120 | # Temp container to increment the correct instance counter 121 | tmp_bc = DNA3Bit.ints2int(barcodes) 122 | while tmp_bc > 0: 123 | cor_instance_table[tmp_bc & 0b111] += 1 124 | tmp_bc >>= 3 125 | 126 | elif max(edit_dist) > max_ed: 127 | ra.data['status'][inds] |= ra.filter_codes['cell_error'] 128 | continue 129 | 130 | else: 131 | # These barcodes can be corrected, Count the number of correct bases 132 | # Update the error table if there was only one error across the barcodes 133 | tmp_bc = DNA3Bit.ints2int(barcodes) 134 | tmp_cor = DNA3Bit.ints2int(correct) 135 | 136 | # Update the read array with the correct barcode 137 | ra.data['cell'][inds] = tmp_cor 138 | 139 | # Iterating through the sequences 140 | while tmp_bc > 0: 141 | if tmp_bc & 0b111 == tmp_cor & 0b111: 142 | cor_instance_table[tmp_bc & 0b111] += 1 143 | elif sum(edit_dist) == 1: 144 | error_table[(tmp_cor & 0b111, tmp_bc & 0b111)] += 1 145 | tmp_bc >>= 3 146 | tmp_cor >>= 3 147 | 148 | # Create error rate table 149 | if sum(error_table.values()) == 0: 150 | log.info('No errors were detected or barcodes do not support error ' 151 | 'correction, using %f uniform error chance.' % default_error_rate) 152 | err_rate = dict(zip(errors, [default_error_rate] * len(errors))) 153 | # todo @Manu bug here, we're always setting the error rate even if there are 154 | # no detected errors. should the following line be in an "else" clause? 
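    # The per-substitution error rate estimated below is:
    #   err_rate[(correct_base, observed_base)] =
    #       count of (correct_base -> observed_base) substitutions /
    #       (all errors observed from correct_base + correct observations of correct_base)
    # i.e. an empirical estimate of the chance that correct_base is misread as observed_base.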
155 | err_rate = dict(zip(errors, [0.0] * len(errors))) 156 | for k, v in error_table.items(): 157 | if DNA3Bit.decode(k[0]) in b'Nn': 158 | continue 159 | try: 160 | err_rate[k] = v / (sum(n for err_type, n in error_table.items() 161 | if err_type[0] == k[0]) + cor_instance_table[k[0]]) 162 | except ZeroDivisionError: 163 | log.info('Warning: too few reads to estimate error rate for %s, setting ' 164 | 'default rate of %f' % 165 | (str(DNA3Bit.decode(k)), default_error_rate)) 166 | err_rate[k] = default_error_rate 167 | 168 | return err_rate 169 | 170 | 171 | def drop_seq(ra, min_rmt_cutoff=10, rmt_error_frequency=0.8, barcode_base_shift_threshold=0.9): 172 | 173 | """Drop-seq barcode correction suggested by Ashley 174 | 1. Barcodes can be truncated to 11 bases because of synthesis error. Therefore a single 175 | barcode can be potentially be split to 4 barcodes 176 | Solution: Fix barcode: At the 8th position of RMT, if the fraction of T > 80%, 177 | replace the 12th position of the cell barcode with N 178 | Fix RMT: Remove the T in the last position of the RMT and prepend the 179 | first base from the uncorrected cell barcode 180 | 2. If a particular base dominates any of the positions of the RMT, 181 | remove that cell barcode 182 | 3. TODO: Primer match 183 | 184 | :param ra: seqc.read_array.ReadArray object 185 | :param min_rmt_cutoff: Minimum number of RMTs to apply barcode correction 186 | :param rmt_error_frequency: If a base appears with this frequency across the RMTs associated with the barcode 187 | in any position, the barcode is removed 188 | :param barcode_base_shift_threshold: Thresholds for detecting barcode shift 189 | :return: 190 | """ 191 | 192 | # Cell header [First 11 bases only - this should be parametrized] 193 | cell_header = ra.data['cell'] >> 3 194 | idx = np.argsort( cell_header ) 195 | # Active reads 196 | passing = ra.data['status'][idx] == 0 197 | idx = idx[passing] 198 | breaks = np.where(np.diff(cell_header[idx]))[0] + 1 199 | indices_grouped_by_cell_headers = np.split(idx, breaks) 200 | 201 | # RMT length 202 | rmt_length = DNA3Bit.seq_len( ra.data['rmt'][idx[0]] ) 203 | 204 | # 1. 
Barcode synthesis errors 205 | for header_group in indices_grouped_by_cell_headers: 206 | 207 | # RMT set 208 | # todo this could potentially be used in RMT correction / barcode correction in indrop 209 | all_rmts = list(set(ra.data['rmt'][header_group])) 210 | if len(all_rmts) < min_rmt_cutoff: 211 | continue 212 | 213 | # Count Ts in the last RMT position 214 | nuc_counts = dict(zip(DNA3Bit.bin2strdict.keys(), np.zeros(len(DNA3Bit.bin2strdict)))) 215 | for rmt in all_rmts: 216 | nuc_counts[rmt & 0b0111] += 1 217 | 218 | # Correct cell barcode if necessary 219 | if nuc_counts[DNA3Bit.str2bindict['T']] > barcode_base_shift_threshold * len(all_rmts): 220 | 221 | # Correct the RMTs [This needs to done for each cell/RMT combination] 222 | idx = header_group[np.argsort(ra.data['cell'][header_group])] 223 | breaks = np.where(np.diff(ra.data['cell'][idx]))[0] + 1 224 | cell_groups = np.split(idx, breaks) 225 | 226 | for cell_group in cell_groups: 227 | last_base = ra.data['cell'][cell_group[0]] & 0b111 228 | 229 | # Correct the RMTs 230 | idx = cell_group[np.argsort(ra.data['rmt'][cell_group])] 231 | breaks = np.where(np.diff(ra.data['rmt'][cell_group]))[0] + 1 232 | rmt_groups = np.split(idx, breaks) 233 | 234 | for rmt_group in rmt_groups: 235 | # Skip the last base 236 | new_rmt = ra.data['rmt'][rmt_group[0]] >> 3 237 | # Get the last base from the cell barcode 238 | new_rmt = DNA3Bit.ints2int([last_base, new_rmt ]) 239 | ra.data['rmt'][rmt_group] = new_rmt 240 | 241 | # Append N to the cell header 242 | correct_barcode = DNA3Bit.ints2int([cell_header[header_group[0]], DNA3Bit.str2bindict['N']]) 243 | ra.data['cell'][header_group] = correct_barcode 244 | 245 | # 2. Single UMI error 246 | indices_grouped_by_cells = ra.group_indices_by_cell() 247 | for cell_group in indices_grouped_by_cells: 248 | 249 | # RMT set 250 | all_rmts = list(set(ra.data['rmt'][cell_group])) 251 | if len(all_rmts) < min_rmt_cutoff: 252 | continue 253 | 254 | # RMT nucleotide frequency per position 255 | base_frequencies = dict() 256 | for i in DNA3Bit.bin2strdict.keys(): 257 | base_frequencies[i] = np.zeros(rmt_length) 258 | for i in range(len(all_rmts)): 259 | rmt = all_rmts[i] 260 | position = rmt_length-1 261 | while rmt > 0: 262 | base_frequencies[rmt & 0b111][position] += 1 263 | rmt >>= 3 264 | position -= 1 265 | 266 | # Chuck N 267 | base_frequencies = pd.DataFrame(base_frequencies).T 268 | base_frequencies.ix[DNA3Bit.str2bindict['N']] = 0 269 | 270 | # Identify incorrect UMIs 271 | if any( base_frequencies.iloc[:, 0:(rmt_length-1)].max() > rmt_error_frequency * len(all_rmts)): 272 | ra.data['status'][cell_group] |= ra.filter_codes['cell_error'] 273 | 274 | 275 | -------------------------------------------------------------------------------- /src/seqc/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .progress import progress 2 | from .run import run 3 | from .index import index 4 | from .instances import instances 5 | from .terminate import terminate 6 | from .start import start 7 | from .notebook import notebook -------------------------------------------------------------------------------- /src/seqc/core/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from seqc import io 3 | 4 | 5 | def s3_data(files_or_links, output_prefix): 6 | """downloads any data provided by s3 links, otherwise gets list of files. 
7 | 8 | :param list files_or_links: str files or str s3 links to files 9 | :param str output_prefix: prefix to prepend files 10 | :returns list files: filename(s) of downloaded files 11 | """ 12 | files = [] 13 | for f in files_or_links: 14 | if not f.startswith('s3://'): 15 | if f.endswith('/'): 16 | files.extend(f + subfile for subfile in os.listdir(f)) 17 | else: 18 | files.append(f) 19 | else: 20 | recursive = True if f.endswith('/') else False 21 | files.extend(io.S3.download(f, output_prefix, overwrite=True, 22 | recursive=recursive)) 23 | return files 24 | -------------------------------------------------------------------------------- /src/seqc/core/index.py: -------------------------------------------------------------------------------- 1 | 2 | def index(args): 3 | """create an index for SEQC. 4 | 5 | :param args: parsed arguments. This function is only called if subprocess_name is 6 | 'index' 7 | """ 8 | 9 | # functions to be pickled and run remotely must import all their own modules 10 | from seqc import ec2, log 11 | from seqc.sequence.index import Index 12 | 13 | log.setup_logger(args.log_name) 14 | with ec2.instance_clean_up(args.email, args.upload, log_name=args.log_name): 15 | idx = Index(args.organism, args.additional_id_types) 16 | idx.create_index(args.upload_location) 17 | -------------------------------------------------------------------------------- /src/seqc/core/instances.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | 4 | 5 | def instances(args): 6 | """list instances and return 7 | 8 | :param args: namespace object from argparse, must contain args.rsa_key, the path to 9 | the rsa-key used to start the instances you want to list 10 | :return None: 11 | """ 12 | 13 | if args.rsa_key is None: 14 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 15 | if not os.path.isfile(args.rsa_key): 16 | raise ValueError('-k/--rsa-key does not point to a valid file object. 
') 17 | 18 | keyname = args.rsa_key.rpartition('.')[0].rpartition('/')[-1] 19 | 20 | ec2 = boto3.resource('ec2') 21 | all_instances = ec2.instances.filter( 22 | Filters=[ 23 | {'Name': 'key-name', 24 | 'Values': [keyname]}]) 25 | for i in all_instances.all(): 26 | print('id: %s, type: %s, launch-time: %s, state: %s, ip %s' % ( 27 | i.id, i.instance_type, str(i.launch_time), i.state, i.public_ip_address)) 28 | -------------------------------------------------------------------------------- /src/seqc/core/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import sys 4 | from seqc import core 5 | from seqc.core import parser, verify 6 | from seqc import ec2 7 | import boto3 8 | 9 | def clean_up_security_groups(): 10 | ''' 11 | Cleanning all the unused security groups that were created/started using SEQC 12 | when the number of unused ones is greater than 300 13 | ''' 14 | ec2 = boto3.resource('ec2') 15 | sgs = list(ec2.security_groups.all()) 16 | insts = list(ec2.instances.all()) 17 | 18 | all_sgs = set([sg.group_name for sg in sgs]) # get all security groups 19 | all_inst_sgs = set([sg['GroupName'] 20 | for inst in insts for sg in inst.security_groups]) # get security groups associated with instances 21 | unused_sgs = all_sgs - all_inst_sgs # get ones without instance association 22 | 23 | if len(unused_sgs) >= 300: 24 | print("Cleaning up the unused security groups:") 25 | client = boto3.client('ec2') 26 | for g in unused_sgs: 27 | all_inst_sgs = set([sg['GroupName'] for inst in insts for sg in inst.security_groups]) # since deleting ones takes a while, doublecheck whether 28 | if g.startswith("SEQC") and (g not in all_inst_sgs): # only cleaning ones associated with SEQC # the security group is still unused 29 | client.delete_security_group(GroupName=g) 30 | print(g+" deleted") 31 | 32 | 33 | def main(argv): 34 | """Check arguments, then call the appropriate sub-module 35 | 36 | Created to allow the main pipeline to be tested from the earliest entry point 37 | (command-line arguments). 38 | 39 | :param argv: output of sys.argv[1:] 40 | """ 41 | arguments = parser.parse_args(argv) 42 | 43 | func = getattr(core, arguments.subparser_name) 44 | assert func is not None 45 | 46 | # notebooks execute local 47 | if arguments.subparser_name == 'notebook': 48 | return func(arguments) 49 | 50 | if arguments.remote: 51 | # todo improve how verification works; it's not really necessary, what is needed 52 | # is a method to determine volume size for remote. 53 | verification_func = getattr(verify, arguments.subparser_name) 54 | verified_args = verification_func(arguments) 55 | remote_args = { 56 | k: getattr(verified_args, k) for k in 57 | ('rsa_key', 'instance_type', 'spot_bid', 'volume_size') if 58 | getattr(verified_args, k)} 59 | clean_up_security_groups() 60 | ec2.AWSInstance(synchronous=False, **remote_args)(func)(verified_args) 61 | else: 62 | func(arguments) 63 | 64 | 65 | if __name__ == '__main__': 66 | main(sys.argv[1:]) 67 | -------------------------------------------------------------------------------- /src/seqc/core/notebook.py: -------------------------------------------------------------------------------- 1 | from seqc.notebooks.notebooks import Notebook 2 | from seqc import log 3 | 4 | 5 | def notebook(args): 6 | if args.subsubparser_name == 'merge': 7 | # need to also take a output directory because this thing will write stuff. 8 | # then merge the things 9 | # then return? 
10 | n = Notebook(args.output_filename, *args.input_data) 11 | n.merge_data(merged_sample_name=args.output_filename) 12 | log.info('Merged samples written to %s' % args.input_data) 13 | elif args.subsubparser_name == 'generate': 14 | n = Notebook(args.output_stem, args.input_count_matrix) 15 | n.write_template() 16 | log.info('Notebook Template written to %s' % n.notebook_path) 17 | n.run_notebook() 18 | log.info('Notebook Run and written to %s' % n.notebook_path) 19 | 20 | -------------------------------------------------------------------------------- /src/seqc/core/parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import sys 4 | import inspect 5 | from subprocess import Popen, PIPE 6 | from seqc import version, platforms 7 | 8 | 9 | def parse_args(args): 10 | """ 11 | command line argument parser for process_experiment 12 | 13 | :param args: list of command-line arguments (either passed as a list, or retrieved 14 | from sys.argv. 15 | :returns: args, namespace object, output of ArgumentParser.parse_args() 16 | """ 17 | 18 | meta = argparse.ArgumentParser( 19 | description='Processing Tools for scRNA-seq Experiments') 20 | meta.add_argument('-v', '--version', action='version', 21 | version='{} {}'.format(meta.prog, version.__version__)) 22 | subparsers = meta.add_subparsers(dest='subparser_name') 23 | 24 | # subparser for running experiments 25 | # can use to make prettier: formatter_class=partial(argparse.HelpFormatter, width=200) 26 | p = subparsers.add_parser('run', help='initiate SEQC runs') 27 | 28 | # Platform choices 29 | choices = [x[0] for x in inspect.getmembers(platforms, inspect.isclass) if 30 | issubclass(x[1], platforms.AbstractPlatform)][1:] 31 | p.add_argument('platform', 32 | choices=choices, 33 | help='which platform are you merging annotations from?') 34 | 35 | a = p.add_argument_group('required arguments') 36 | a.add_argument('-o', '--output-prefix', metavar='O', required=True, 37 | help='filename prefix for all seqc output. 
Should not be a directory.') 38 | a.add_argument('-i', '--index', metavar='I', required=True, 39 | help='Local folder or s3 link to a directory containing the STAR ' 40 | 'index used for alignment.') 41 | a.add_argument('--barcode-files', nargs='*', metavar='BF', default=[], 42 | help='Either (a) an s3 link to a folder containing only barcode ' 43 | 'files, or (b) the full file path of each file on the local ' 44 | 'machine.') 45 | 46 | i = p.add_argument_group('input arguments') 47 | i.add_argument('-g', '--genomic-fastq', nargs='*', metavar='G', default=[], 48 | help='List of fastq file(s) containing genomic information, or an s3 ' 49 | 'link to a directory containing only genomic fastq file(s).') 50 | i.add_argument('-b', '--barcode-fastq', nargs='*', metavar='B', default=[], 51 | help='List of fastq file(s) containing barcode information, or an s3 ' 52 | 'link to a directory containing only barcode fastq file(s).') 53 | i.add_argument('-m', '--merged-fastq', nargs='?', metavar='M', default='', 54 | help='Filename or s3 link to a fastq file containing genomic ' 55 | 'information annotated with barcode data.') 56 | i.add_argument('-a', '--alignment-file', nargs='?', metavar='A', default='', 57 | help='Filename or s3 link to a .sam or .bam file containing aligned, ' 58 | 'merged sequence records.') 59 | i.add_argument('-r', '--read-array', nargs='?', metavar='RA', default='', 60 | help='Filename or s3 link to a ReadArray (.h5) archive containing ' 61 | 'processed sam records.') 62 | i.add_argument('--basespace', metavar='BS', 63 | help='BaseSpace sample ID. The string of numbers indicating the id ' 64 | 'of the BaseSpace sample. (e.g. if the link to the sample is ' 65 | 'https://basespace.illumina.com/sample/34000253/0309, ' 66 | 'then --basespace would be 34000253.') 67 | i.add_argument('--basespace-token', metavar='BST', default=None, 68 | help='OAuth token for basespace access. Required if BaseSpace input ' 69 | 'is used.') 70 | 71 | f = p.add_argument_group('filter arguments') 72 | f.add_argument('--max-insert-size', metavar='F', type=int, 73 | help='the maximum fragment size in bp. Aligments that are further ' 74 | 'than this distance from a TTS are discarded. Default=1000', 75 | default=1000) 76 | f.add_argument('--min-poly-t', metavar='T', 77 | help='minimum size of poly-T tail that is required for a barcode to ' 78 | 'be considered a valid record (default=None, automatically ' 79 | 'estimates the parameter from the sequence length)', 80 | default=None, type=int) 81 | # f.add_argument('--max-dust-score', metavar='D', default=10, type=int, 82 | # help='maximum complexity score for a read to be considered valid. ' 83 | # '(default=10, higher scores indicate lower complexity.)') 84 | f.add_argument('--singleton-weight', metavar='SW', 85 | help='Weight to apply to singletons in the count matrix. 
Float ' 86 | 'between 0 and 1, default=1 (all molecules get full weight)', 87 | default=1.0, type=float) 88 | f.set_defaults(filter_mitochondrial_rna=True) 89 | f.add_argument('--no-filter-mitochondrial-rna', action='store_false', 90 | dest='filter_mitochondrial_rna', 91 | help='Do not filter cells with greater than 20 percent mitochondrial ' 92 | 'RNA ') 93 | f.set_defaults(filter_low_coverage=True) 94 | f.add_argument('--no-filter-low-coverage', action='store_false', 95 | dest='filter_low_coverage', 96 | help='Do not filter cells with low coverage') 97 | f.set_defaults(filter_low_gene_abundance=True) 98 | f.add_argument('--no-filter-low-gene-abundance', action='store_false', 99 | dest='filter_low_gene_abundance', 100 | help='Do not filter cells with low coverage') 101 | f.add_argument('--low-coverage-alpha', metavar='LA', 102 | help='FDR rate for low coverage reads filter in mars-seq datasets. ' 103 | 'Float between 0 and 1, default=0.25', 104 | default=0.25, type=float) 105 | 106 | s = p.add_argument_group('alignment arguments') 107 | s.add_argument('--star-args', default=None, nargs='*', 108 | help='additional arguments that should be passed to the STAR ' 109 | 'aligner. For example, to set the maximum allowable times for a ' 110 | 'read to align to 20, one would set ' 111 | '--star-args outFilterMultimapNmax=20. Additional arguments can ' 112 | 'be provided as a white-space separated list.') 113 | 114 | # PROGRESS PARSER 115 | progress = subparsers.add_parser('progress', help='check SEQC run progress') 116 | progress.set_defaults(remote=False) 117 | progress.add_argument( 118 | '-i', '--instance-ids', help='check the progress of run(s)', nargs='+') 119 | progress.add_argument( 120 | '-k', '--rsa-key', help='RSA key registered to your aws account', 121 | default=None) 122 | 123 | # TERMINATE PARSER 124 | terminate = subparsers.add_parser('terminate', help='terminate SEQC runs') 125 | terminate.set_defaults(remote=False) 126 | terminate.add_argument( 127 | '-i', '--instance-ids', help='terminate these instance(s)', nargs='+') 128 | 129 | # INSTANCES PARSER 130 | instances = subparsers.add_parser('instances', help='list all running instances') 131 | instances.set_defaults(remote=False) 132 | instances.add_argument( 133 | '-k', '--rsa-key', help='RSA key registered to your aws account', 134 | default=None) 135 | 136 | # START PARSER 137 | start = subparsers.add_parser( 138 | 'start', help='initialize a seqc-ready instance') 139 | start.set_defaults(remote=False) 140 | start.add_argument( 141 | '-s', '--volume-size', help='size of volume (Gb) to attach to instance', 142 | default=5, type=int) 143 | start.add_argument( 144 | '-b', '--spot-bid', help='amount to bid for instance in fractions of dollars', 145 | type=float, default=None) 146 | start.add_argument( 147 | '-t', '--instance-type', default='c4.8xlarge', 148 | help='AWS instance type to initialize. 
' 149 | 'See https://aws.amazon.com/ec2/instance-types/ for valid types') 150 | start.add_argument( 151 | '-k', '--rsa-key', help='RSA key registered to your aws account', 152 | default=None) 153 | 154 | # NOTEBOOK PARSERS 155 | notebook_sp = subparsers.add_parser('notebook', help='notebook tools') 156 | _nb_parser = notebook_sp.add_subparsers(dest='subsubparser_name') 157 | 158 | # NOTEBOOK MERGE PARSER 159 | merge = _nb_parser.add_parser( 160 | 'merge', help='merge multiple datasets prior to running an analysis notebook') 161 | merge.add_argument( 162 | '-o', '--output-filename', help='name for merged fastq file', required=True) 163 | merge.add_argument( 164 | '-i', '--input-data', nargs='+', help='count matrices to merge', required=True) 165 | 166 | # NOTEBOOK GENERATE PARSER 167 | generate = _nb_parser.add_parser('generate', help='generate a notebook from a dataset') 168 | generate.add_argument( 169 | '-i', '--input-count-matrix', help='count matrix file', required=True) 170 | generate.add_argument( 171 | '-o', '--output-stem', help='directory and filestem for output', required=True) 172 | 173 | pindex = subparsers.add_parser('index', help='create a SEQC index') 174 | pindex.add_argument( 175 | '-o', '--organism', required=True, 176 | help='organism to create index for. Must be formatted as genus_species in all ' 177 | 'lower-case. e.g. human is homo_sapiens.') 178 | pindex.add_argument( 179 | '-f', '--folder', default=None, 180 | help='folder in which to create the index. Defaults to the name of the organism, ' 181 | 'which is created in the current directory.') 182 | pindex.add_argument( 183 | '--ids', '--additional-id-types', nargs='*', 184 | help='names of additional ids from other consortia to check against. If ' 185 | 'provided, each ENSEMBL gene id must also be annotated by at least one of ' 186 | 'these consortia to be considered valid and appear in the final SEQC count ' 187 | 'matrix.') 188 | pindex.add_argument( 189 | '-b', '--valid-biotypes', default=('protein_coding', 'lincRNA'), 190 | help='list of gene biotypes that are considered valid. Defaults are ' 191 | 'protein_coding and lincRNA. In most cases, other biotypes are not expected ' 192 | 'to be captured by SEQC, and should be excluded') 193 | 194 | for parser in [pindex, p]: 195 | r = parser.add_argument_group('Amazon Web Services arguments') 196 | r.set_defaults(remote=True) 197 | r.set_defaults(terminate=True) 198 | r.set_defaults(log_name='seqc_log.txt') # changed to .txt for email 199 | r.add_argument( 200 | '--local', dest="remote", action="store_false", 201 | help='Run locally instead of on an aws instance') 202 | r.add_argument( 203 | '-u', '--upload-prefix', metavar='U', default=None, 204 | help='s3 location for data to be uploaded.') 205 | r.add_argument( 206 | '--instance-type', default='c4.8xlarge', 207 | help='AWS instance type to initialize for this job. ' 208 | 'See https://aws.amazon.com/ec2/instance-types/ for valid types') 209 | r.add_argument( 210 | '--spot-bid', type=float, default=None, 211 | help='float, Amount to bid for a spot instance. Default=None (will reserve a ' 212 | 'non-spot instance). WARNING: using spot instances will cause your ' 213 | 'instance to terminate if instance prices exceed your spot bid during ' 214 | 'runtime.') 215 | r.add_argument( 216 | '--volume-size', type=int, default=None, 217 | help='size in Gb required to execute the requested process. 
If not provided, ' 218 | 'it will be estimated from passed parameters.') 219 | r.add_argument( 220 | '-e', '--email', metavar='E', default=None, 221 | help='Email address to receive run summary or errors when running remotely. ' 222 | 'Optional only if running locally.') 223 | r.add_argument('--debug', default=False, action='store_true', 224 | help='If debug is set, runs that throw errors do not ' 225 | 'terminate the instance they were run on.') 226 | r.add_argument( 227 | '-k', '--rsa-key', metavar='K', default=None, 228 | help='RSA key registered to your aws account that allowed access to ec2 ' 229 | 'resources. Required if running instance remotely.') 230 | 231 | # custom help handling 232 | if len(args) == 0: # print help if no args are passed 233 | meta.print_help() 234 | sys.exit(1) 235 | if args == ['run', '-h']: # send help for run to less, is too long 236 | pipe = Popen(['less'], stdin=PIPE) 237 | pipe.communicate(p.format_help().encode()) 238 | sys.exit(1) 239 | 240 | parsed = meta.parse_args(args) 241 | 242 | if hasattr(parsed, 'rsa_key'): 243 | if parsed.rsa_key is None: 244 | try: 245 | parsed.rsa_key = os.environ['AWS_RSA_KEY'] 246 | except KeyError: 247 | pass 248 | 249 | return parsed 250 | -------------------------------------------------------------------------------- /src/seqc/core/progress.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from seqc import ec2 3 | from paramiko.ssh_exception import AuthenticationException 4 | from botocore.exceptions import ClientError 5 | 6 | 7 | def progress(args): 8 | """print progress of requested seqc run(s) to less 9 | 10 | :param args: namespace object from argparse, must include rsa-key and instance-id 11 | :return None: 12 | """ 13 | if args.rsa_key is None: 14 | raise ValueError('User must supply -k/--rsa-key or set the environment variable ' 15 | 'AWS_RSA_KEY') 16 | 17 | if args.instance_ids is None: 18 | raise ValueError('No instances specified. Please supply an instance using the -i ' 19 | 'parameter.') 20 | 21 | for id_ in args.instance_ids: 22 | connection = ec2.SSHConnection(id_, args.rsa_key) 23 | try: 24 | out, err = connection.execute('cat ./seqc_log.txt') 25 | except AuthenticationException: 26 | raise ValueError('instance %s cannot be found.' % repr(id_)) 27 | except ClientError: 28 | raise ValueError('instance %s cannot be found.' % repr(id_)) 29 | p = Popen(['less'], stdin=PIPE) 30 | p.communicate(input='\n'.join(out).encode()) 31 | -------------------------------------------------------------------------------- /src/seqc/core/start.py: -------------------------------------------------------------------------------- 1 | from seqc import ec2 2 | import os 3 | 4 | 5 | def start(args): 6 | """start an aws instance""" 7 | 8 | if args.rsa_key is None: 9 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 10 | if not os.path.isfile(args.rsa_key): 11 | raise ValueError('-k/--rsa-key does not point to a valid file object. 
') 12 | 13 | instance = ec2.AWSInstance( 14 | rsa_key=args.rsa_key, instance_type=args.instance_type, spot_bid=args.spot_bid, 15 | volume_size=args.volume_size) 16 | instance.start() 17 | -------------------------------------------------------------------------------- /src/seqc/core/terminate.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.exceptions import ClientError 3 | 4 | 5 | def terminate(args): 6 | """print progress of requested seqc run to top 7 | 8 | :param args: namespace object from argparse, must include rsa-key and instance-id 9 | :return None: 10 | """ 11 | ec2 = boto3.resource('ec2') 12 | for id_ in args.instance_ids: 13 | instance = ec2.Instance(id=id_) 14 | try: 15 | response = instance.terminate() 16 | print('termination signal sent:\n%s' % response) 17 | except ClientError: 18 | print('instance %s does not exist') 19 | -------------------------------------------------------------------------------- /src/seqc/core/verify.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import inspect 4 | from math import ceil 5 | from seqc import io, platforms, ec2 6 | 7 | 8 | def filesize(filename): 9 | """return filesize of filename in bytes 10 | 11 | :param str filename: full path to file 12 | :return int: number of bytes in filename 13 | """ 14 | return os.stat(filename).st_size 15 | 16 | 17 | def validate_and_return_size(filename): 18 | """return true if a link or filepath points to a valid file or directory 19 | 20 | :param str filename: filepath or s3 link 21 | :return None: raises errors if path or link is invalid. 22 | """ 23 | if filename.startswith('s3://'): 24 | io.S3.check_links([filename]) 25 | return io.S3.obtain_size(filename) 26 | else: 27 | if os.path.isfile(filename): 28 | return filesize(filename) 29 | elif os.path.isdir(filename.rstrip('/')): 30 | return sum(filesize(filename + f) for f in os.listdir(filename)) 31 | else: 32 | print(filename) 33 | raise ValueError('%s does not point to a valid file') 34 | 35 | 36 | def estimate_required_volume_size(args): 37 | """estimate the size of volume that should be attached to an aws instance to run SEQC 38 | 39 | :param args: namespace object containing filepaths or download links to input data 40 | :return int: size of volume in gb 41 | """ 42 | # using worst-case estimates to make sure we don't run out of space, 35 = genome index 43 | total = (35 * 1e10) + sum(validate_and_return_size(f) for f in args.barcode_files) 44 | 45 | # todo stopped here; remove aws dependency 46 | if args.barcode_fastq and args.genomic_fastq: 47 | total += sum(validate_and_return_size(f) for f in args.barcode_fastq) * 14 + 9e10 48 | total += sum(validate_and_return_size(f) for f in args.genomic_fastq) * 14 + 9e10 49 | total += validate_and_return_size(args.index) 50 | 51 | elif args.alignment_file: 52 | total += (validate_and_return_size(args.alignment_file) * 2) + 4e10 53 | total += validate_and_return_size(args.index) 54 | 55 | elif args.merged_fastq: 56 | total += (validate_and_return_size(args.merged_fastq) * 13) + 9e10 57 | total += validate_and_return_size(args.index) 58 | 59 | elif args.read_array: 60 | total += validate_and_return_size(args.read_array) 61 | 62 | if args.basespace: 63 | if not args.basespace_token or args.basespace_token == 'None': 64 | raise ValueError( 65 | 'If the --basespace argument is used, the basespace token must be ' 66 | 'specified in the seqc config file or passed as 
--basespace-token') 67 | 68 | io.BaseSpace.check_sample(args.basespace, args.basespace_token) 69 | total += io.BaseSpace.check_size(args.basespace, args.basespace_token) * 14 + 9e10 70 | 71 | return ceil(total * 1e-9) 72 | 73 | 74 | def run(args) -> float: 75 | """ 76 | verify data input through the command line arguments, fixes minor issues, and 77 | throws exceptions if invalid parameters are encountered 78 | 79 | additionally, this function obtains a rough estimate of how much 80 | volume storage is needed for a remote run. 81 | 82 | :param Namespace args: Namespace object, output from ArgumentParser.parse_args() 83 | :returns total: float, estimated Kb of Volume space needed to run SEQC remotely. 84 | """ 85 | 86 | if args.rsa_key is None: 87 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 88 | if not os.path.isfile(args.rsa_key): 89 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 90 | 91 | if args.output_prefix.endswith('/'): 92 | raise ValueError('output_stem should not be a directory.') 93 | if not args.index.endswith('/'): 94 | raise ValueError('index must be a directory, and must end with "/"') 95 | 96 | # check platform name; raises ValueError if invalid 97 | platform_name(args.platform) 98 | 99 | # check to make sure that --email-status is passed with remote run 100 | if args.remote and not args.email: 101 | raise ValueError('Please supply the --email-status flag for a remote SEQC run.') 102 | # if args.instance_type not in ['c3', 'c4', 'r3']: # todo fix this instance check 103 | # raise ValueError('All AWS instance types must be either c3, c4, or r3.') 104 | # if args.terminate not in ['True', 'true', 'False', 'false', 'on-success']: 105 | # raise ValueError('the --no-terminate flag must be either True, False, ' 106 | # 'or on-success.') 107 | 108 | # make sure at least one input has been passed 109 | valid_inputs = ( 110 | args.barcode_fastq, args.genomic_fastq, args.merged_fastq, args.alignment_file, 111 | args.basespace, args.read_array) 112 | if not any(valid_inputs): 113 | raise ValueError( 114 | 'At least one input argument (-b/-g, -m, -s, -r, --basespace) must be passed ' 115 | 'to SEQC.') 116 | if not args.barcode_files: # todo clean this up and fold into platform somehow 117 | if args.platform != 'drop_seq': 118 | raise ValueError('--barcode-files is required for this platform.') 119 | 120 | # make sure at most one input type has been passed 121 | num_inputs = 0 122 | if args.barcode_fastq or args.genomic_fastq: 123 | if not all((args.barcode_fastq, args.genomic_fastq)): 124 | raise ValueError( 125 | 'if either genomic or barcode fastq are provided, both must be provided') 126 | num_inputs += 1 127 | num_inputs += sum(1 for i in (args.merged_fastq, args.alignment_file, 128 | args.basespace, args.read_array) if i) 129 | if num_inputs > 1: 130 | raise ValueError( 131 | 'user should provide at most one input argument (-b/-g, -m, -s, -r, ' 132 | '--basespace') 133 | 134 | # if basespace is being used, make sure there is a valid basespace token 135 | if args.basespace and not hasattr(args, 'basespace_token'): 136 | raise RuntimeError('if --basespace input is selected, user must provide an OAuth ' 137 | 'token using the --basespace-token parameter.') 138 | 139 | # check that spot-bid is correct 140 | if args.spot_bid is not None: 141 | if args.spot_bid < 0: 142 | raise ValueError('bid %f must be a non-negative float.' 
% args.spot_bid) 143 | 144 | if args.upload_prefix and not args.upload_prefix.startswith('s3://'): 145 | raise ValueError('upload_prefix should be an s3 address beginning with s3://') 146 | 147 | if args.upload_prefix.startswith('s3://'): 148 | ec2.check_bucket(args.upload_prefix) 149 | 150 | if args.volume_size is None: 151 | setattr(args, 'volume_size', estimate_required_volume_size(args)) 152 | 153 | return args 154 | 155 | 156 | def index(args): 157 | """add a default volume_size if it was not otherwise passed to seqc. 158 | 159 | :param args: namespace object from argparse 160 | :return: updated namespace object with volume_size set. 161 | """ 162 | if args.volume_size is None: 163 | setattr(args, 'volume_size', 100) 164 | return args 165 | 166 | 167 | def executables(*execs): 168 | """ 169 | checks whether executables are installed on the machine of the 170 | current seqc run. 171 | 172 | :param execs: Tuple of executables to check 173 | :returns : Tuple of boolean (True if a specific executable is installed). 174 | """ 175 | return tuple(map(lambda exe: shutil.which(exe) is not None, execs)) 176 | 177 | 178 | def platform_name(name: str): 179 | """ 180 | checks whether the platform name supplied by the user is supported by the current 181 | iteration of seqc. 182 | :param name: string of platform name to check 183 | :return: name (if supported by seqc). 184 | """ 185 | choices = [x[0] for x in inspect.getmembers(platforms, inspect.isclass) if 186 | issubclass(x[1], platforms.AbstractPlatform)][1:] 187 | if name not in choices: 188 | raise ValueError('Please specify a valid platform name for SEQC. The available ' 189 | 'options are: {}'.format(choices)) 190 | # throw error for mars1_seq since we don't have the appropriate primer length yet 191 | if name == 'mars1_seq': 192 | raise ValueError('Mars1-seq is currently not stable in this version of SEQC.') 193 | return name 194 | -------------------------------------------------------------------------------- /src/seqc/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def jsd(p, q) -> float: 5 | """Jensen Shannon distance of two variables normalized variables p and q 6 | 7 | Note that if p and q are not normalized, this function will not return a proper 8 | distance, so matrices should be normalized prior to use 9 | 10 | use with sklearn.NearestNeighbors: 11 | 12 | >>> from sklearn.neighbors import NearestNeighbors 13 | >>> # set some dummy variables 14 | >>> data = np.random.random((100, 100)) 15 | >>> data = data / data.sum(axis=1)[:, np.newaxis] # norm rows 16 | >>> assert(np.all(np.array(data.sum(axis=1) == 1)))3 17 | >>> k = 10 18 | >>> 19 | >>> nn = NearestNeighbors(k=k, metric='pyfunc', algorithm='ball_tree', 20 | >>> metric_params={'func': jsd}) 21 | >>> nn.fit(data) 22 | 23 | Parameters 24 | ---------- 25 | p, q : np.array 26 | 27 | Returns 28 | ------- 29 | float : kl divergence between p and q 30 | """ 31 | idx = np.logical_or(p != 0, q != 0) 32 | p = p[idx] 33 | q = q[idx] 34 | m = (p + q) / 2 35 | return np.sqrt((.5 * kldiv(p, m)) + (.5 * kldiv(q, m))) 36 | 37 | 38 | def kldiv(x: np.ndarray, m: np.ndarray) -> float: 39 | """Modified Kullback-Liebler divergence of two variables x and m. 
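    In effect this computes sum_i x[i] * log2(x[i] / m[i]); np.nansum drops the nan
    terms produced by zero entries in x, so those positions contribute nothing.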
40 | 41 | depends upon normalization done by jsd parent function, namely that (1) there are no 42 | zero-valued entries in m, and (2) both x and m are probability distributions that 43 | sum to 1 44 | 45 | Parameters 46 | ---------- 47 | x, m : normalized probability vectors 48 | 49 | Returns 50 | ------- 51 | float : kl divergence between p and q 52 | """ 53 | return np.nansum(x * np.log2(x / m)) 54 | -------------------------------------------------------------------------------- /src/seqc/email_.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | import os 3 | 4 | 5 | def email_user(attachment: str, email_body: str, email_address: str) -> None: 6 | """ 7 | sends an email to email address with text contents of email_body and attachment 8 | attached. Email will come from "ec2-User@ 9 | 10 | :param attachment: the file location of the attachment to append to the email 11 | :param email_body: text to send in the body of the email 12 | :param email_address: the address to which the email should be sent""" 13 | 14 | # todo if remote is sending double emails, add quotes around attachment. 15 | if isinstance(email_body, str): 16 | email_body = email_body.encode() 17 | email_args = ['mutt', '-e', 'set content_type="text/html"', '-a', attachment, '-s', 18 | 'Remote Process', '--', email_address] 19 | email_process = Popen(email_args, stdin=PIPE) 20 | email_process.communicate(email_body) 21 | -------------------------------------------------------------------------------- /src/seqc/exceptions.py: -------------------------------------------------------------------------------- 1 | class RetryLimitExceeded(Exception): 2 | pass 3 | 4 | 5 | class InstanceNotRunningError(Exception): 6 | pass 7 | 8 | 9 | class EC2RuntimeError(Exception): 10 | pass 11 | 12 | 13 | class ConfigurationError(Exception): 14 | pass 15 | 16 | 17 | class ArgumentParserError(Exception): 18 | pass 19 | 20 | 21 | class EmptyMatrixError(Exception): 22 | pass 23 | -------------------------------------------------------------------------------- /src/seqc/h5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | 6 | class H5: 7 | 8 | def __init__(self, archive_name: str): 9 | """Wrapper for the pandas HDFStore class which ensures that all interactions with 10 | the archive result in a closed, flushed archive. 11 | 12 | In order to ensure data usability, all data must be submitted in DataFrame format. 13 | This decision was made to encourage users to pair metadata with sequencing data, 14 | and reduce the incidence of unexpected data permutation. 15 | 16 | :param archive_name: name of the h5 archive to open. 
If the archive does not exist 17 | it will be created using a blosc5 filter 18 | 19 | :method ls: list contents of the archive 20 | :method save: save an object to the h5 archive 21 | :method load: load an object from the archive 22 | :method remove: remove a DataFrame from the archive 23 | :method is_open: returns True if the h5 archive is open, else False 24 | """ 25 | if os.path.isfile(archive_name): 26 | self._archive = pd.HDFStore(archive_name, mode='a') 27 | self._archive.close() 28 | else: 29 | self._archive = pd.HDFStore( 30 | archive_name, mode='a', complib='blosc', complevel=5) 31 | self._archive.close() 32 | 33 | def __repr__(self): 34 | self._archive.open() 35 | try: 36 | return repr(self._archive) 37 | finally: 38 | self._archive.close() 39 | 40 | def save(self, data: pd.DataFrame, location: str) -> None: 41 | """Save DataFrame data to the h5 archive in location. 42 | 43 | :param data: DataFrame object to store 44 | :param location: filepath to save the object in the h5 hierarchy 45 | """ 46 | if not isinstance(data, pd.DataFrame): 47 | if isinstance(data, np.ndarray): 48 | res = input('np.ndarray class detected. Save as pd.DataFrame with ' 49 | 'ascending integer indices? [y/n] ') 50 | if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 51 | data = pd.DataFrame(data) 52 | else: 53 | print('User elected not to save DataFrame, archive is unmodified.') 54 | return 55 | else: 56 | raise TypeError('only pd.DataFrame objects can be saved using this ' 57 | 'class. To save np.ndarray objects please see the tables ' 58 | 'package.') 59 | self._archive.open() 60 | try: 61 | self._archive[location] = data 62 | finally: 63 | self._archive.close() 64 | 65 | def load(self, location: str) -> None: 66 | """Load and return the dataframe found at location in the archive. 67 | 68 | :param location: str, location of object to retrieve from h5 69 | :return: pd.DataFrame, object found at location 70 | """ 71 | self._archive.open() 72 | try: 73 | return self._archive[location] 74 | finally: 75 | self._archive.close() 76 | 77 | def ls(self) -> None: 78 | """list archive contents""" 79 | try: 80 | self._archive.open() 81 | print(self._archive) 82 | finally: 83 | self._archive.close() 84 | 85 | def remove(self, location: str) -> None: 86 | """remove the DataFrame at location from the archive 87 | 88 | Note: removing a dataframe at a branch node will remove all leaves sharing this 89 | prefix. e.g. in an archive containing: 90 | 91 | /data 92 | /data/filtered 93 | /data/metadata 94 | /new_data/data 95 | 96 | removing /data would remove the first three DataFrame objects from the archive. 97 | 98 | :param location: location of DataFrame to remove 99 | :return: None 100 | """ 101 | 102 | self._archive.open() 103 | try: 104 | if location not in self._archive.keys(): 105 | raise ValueError( 106 | '{} not contained in archive, nothing to remove.'.format(location)) 107 | else: 108 | removed = [k for k in self._archive.keys() 109 | if k.startswith(location + '/')] 110 | if len(removed) != 0: 111 | res = input( 112 | 'Removing branch node {}, which is a prefix for {!a} will remove ' 113 | 'all listed DataFrames. Continue with removal? 
[y/n] '.format( 114 | location, removed)) 115 | if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 116 | print('returned without deletion.') 117 | return 118 | self._archive.remove(location) 119 | finally: 120 | self._archive.close() 121 | 122 | @property 123 | def is_open(self) -> bool: 124 | return self._archive.is_open 125 | -------------------------------------------------------------------------------- /src/seqc/log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from datetime import datetime 4 | import pandas as pd 5 | from seqc.stats.experimental_yield import ExperimentalYield 6 | from collections import defaultdict 7 | import os 8 | import re 9 | 10 | 11 | def setup_logger(filename): 12 | """create a simple log file in the cwd to track progress and any errors""" 13 | logging.basicConfig(filename=filename, level=logging.DEBUG, filemode='w') 14 | 15 | 16 | def info(message): 17 | """print a timestamped update for the user. 18 | :param message: 19 | """ 20 | logging.info(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':' + message) 21 | 22 | 23 | def exception(): 24 | """log the most recent exception to an initialized logger""" 25 | logging.exception(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':main:') 26 | 27 | 28 | def notify(message): 29 | """print a timestamped update for the user and log it to file""" 30 | info(message) 31 | print('SEQC: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ': %s' % message) 32 | 33 | 34 | def debug(message): 35 | logging.debug(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + 36 | ':%(module)s:%(funcName)s:' + ': %s' % message) 37 | 38 | 39 | def args(arguments): 40 | """ 41 | log namespace object from argument parser to file. 42 | 43 | :param arguments: namespace object, output of ArgumentParser.parse_args() 44 | :return: None 45 | """ 46 | arguments = vars(arguments) 47 | info('Passed command line arguments: {}'.format( 48 | json.dumps(arguments, separators=(',', ': '), indent=4, sort_keys=True))) 49 | 50 | 51 | class LogData: 52 | """ 53 | Automatically parse SEQC logs 54 | 55 | :method parse_log: parse an individual seqc log into a pd.DataFrame with a MultiIndex 56 | index corresponding to the categories of the seqc log 57 | :method parse_multiple: parse a directory hierarchy into a pd.DataFrame with an index 58 | as above, but the columns recapitulate the directory structure that the logs are 59 | stored in. 
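
    A minimal usage sketch (the log file name here is hypothetical):

        from seqc.log import LogData
        df = LogData.parse_log('run_1.log')   # MultiIndex rows, one column labelled 'run_1'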
60 | """ 61 | 62 | _oldver = ('{divide}\nINPUT\n{divide}\n' 63 | 'Total input reads:\t{n_fastq}\n' 64 | '{divide}\nALIGNMENT (% FROM INPUT)\n{divide}\n' 65 | 'Total reads aligned:\t{n_sam} ({prop_al}%)\n' 66 | ' - Genomic alignments:\t{genomic} ({prop_gen}%)\n' 67 | ' - PhiX alignments:\t{phi_x} ({prop_phix}%)\n' 68 | ' - Transcriptome alignments:\t{trans} ({prop_trans}%)\n' 69 | '{divide}\nFILTERING (% FROM ALIGNMENT)\n{divide}\n' 70 | 'Genomic alignments:\t{genomic} ({bad_gen}%)\n' 71 | 'PhiX alignments:\t{phi_x} ({bad_phi}%)\n' 72 | 'Incorrect barcodes:\t{wrong_cb} ({bad_cb}%)\n' 73 | 'Missing cell barcodes:\t{no_cell} ({bad_cell}%)\n' 74 | 'Missing RMTs (same as above):\t{no_cell} ({bad_cell}%)\n' 75 | 'N present in RMT:\t{rmt_N} ({bad_rmtN}%)\n' 76 | 'Insufficient poly(T):\t{poly_t} ({bad_polyt}%)\n' 77 | '{divide}\nCELL/MOLECULE COUNT DISTRIBUTION\n{divide}\n' 78 | 'Total molecules:\t\t{tot_mc}\n' 79 | 'Molecules lost:\t{mols_lost}\n' 80 | 'Cells lost:\t{cells_lost}\n' 81 | 'Cell description:\n{cell_desc}\n' 82 | '{divide}\nSUMMARY\n{divide}\n' 83 | 'Total retained reads:\t{n_good} ({prop_good}%)\n' 84 | 'Total reads unaligned:\t{lost_al} ({prop_un}%)\n' 85 | 'Total reads filtered:\t{n_bad} ({prop_bad}%)\n' 86 | '{divide}\n' 87 | ) 88 | 89 | @staticmethod 90 | def string_to_regex(summary: str=None) -> str: 91 | """ 92 | converts the contents of seqc.stats.ExperimentalYield.output into a regex object 93 | that may contain duplicate definitions 94 | :param summary: str, optional, a summary to convert 95 | :return summary: str, a regex object that may contain errors 96 | """ 97 | if not summary: 98 | summary = ExperimentalYield.output 99 | replacements = [ 100 | ('{divide}', '-*?'), 101 | ('(', '\('), 102 | (')', '\)'), 103 | ('{', '(?P<'), 104 | ('}', '>.*?)') 105 | ] 106 | for r in replacements: 107 | summary = summary.replace(r[0], r[1]) 108 | return summary 109 | 110 | @staticmethod 111 | def identify_duplicate_patterns(regex: str) -> dict: 112 | """ 113 | identifies replicated name patterns, which are not allowed in regex 114 | 115 | :param regex: str, pattern in which to find replicated assignments 116 | :return replicates: dict, contains names of replicated definition with values 117 | equal to the number of times each was replicated 118 | """ 119 | 120 | name_pattern = '(\(\?P<)(.*?)(>\.\*\?\))' 121 | patterns = set() 122 | replicates = defaultdict(int) 123 | for mo in re.finditer(name_pattern, regex): 124 | if mo.group(2) in patterns: 125 | replicates[mo.group(2)] += 1 126 | else: 127 | patterns.add(mo.group(2)) 128 | return replicates 129 | 130 | @staticmethod 131 | def replace_replicated_patterns(regex: str, duplicated_pattern: str) -> str: 132 | """ 133 | replace the second definition of pattern_name with a regex-compliant reference 134 | 135 | :param regex: str, regex containing replicated pattern 136 | :param duplicated_pattern: pattern_id 137 | :return regex: str, pattern without duplicate group definitions 138 | """ 139 | old = '(?P<{}>.*?)'.format(duplicated_pattern) 140 | new = '(?P={})'.format(duplicated_pattern) 141 | idx = regex.find(old) + len(old) 142 | return regex[:idx] + regex[idx:].replace(old, new) 143 | 144 | @classmethod 145 | def dictionary_to_dataframe(cls, groupdict, col_label) -> pd.DataFrame: 146 | """ 147 | Warning: This function contains summary-specific information and may break or 148 | need to be modified when the summary is changed. 
This function translates 149 | the format parameters into interpretable columns in the dataframe 150 | 151 | :param groupdict: result of groupdict() call on match object generated by the 152 | parse_log() classmethod 153 | :param col_label: name of log file 154 | :return: pd.DataFrame containing log data 155 | """ 156 | index = ( 157 | ('total', 'input_reads'), 158 | ('total', 'reads_aligned'), 159 | ('aligned', 'genomic'), 160 | ('aligned', 'phi_x'), 161 | ('aligned', 'transcriptome'), 162 | ('filtered', 'genomic'), 163 | ('filtered', 'phi_x'), 164 | ('filtered', 'incorrect_barcodes'), 165 | ('filtered', 'no_barcodes'), 166 | ('filtered', 'CB_contains_N'), 167 | ('filtered', 'RMT_contains_N'), 168 | ('filtered', 'broken_capture_primer'), 169 | ('filtered', 'low_complexity'), 170 | ('summary', 'reads_retained'), 171 | ('summary', 'reads_not_aligned'), 172 | ('summary', 'reads_filtered'), 173 | ('summary', 'total_molecules') 174 | ) 175 | data_list = ('n_fastq', 'n_sam', 'genomic', 'phi_x', 'trans', 'genomic', 'phi_x', 176 | 'wrong_cb', 'no_cell', 'cell_N', 'rmt_N', 'poly_t', 'dust', 'n_good', 177 | 'lost_al', 'n_bad', 'tot_mc') 178 | 179 | # account for older log version 180 | if groupdict['wrong_cb'] == 'NA': 181 | groupdict['wrong_cb'] = 0 182 | 183 | data = list(map(lambda x: float(groupdict[x]), data_list)) 184 | 185 | spec_index, spec_data = cls.parse_special_fields(groupdict) 186 | 187 | index = pd.MultiIndex.from_tuples(index + spec_index) 188 | return pd.DataFrame(data + spec_data, index, columns=[col_label]) 189 | 190 | @staticmethod 191 | def parse_special_fields(groupdict: dict) -> (tuple, list): 192 | """ 193 | extracts information from special fields in run summary and returns 194 | a tuple index suitable for multiindex creation and string representations of 195 | data. 196 | 197 | :param groupdict: result of groupdict() call on match object generated by the 198 | parse_log() classmethod 199 | :returns index, data_list: (tuple, list) 200 | """ 201 | lost_pattern = ( 202 | "^\[\('low_count', (?P[0-9]+)\), " 203 | "\('low_coverage', (?P[0-9]+)\), " 204 | "\('high_mt', (?P[0-9]+)\), " 205 | "\('low_gene_detection', (?P[0-9]+)\)\]$") 206 | 207 | summary_pattern = ( 208 | "^count\s+(?P[0-9]+\.[0-9]+)\s" 209 | "mean\s+(?P[0-9]+\.[0-9]+)\s" 210 | "std\s+(?P[0-9]+\.[0-9]+)\s" 211 | "min\s+(?P[0-9]+\.[0-9]+)\s" 212 | "25%\s+(?P[0-9]+\.[0-9]+)\s" 213 | "50%\s+(?P[0-9]+\.[0-9]+)\s" 214 | "75%\s+(?P[0-9]+\.[0-9]+)\s" 215 | "max\s+(?P[0-9]+\.[0-9]+)\s?") 216 | 217 | cell = re.match(lost_pattern, groupdict['cells_lost'], re.M).groupdict() 218 | mols = re.match(lost_pattern, groupdict['mols_lost'], re.M).groupdict() 219 | desc = re.match(summary_pattern, groupdict['cell_desc'], re.M).groupdict() 220 | 221 | if not all((cell, mols, desc)): 222 | raise ValueError('Regex failed to match log. 
Please check that you are using ' 223 | 'a matched log/seqc pair.') 224 | 225 | index = ( 226 | ('molecules_lost', 'low_count'), 227 | ('molecules_lost', 'low_coverage'), 228 | ('molecules_lost', 'high_mt'), 229 | ('molecules_lost', 'low_gene_detection'), 230 | ('cells_lost', 'low_count'), 231 | ('cells_lost', 'low_coverage'), 232 | ('cells_lost', 'high_mt'), 233 | ('cells_lost', 'low_gene_detection'), 234 | ('cell_summary', 'count'), 235 | ('cell_summary', 'mean'), 236 | ('cell_summary', 'std'), 237 | ('cell_summary', 'min'), 238 | ('cell_summary', '25%'), 239 | ('cell_summary', '50%'), 240 | ('cell_summary', '75%'), 241 | ('cell_summary', 'max') 242 | ) 243 | data_list = ( 244 | mols['low_count'], mols['low_coverage'], mols['high_mt'], 245 | mols['low_gene_detection'], cell['low_count'], cell['low_coverage'], 246 | cell['high_mt'], cell['low_gene_detection'], desc['count'], desc['mean'], 247 | desc['std'], desc['min'], desc['low_quartile'], desc['median'], 248 | desc['high_quartile'], desc['max']) 249 | data_list = list(map(lambda x: float(x), data_list)) 250 | return index, data_list 251 | 252 | @classmethod 253 | def match_log(cls, log_file: str, pattern: str=None) -> dict: 254 | """ 255 | create a dictionary to hold data from SEQC summary. 256 | 257 | :param log_file: name of the seqc log to extract information from 258 | :param pattern: str, optional, the value of seqc.stats.ExperimentYield.output. 259 | useful to parse seqc logs from older versions 260 | :return match_results: dict, argument names and values from seqc.log summary 261 | """ 262 | if pattern is None: 263 | pattern = cls.string_to_regex() 264 | 265 | def get_match_object(pattern_): 266 | duplicates = cls.identify_duplicate_patterns(pattern_) 267 | 268 | for k, v in duplicates.items(): 269 | pattern_ = cls.replace_replicated_patterns(pattern_, k) 270 | 271 | # add beginning and end wildcards 272 | pattern_ = '^.*?' + pattern_ + '.*?$' 273 | with open(log_file, 'r') as f: 274 | summary_data = f.read() 275 | mo = re.match(pattern_, summary_data, re.M | re.DOTALL) 276 | match_results = mo.groupdict() 277 | return match_results 278 | 279 | try: 280 | data = get_match_object(pattern) 281 | except AttributeError: 282 | data = get_match_object(cls.string_to_regex(cls._oldver)) 283 | return data 284 | 285 | @classmethod 286 | def parse_log(cls, logfile: str) -> pd.DataFrame: 287 | """ 288 | parse a SEQC log into a pd.DataFrame column with a multi-index corresponding to 289 | the RUN SUMMARY section of the seqc.log object. 290 | 291 | :param logfile: str, path to log file 292 | :returns df: pd.DataFrame, dataframe containing log information 293 | """ 294 | mo = LogData.match_log(logfile) 295 | return LogData.dictionary_to_dataframe( 296 | mo, logfile.split('/')[-1].replace('.log', '')) 297 | 298 | @classmethod 299 | def parse_multiple(cls, directory: str, exclude: str='') -> pd.DataFrame: 300 | """ 301 | parse multiple SEQC logs into a pd.DataFrame object with a multi-index 302 | corresponding to the RUN SUMMARY section of the seqc.log object and a column 303 | multi-index corresponding to the directory hierarchy containing each log. 304 | 305 | This function takes a root directory and parses each log within the directory and 306 | all sub-directories. logs matching exclude pattern are omitted. 
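        The exclude pattern is tested with re.match against each log's full filepath,
        so a log is skipped whenever the pattern matches at the beginning of its path.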
307 | 308 | :param directory: str, root directory to search for logs 309 | :param exclude: regex pattern to exclude log names 310 | :returns df: pd.DataFrame, dataframe containing log information 311 | """ 312 | logs = [] 313 | for path, subdirs, files in os.walk(directory): 314 | for name in files: 315 | filepath = os.path.join(path, name) 316 | if filepath.endswith('.log') and re.match(exclude, filepath) is None: 317 | logs.append(filepath) 318 | 319 | frames = [cls.parse_log(f) for f in logs] 320 | 321 | # create column index 322 | cols = pd.MultiIndex.from_tuples(list(map(lambda p: tuple(p.split('/')), logs))) 323 | df = pd.concat(frames, 1) 324 | df.columns = cols 325 | return df 326 | -------------------------------------------------------------------------------- /src/seqc/multialignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools 3 | import time 4 | import seqc 5 | 6 | 7 | class UnionFind: 8 | """Union-find data structure. 9 | 10 | Each unionFind instance X maintains a family of disjoint sets of 11 | hashable objects, supporting the following two methods: 12 | 13 | - X[item] returns a name for the set containing the given item. 14 | Each set is named by an arbitrarily-chosen one of its members; as 15 | long as the set remains unchanged it will keep the same name. If 16 | the item is not yet part of a set in X, a new singleton set is 17 | created for it. 18 | 19 | - X.union(item1, item2, ...) merges the sets containing each item 20 | into a single larger set. If any item is not yet part of a set 21 | in X, it is added to X as one of the members of the merged set. 22 | """ 23 | 24 | def __init__(self): 25 | """Create a new empty union-find structure.""" 26 | self.weights = {} 27 | self.parents = {} 28 | 29 | def __getitem__(self, obj): 30 | """Find and return the name of the set containing the object.""" 31 | 32 | # check for previously unknown object 33 | if obj not in self.parents: 34 | self.parents[obj] = obj 35 | self.weights[obj] = 1 36 | return obj 37 | 38 | # find path of objects leading to the root 39 | path = [obj] 40 | root = self.parents[obj] 41 | while root != path[-1]: 42 | path.append(root) 43 | root = self.parents[root] 44 | 45 | # compress the path and return 46 | for ancestor in path: 47 | self.parents[ancestor] = root 48 | return root 49 | 50 | def __iter__(self): 51 | """Iterate through all items ever found or unioned by this structure.""" 52 | return iter(self.parents) 53 | 54 | def union(self, *objects): 55 | """Find the sets containing the objects and merge them all.""" 56 | roots = [self[x] for x in objects] 57 | heaviest = max([(self.weights[r], r) for r in roots])[1] 58 | for r in roots: 59 | if r != heaviest: 60 | self.weights[heaviest] += self.weights[r] 61 | self.parents[r] = heaviest 62 | 63 | def union_all(self, iterable): 64 | for i in iterable: 65 | self.union(*i) 66 | 67 | def find_all(self, vals): 68 | vals = [self.find_component(v) for v in vals] 69 | unique = set(vals) 70 | reindex = dict(zip(unique, range(len(unique)))) 71 | set_membership = np.array([reindex[v] for v in vals]) 72 | sets = np.array(list(reindex.values())) 73 | return set_membership, sets 74 | 75 | def find_component(self, iterable): 76 | """Return the set that obj belongs to 77 | 78 | If the iterable contains items that have been unioned, then any entry in the 79 | iterable will be sufficient to identify the set that obj belongs to. 
Use the 80 | first entry, and return the set associated with iterable. 81 | 82 | If the iterable has not been entered into the structure, this method can yield 83 | incorrect results 84 | """ 85 | return self[next(iter(iterable))] 86 | 87 | def intersection(set_l): 88 | res = set_l[0] 89 | for s in set_l: 90 | res = set(set(res) & set(s)) 91 | return res 92 | 93 | 94 | # # Some constants 95 | # NO_DISAMBIGUATION = 0 96 | # RESOLVED_GENE = 1 97 | # NO_GENE_RESOLVED = 2 98 | # MULTIPLE_MODELS = 3 99 | 100 | 101 | # def reduce_coalignment_array(arr, threshold = 0.0001): 102 | # res = {} 103 | # for g in arr: 104 | # temp = {} 105 | # for k in arr[g]: 106 | # if arr[g][k] < threshold: 107 | # continue 108 | # temp[tuple(sorted(k))] = arr[g][k] 109 | # if len(temp)>0: 110 | # res[g] = temp 111 | # return res 112 | 113 | # #def strip(genes): 114 | # # return tuple(sorted([int(g[2:]) for g in genes])) 115 | # def strip(genes): 116 | # return tuple(sorted(genes)) 117 | # def strip_model(mod): 118 | # res = {} 119 | # for k in mod: 120 | # res[tuple(sorted(k))]=mod[k] 121 | # return res 122 | 123 | # def split_to_disjoint(obs): 124 | # res = [] 125 | # uf = UnionFind() 126 | # uf.union_all(obs.keys()) 127 | # set_membership, sets = uf.find_all(obs.keys()) 128 | 129 | # for s in sets: 130 | # d = {} 131 | # for k in np.array(list(obs.keys()))[set_membership == s]: 132 | # d[tuple(k)] = obs[tuple(k)] 133 | # res.append(d) 134 | # return res 135 | 136 | # def get_indices(inds, obs_subset): 137 | # res = [] 138 | # for genes in obs_subset: 139 | # res += inds[genes] 140 | # return res 141 | 142 | # def model_to_gene(model): 143 | # for g in model: 144 | # if model[g]==1: 145 | # return g 146 | 147 | 148 | # def get_combinations(l): 149 | # res = [] 150 | # for i in range(len(l)): 151 | # res += itertools.combinations(l,i+1) 152 | # return res 153 | 154 | # # rank the different possible models by their scores 155 | # def best_fit_model(obs_s, coalignment_mat): 156 | # #obs_s = strip_model(obs) 157 | # gene_l = single_gene_list(obs_s) # From the list of observation create a list of unique single genes from which different models can be inferred 158 | 159 | 160 | # if len(obs_s) == 1: 161 | # if len(list(obs_s.keys())[0]) == 1: 162 | # return [{gene_l[0]:1}], NO_DISAMBIGUATION 163 | 164 | # possible_genes = intersection(list(obs_s.keys())) 165 | 166 | # #There is one gene that resolve the disambiguation 167 | # if len(possible_genes) == 1: 168 | # model = {} 169 | # for g in gene_l: 170 | # model[g] = 0 171 | # model[list(possible_genes)[0]] = 1 172 | # return [model], RESOLVED_GENE 173 | 174 | # #There is more than one gene that can explain it, no model can be decided 175 | # if len(possible_genes) > 1: 176 | # return [], NO_GENE_RESOLVED 177 | 178 | # #There are multiple competing models. 
For now we don't decide bewteen them 179 | # return [], MULTIPLE_MODELS 180 | # # mod_score_list = [] 181 | # # for mod in get_combinations(gene_l): 182 | # # model = {} 183 | # # for k in gene_l: 184 | # # if k in mod: 185 | # # model[k] = 1 186 | # # else: 187 | # # model[k] = 0 188 | # # score = model_score(model, obs_s, coalignment_mat) 189 | # # mod_score_list.append((model,score)) 190 | 191 | # #Here to decide if there is one model that's obviously better 192 | # # return mod_score_list, MULTIPLE_MODELS 193 | 194 | # # get a model and returns its likelihood score comparing the expected number of reads and the observed 195 | # # model is basically just a bool dic of all the unique genes with flags of wether or not they're in model 196 | # # observed is a dictionary of all gene combinations and their expected proportion 197 | # # coalignment_mat is the coalignment matrix used to calculate the expected number of reads 198 | # # eg: 199 | # # model - {A:1, B:0} 200 | # # observed - {A: 100 B:50, AB: 30 } 201 | # # 202 | # def model_score(model, observed, coalignment_mat): 203 | # exp = {} 204 | # tot = {} 205 | # for gene in model: 206 | # # patch for SC000 207 | # if gene==0: 208 | # tot[gene] = model[gene]*observed[gene,] 209 | # # Theres a common edge case where a gene A will only be aligned with other genes as well, in this case we update our observation vector to include A:0 210 | # elif (gene, ) not in observed: 211 | # tot[gene] = 0 212 | # elif gene not in coalignment_mat: 213 | # raise KeyError('{} not found in coalignment matrix'.format(gene)) 214 | # elif (gene, ) not in coalignment_mat[gene]: 215 | # tot[gene] = 0 216 | # else: 217 | # tot[gene] = model[gene]*(observed[gene,]/coalignment_mat[gene][gene,]) 218 | 219 | # keys = get_combinations(model.keys()) #get a list of all possible molecule combinations 220 | 221 | # # key is a set of genes and the expected number of reads for it is the sum of expected reads from all genes shared by the key, 222 | # # these in turn are the total reads for a gene (extrapoletaed from the uniqely mapped) multiplied by the coalignment factor (present in the coalignment matrix) 223 | # # e.g. if A has 20% coalignment with B and there are 80 reads mapped uniquely to A, we expect 80/0.8 * 0.2 = 20 reads to be mapped to AB from A (and more from B) 224 | # for k in keys: 225 | # k = tuple(sorted(k)) 226 | # sum = 0 227 | # for gene in k: 228 | # #Patch for SC000 229 | # if gene==0: 230 | # if k==(0,): 231 | # sum=1 232 | # else: 233 | # sum = 0 234 | # ##### 235 | # elif k in coalignment_mat[gene]: 236 | # sum += tot[gene]*coalignment_mat[gene][k] 237 | # exp[k] = sum 238 | 239 | # score = calc_score(observed, exp) 240 | # return score 241 | 242 | # def calc_score(obs, exp): 243 | # sum = 0 244 | # for k in obs: 245 | # if k not in exp: 246 | # print(k) 247 | # k = tuple(sorted(k)) 248 | # print ('bad key') 249 | # diff = (obs[k]-exp[k])**2 250 | # if exp[k]!=0: 251 | # diff /= exp[k] 252 | # sum += diff 253 | # return sum 254 | 255 | # #Get a dictionary of observations per gene/s and return a list of single unique genes 256 | # def single_gene_list(obs): 257 | # l = [] 258 | # for genes in obs: 259 | # for g in genes: 260 | # l.append(g) 261 | # return list(set(l)) 262 | 263 | 264 | -------------------------------------------------------------------------------- /src/seqc/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | -------------------------------------------------------------------------------- /src/seqc/notebooks/notebooks.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, FileSystemLoader 2 | import os 3 | import pandas as pd 4 | import tempfile 5 | 6 | import nbformat 7 | from nbconvert.preprocessors import ExecutePreprocessor 8 | 9 | 10 | class Notebook: 11 | 12 | def __init__(self, output_stem: str, *data): 13 | 14 | # strip notebook affix if user provided it; this is a common error mode 15 | if output_stem.endswith('.ipynb'): 16 | output_stem = output_stem.replace('.ipynb', '') 17 | self._output_stem = output_stem 18 | 19 | self._data = data 20 | self._this_dir = os.path.dirname(os.path.abspath(__file__)) 21 | 22 | @property 23 | def notebook_path(self): 24 | return self._output_stem + '.ipynb' 25 | 26 | @property 27 | def merged_data(self): 28 | if isinstance(self._data, str): 29 | if os.path.isfile(self._data): 30 | return os.path.abspath(self._data) 31 | elif isinstance(self._data, (list, tuple)) and isinstance(self._data[0], str): 32 | if os.path.isfile(self._data[0]): 33 | return os.path.abspath(self._data[0]) 34 | raise TypeError('Data is not a 1-length iterable or string that contains a filepath') 35 | 36 | def merge_data(self, merged_sample_name=None, remove_unmerged=False): 37 | """ 38 | This function will merge any datasets provided as nested lists. 39 | Each top-level value is considered an input alias. 40 | Any second-level list is considered to be a group of files to be joined 41 | 42 | :param bool remove_unmerged: if True, this function will delete the unmerged files after 43 | completion 44 | :param str merged_sample_name: name of merged csv file 45 | :return None: The list of merged file names will replace the list passed to the class in 46 | self._datasets 47 | """ 48 | dfs = [pd.read_csv(csv, index_col=0) for csv in self._data] 49 | df = pd.concat( 50 | dfs, 51 | keys=list(range(len(self._data))), 52 | names=['sample_number', 'cell_id'] 53 | ) 54 | 55 | if not merged_sample_name: 56 | merged_sample_name = self._output_stem + '_merged_data.csv' 57 | df.to_csv(merged_sample_name) 58 | 59 | # delete original files, if requested 60 | if remove_unmerged: 61 | for csv in self._data: 62 | os.remove(csv) 63 | 64 | # update file urns 65 | self._data = merged_sample_name 66 | 67 | def write_template(self): 68 | """write a filled ipython notebook to disk 69 | 70 | :return: 71 | """ 72 | 73 | j2_env = Environment(loader=FileSystemLoader(self._this_dir), trim_blocks=True) 74 | rendered = j2_env.get_template('analysis_template.json').render( 75 | output_stem=self._output_stem, 76 | data=os.path.abspath(self.merged_data), 77 | ) 78 | with open(self._output_stem + '.ipynb', 'w') as fdw: 79 | fdw.write(rendered) 80 | 81 | def run_notebook(self, notebook_filename=None): 82 | 83 | if not notebook_filename: 84 | notebook_filename = self._output_stem + '.ipynb' 85 | 86 | dir_ = os.getcwd() 87 | with open(notebook_filename) as f: 88 | nb = nbformat.read(f, as_version=4) 89 | 90 | ep = ExecutePreprocessor(timeout=600, kernel_name='python3') 91 | ep.preprocess(nb, {'metadata': {'path': dir_}}) 92 | 93 | with open(notebook_filename, 'wt') as f: 94 | nbformat.write(nb, f) 95 | -------------------------------------------------------------------------------- /src/seqc/notebooks/test_notebooks.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | import tempfile 3 | import pytest 4 | import numpy as np 5 | import pandas as pd 6 | import uuid 7 | import os 8 | from seqc.core import main 9 | 10 | 11 | @pytest.fixture() 12 | def testing_data(): 13 | dir_ = tempfile.mkdtemp() 14 | test_data = [np.random.randint(10, 110, (100, 100)) for _ in range(4)] 15 | test_files = [] 16 | for f in test_data: 17 | filename = '{}/{}'.format(dir_, uuid.uuid4()) 18 | pd.DataFrame(f).to_csv(filename) 19 | test_files.append(filename) 20 | return test_files 21 | 22 | 23 | @pytest.fixture() 24 | def merged_data(testing_data): 25 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 26 | n = notebooks.Notebook(output_stem, *testing_data) 27 | n.merge_data() 28 | return n.merged_data 29 | 30 | 31 | def test_template_filling(testing_data): 32 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 33 | n = notebooks.Notebook(output_stem, *testing_data) 34 | n.merge_data() 35 | n.write_template() 36 | n.run_notebook() 37 | print(os.listdir(os.path.dirname(output_stem))) 38 | 39 | 40 | def test_merge_api(testing_data): 41 | output_filename = os.path.join(tempfile.mkdtemp(), 'test_notebooks.ipynb') 42 | args = ['notebook', 'merge', '-o', output_filename, '-i'] + testing_data 43 | main.main(args) 44 | 45 | 46 | def test_generate_api(merged_data): 47 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 48 | args = ['notebook', 'generate', '-o', output_stem, '-i', merged_data] 49 | main.main(args) 50 | 51 | -------------------------------------------------------------------------------- /src/seqc/reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import bz2 4 | 5 | 6 | class Reader: 7 | """ 8 | Basic reader object that seamlessly loops over multiple input files 9 | 10 | Can be subclassed to create readers for specific file types (fastq, gtf, etc.) 11 | """ 12 | 13 | def __init__(self, files_): 14 | 15 | if isinstance(files_, list): 16 | self._files = files_ 17 | elif isinstance(files_, str): 18 | self._files = [files_] 19 | else: 20 | raise TypeError('files_ must be a string filename or a list of such names.') 21 | 22 | @property 23 | def filenames(self): 24 | return self._files 25 | 26 | def __len__(self): 27 | """ 28 | return the length of the Reader object. This depends on the implementation of 29 | self.__iter__(); it does not necessarily represent the length of the file in 30 | lines. 
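        Note that this requires a full pass over every underlying file, so calling
        len() on large inputs can be slow.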
31 | """ 32 | return sum(1 for _ in self) 33 | 34 | def __iter__(self): 35 | for f in self._files: 36 | if f.endswith('.gz'): 37 | file_input = gzip.open(f, 'rb') 38 | elif f.endswith('.bz2'): 39 | file_input = bz2.open(f, 'rb') 40 | else: 41 | file_input = open(f, 'rb') 42 | for record in file_input: 43 | yield record 44 | file_input.close() 45 | 46 | @property 47 | def size(self) -> int: 48 | """return the collective size of all files being read in bytes""" 49 | return sum(os.stat(f).st_size for f in self._files) 50 | -------------------------------------------------------------------------------- /src/seqc/rmt_correction.py: -------------------------------------------------------------------------------- 1 | from scipy.special import gammainc 2 | from seqc.sequence.encodings import DNA3Bit 3 | import numpy as np 4 | from seqc import log 5 | from seqc.read_array import ReadArray 6 | import time 7 | import pandas as pd 8 | import multiprocessing as multi 9 | from itertools import repeat 10 | import ctypes 11 | from contextlib import closing 12 | from functools import partial 13 | 14 | # todo document me 15 | def generate_close_seq(seq): 16 | """ Return a list of all sequences that are up to 2 hamm distance from seq 17 | :param seq: 18 | """ 19 | res = [] 20 | l = DNA3Bit.seq_len(seq) 21 | 22 | # generate all sequences that are dist 1 23 | for i in range(l): 24 | mask = 0b111 << (i * 3) 25 | cur_chr = (seq & mask) >> (i * 3) 26 | res += [seq & (~mask) | (new_chr << (i * 3)) 27 | for new_chr in DNA3Bit.bin2strdict.keys() if new_chr != cur_chr] 28 | # generate all sequences that are dist 2 29 | for i in range(l): 30 | mask_i = 0b111 << (i * 3) 31 | chr_i = (seq & mask_i) >> (i * 3) 32 | for j in range(i + 1, l): 33 | mask_j = 0b111 << (j * 3) 34 | chr_j = (seq & mask_j) >> (j * 3) 35 | mask = mask_i | mask_j 36 | res += [seq & (~mask) | (new_chr_i << (i * 3)) | (new_chr_j << (j * 3)) for 37 | new_chr_i in DNA3Bit.bin2strdict.keys() if new_chr_i != chr_i for 38 | new_chr_j in DNA3Bit.bin2strdict.keys() if new_chr_j != chr_j] 39 | 40 | return res 41 | 42 | 43 | # todo document me 44 | def probability_for_convert_d_to_r(d_seq, r_seq, err_rate): 45 | """ 46 | Return the probability of d_seq turning into r_seq based on the err_rate table 47 | (all binary) 48 | 49 | :param err_rate: 50 | :param r_seq: 51 | :param d_seq: 52 | """ 53 | 54 | if DNA3Bit.seq_len(d_seq) != DNA3Bit.seq_len(r_seq): 55 | return 1 56 | 57 | p = 1.0 58 | while d_seq > 0: 59 | if d_seq & 0b111 != r_seq & 0b111: 60 | if isinstance(err_rate,float): 61 | p *= err_rate 62 | else: 63 | p *= err_rate[(d_seq & 0b111, r_seq & 0b111)] 64 | d_seq >>= 3 65 | r_seq >>= 3 66 | return p 67 | 68 | 69 | def in_drop(read_array, error_rate, alpha=0.05): 70 | """ Tag any RMT errors 71 | 72 | :param read_array: Read array 73 | :param error_rate: Sequencing error rate determined during barcode correction 74 | :param alpha: Tolerance for errors 75 | """ 76 | 77 | global ra 78 | global indices_grouped_by_cells 79 | 80 | ra = read_array 81 | indices_grouped_by_cells = ra.group_indices_by_cell() 82 | _correct_errors(error_rate, alpha) 83 | 84 | 85 | # a method called by each process to correct RMT for each cell 86 | def _correct_errors_by_cell_group(err_rate, p_value, cell_index): 87 | 88 | cell_group = indices_grouped_by_cells[cell_index] 89 | # Breaks for each gene 90 | gene_inds = cell_group[np.argsort(ra.genes[cell_group])] 91 | breaks = np.where(np.diff(ra.genes[gene_inds]))[0] + 1 92 | splits = np.split(gene_inds, breaks) 93 | rmt_groups = {} 
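    # res collects flat (read_index, donor_read_index) pairs: for each read whose RMT
    # is judged to be a sequencing error of a neighbouring (donor) RMT, _correct_errors
    # later rewrites the RMT from the donor read and flags the read as an rmt_error.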
94 | res = [] 95 | 96 | for inds in splits: 97 | # RMT groups 98 | for ind in inds: 99 | rmt = ra.data['rmt'][ind] 100 | try: 101 | rmt_groups[rmt].append(ind) 102 | except KeyError: 103 | rmt_groups[rmt] = [ind] 104 | 105 | if len(rmt_groups) == 1: 106 | continue 107 | 108 | # This logic retains RMTs with N if no donor is found and contributes to the 109 | # molecule count 110 | for rmt in rmt_groups.keys(): 111 | 112 | # Enumerate all possible RMTs with hamming distances 1 and/or 2 113 | # to build a probablitiy that this particular RMT was not an error 114 | # Simulatenously, check if Jaitin error correction can be applied 115 | jaitin_corrected = False 116 | expected_errors = 0 117 | for donor_rmt in generate_close_seq(rmt): 118 | 119 | # Check if donor is detected 120 | try: 121 | donor_count = len(rmt_groups[donor_rmt]) 122 | except KeyError: 123 | continue 124 | 125 | # Build likelihood 126 | # Probability of converting donor to target 127 | p_dtr = probability_for_convert_d_to_r(donor_rmt, rmt, err_rate) 128 | # Number of occurrences 129 | expected_errors += donor_count * p_dtr 130 | 131 | # Check if jaitin correction is feasible 132 | if not jaitin_corrected: 133 | ref_positions = ra.positions[rmt_groups[rmt]] 134 | donor_positions = ra.positions[rmt_groups[donor_rmt]] 135 | 136 | # Is reference a subset of the donor ? 137 | if (set(ref_positions)).issubset(donor_positions): 138 | jaitin_corrected = True 139 | jaitin_donor = donor_rmt 140 | 141 | # Probability that the RMT is an error 142 | p_val_err = gammainc(len(rmt_groups[rmt]), expected_errors) 143 | 144 | # Remove Jaitin corrected reads if probability of RMT == error is high 145 | if p_val_err > p_value and jaitin_corrected: 146 | # Save the RMT donor 147 | # save the index of the read and index of donor rmt read 148 | for i in rmt_groups[rmt]: 149 | res.append(i) 150 | res.append(rmt_groups[jaitin_donor][0]) 151 | 152 | rmt_groups.clear() 153 | 154 | return res 155 | 156 | 157 | def _correct_errors(err_rate, p_value=0.05): 158 | #Calculate and correct errors in RMTs 159 | with multi.Pool(processes=multi.cpu_count()) as p: 160 | p = multi.Pool(processes=multi.cpu_count()) 161 | results = p.starmap(_correct_errors_by_cell_group, 162 | zip(repeat(err_rate), repeat(p_value), range(len(indices_grouped_by_cells)))) 163 | p.close() 164 | p.join() 165 | 166 | # iterate through the list of returned read indices and donor rmts 167 | for i in range(len(results)): 168 | res = results[i] 169 | if len(res) > 0: 170 | for i in range(0, len(res), 2): 171 | ra.data['rmt'][res[i]] = ra.data['rmt'][res[i+1]] 172 | ra.data['status'][res[i]] |= ra.filter_codes['rmt_error'] -------------------------------------------------------------------------------- /src/seqc/run_mast.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(MAST)) 2 | suppressPackageStartupMessages({library(data.table)}) 3 | options(mc.cores = 1) # gives me error messages when I use > 1 4 | 5 | loadData <- function(input_data) { 6 | df <- (read.csv(input_data, row.names=NULL)) 7 | } 8 | 9 | extractConditions <- function(df) { 10 | # extract conditions (sg) from column names of the df 11 | sg <- factor(unlist(df[1])) 12 | return(sg) 13 | } 14 | 15 | annotateDF <- function(df, sg) { 16 | df[1] <- NULL 17 | df <- t(df) 18 | names(df) <- sg 19 | return(df) 20 | } 21 | 22 | runMAST <- function(df, sg) { 23 | # extract columns and row information 24 | # add a cell number column to avoid duplicate row names 25 | wellKey <- 
seq_len(dim(df)[2]) 26 | wellKey <- lapply(wellKey, toString) 27 | condition <- as.numeric(unlist(as.list(sg))) 28 | cdata <- data.frame(cbind(wellKey=wellKey, condition=condition)) 29 | fdata <- data.frame(primerid=row.names(df)) 30 | 31 | # create the sca object. Note that we do filtering before 32 | # we create the test matrix, so no additional filtering of cells is added here 33 | exprsArray <- as.matrix(df) 34 | dimnames(exprsArray)[[2]] <- cdata$wellKey 35 | sca <- FromMatrix(exprsArray, cdata, fdata) 36 | 37 | # calculate cellular detection rate 38 | cdr2 <-colSums(assay(sca)>0) 39 | colData(sca)$cngeneson <- scale(cdr2) 40 | colData(sca)$cond <- as.numeric(unlist(as.list(sg))) 41 | 42 | # carry out DE analysis 43 | zlmCond <- zlm.SingleCellAssay(~cond + cngeneson, sca) 44 | #res <- lrTest(zlmCond, CoefficientHypothesis("cond")) 45 | 46 | #only test the cluster coefficient. 47 | summaryCond <- summary(zlmCond, doLRT=TRUE) 48 | summaryDt <- summaryCond$datatable 49 | fcHurdle <- merge(summaryDt[contrast=='cond' & component=='H',.(primerid, `Pr(>Chisq)`)], summaryDt[contrast=='cond' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') 50 | 51 | fcHurdle <- fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')] 52 | fcHurdleSig <- fcHurdle[(fdr<=0.05) & (abs(coef)>=log2(1.25)) ] 53 | setorder(fcHurdleSig, fdr) 54 | 55 | return(fcHurdleSig) 56 | } 57 | 58 | saveResult <- function(result, filename) { 59 | resultDf <- as.data.frame(result) 60 | colnames(resultDf)[1] = 'gene' 61 | colnames(resultDf)[2] = 'p' 62 | colnames(resultDf)[3] = 'logFC' 63 | colnames(resultDf)[6] = 'p.fdr.adj' 64 | resultDf <- resultDf[,c('gene','p','p.fdr.adj','logFC')] 65 | write.table(resultDf, file = filename, row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE) 66 | } 67 | 68 | testMAST <- function(input_filename, save_filename) { 69 | df <- loadData(input_filename) 70 | sg <- extractConditions(df) 71 | df <- annotateDF(df, sg) 72 | result <- runMAST(df, sg) 73 | saveResult(result, save_filename) 74 | } 75 | 76 | # args should be: 77 | # 1. input_filename 78 | # 2. 
output_filename 79 | 80 | args <- commandArgs(trailingOnly = TRUE) 81 | stopifnot(length(args) == 2) 82 | 83 | testMAST(args[1], args[2]) 84 | -------------------------------------------------------------------------------- /src/seqc/sequence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/sequence/__init__.py -------------------------------------------------------------------------------- /src/seqc/sequence/barcodes.py: -------------------------------------------------------------------------------- 1 | from seqc.sequence.encodings import DNA3Bit 2 | from sys import maxsize 3 | 4 | # todo document me 5 | def generate_hamming_dist_1(seq): 6 | """ Return a list of all sequences that are up to 1 hamming distance from seq 7 | :param seq: 8 | """ 9 | res = [] 10 | l = DNA3Bit.seq_len(seq) 11 | #=barcode 12 | 13 | # generate all sequences that are dist 1 14 | for i in range(l): 15 | mask = 0b111 << (i * 3) 16 | cur_chr = (seq & mask) >> (i * 3) 17 | res += [seq & (~mask) | (new_chr << (i * 3)) 18 | for new_chr in DNA3Bit.bin2strdict.keys() if new_chr != cur_chr] 19 | 20 | return res 21 | 22 | 23 | def find_correct_barcode(code, barcodes_list, exact_match=False): 24 | """ 25 | For a given barcode find the closest correct barcode to it from the list (limited to 26 | one ED), a string representing the error and the edit distance 27 | NOTE: for now this function looks for a barcode with ED==1 and does not bother 28 | looking for the minimum 29 | 30 | :param exact_match: 31 | :param barcodes_list: 32 | :param code: 33 | :returns: 34 | """ 35 | 36 | # Return the barcode if it exists 37 | if code in barcodes_list: 38 | return code, 0 39 | 40 | # If perfect match is required, return an error since the barcode does not appear 41 | # in the correct barcode list 42 | if exact_match: 43 | return 0, maxsize 44 | 45 | min_ed = maxsize 46 | cor_code = 0 47 | for bc in barcodes_list: 48 | hamm_d = hamming_dist_bin(code, bc) 49 | if hamm_d == 1: 50 | min_ed = 1 51 | cor_code = bc 52 | break 53 | if hamm_d < min_ed: 54 | min_ed = hamm_d 55 | cor_code = bc 56 | 57 | return cor_code, min_ed 58 | 59 | 60 | def hamming_dist_bin(c1, c2): 61 | """Return the hamming distance between two numbers representing a sequence (3 bits 62 | per base) 63 | 64 | :param c1: 65 | :param c2: 66 | :return: 67 | """ 68 | if DNA3Bit.seq_len(c1) != DNA3Bit.seq_len(c2): 69 | return maxsize 70 | d = 0 71 | while c1 > 0: 72 | if c1 & 0b111 != c2 & 0b111: 73 | d += 1 74 | c1 >>= 3 75 | c2 >>= 3 76 | return d 77 | 78 | 79 | def list_errors(s1, s2): 80 | """ 81 | Return the list of nucleotide transformations that turn s1 to s2. 82 | An error is a six bit int representing a two chr string of type "AG","CT", etc. 83 | 84 | :param s2: 85 | :param s1: 86 | 87 | :returns: 88 | """ 89 | 90 | # return the actual error 91 | err_list = [] 92 | while s1 > 0: 93 | if s1 & 0b111 != s2 & 0b111: 94 | err_list.append((s1 & 0b111, s2 & 0b111)) 95 | s1 >>= 3 96 | s2 >>= 3 97 | return err_list 98 | -------------------------------------------------------------------------------- /src/seqc/sequence/encodings.py: -------------------------------------------------------------------------------- 1 | 2 | class DNA3Bit(object): 3 | """ 4 | Compact 3-bit encoding scheme for sequence data. 
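    For illustration (values follow str2bindict below): DNA3Bit.encode('CAT') returns
    0b110100011, since C=0b110, A=0b100, T=0b011 and the first base occupies the most
    significant bits; DNA3Bit.decode(0b110100011) returns b'CAT'.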
5 | """ 6 | 7 | @staticmethod 8 | def bits_per_base(): 9 | return 3 10 | 11 | # TODO: The sam reader needs to be fixed so text files are read as text not binary 12 | str2bindict = {65: 0b100, 67: 0b110, 71: 0b101, 84: 0b011, 78: 0b111, 13 | 97: 0b100, 99: 0b110, 103: 0b101, 116: 0b011, 110: 0b111, 14 | 'A': 0b100, 'C': 0b110, 'G': 0b101, 'T': 0b011, 'N': 0b111, 15 | 'a': 0b100, 'c': 0b110, 'g': 0b101, 't': 0b011, 'n': 0b111} 16 | bin2strdict = {0b100: b'A', 0b110: b'C', 0b101: b'G', 0b011: b'T', 0b111: b'N'} 17 | 18 | @staticmethod 19 | def encode(b) -> int: 20 | """ 21 | Convert string nucleotide sequence into binary, note: string is stored so 22 | that the first nucleotide is in the MSB position 23 | 24 | :param bytes|str b: sequence containing nucleotides to be encoded 25 | """ 26 | res = 0 27 | for c in b: 28 | res <<= 3 29 | res += DNA3Bit.str2bindict[c] 30 | return res 31 | 32 | @staticmethod 33 | def decode(i: int) -> bytes: 34 | """ 35 | Convert binary nucleotide sequence into string 36 | 37 | :param i: int, encoded sequence to be converted back to nucleotides 38 | """ 39 | if i < 0: 40 | message = 'i must be an unsigned (positive) integer, not {0!s}'.format(i) 41 | raise ValueError(message) 42 | r = b'' 43 | while i > 0: 44 | r = DNA3Bit.bin2strdict[i & 0b111] + r 45 | i >>= 3 46 | return r 47 | 48 | # TODO: another ooption is to use i.bit_length and take into account preceding 0's 49 | @staticmethod 50 | def seq_len(i: int) -> int: 51 | """ 52 | Return the length of an encoded sequence based on its binary representation 53 | 54 | :param i: int, encoded sequence 55 | """ 56 | l = 0 57 | while i > 0: 58 | l += 1 59 | i >>= 3 60 | return l 61 | 62 | @staticmethod 63 | def contains(s: int, char: int) -> bool: 64 | """ 65 | return true if the char (bin representation) is contained in seq (binary 66 | representation) 67 | 68 | :param char: int, encoded character (one must be only one nucleotide) 69 | :param s: int, sequence of encoded nucleotides 70 | """ 71 | while s > 0: 72 | if char == (s & 0b111): 73 | return True 74 | s >>= 3 75 | return False 76 | 77 | @staticmethod 78 | def ints2int(ints): 79 | """ 80 | convert an iterable of sequences [i1, i2, i3] into a concatenated single integer 81 | 0bi1i2i3. In cases where the sequence is longer than 64 bits, python will 82 | transition seamlessly to a long int representation, however the user must be 83 | aware that downsteam interaction with numpy or other fixed-size representations 84 | may not function 85 | 86 | :param ints: iterable of encoded sequences to concatenate 87 | """ 88 | 89 | res = 0 90 | for num in ints: 91 | tmp = num 92 | # Get length of next number to concatenate (with enough room for leading 0's) 93 | while tmp > 0: 94 | res <<= 3 95 | tmp >>= 3 96 | res += num 97 | return res 98 | 99 | @staticmethod 100 | def count(seq, char_bin): 101 | """ 102 | count how many times char is in seq. 103 | char needs to be an encoded value of one of the bases. 
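        For example, DNA3Bit.count(DNA3Bit.encode('GATTACA'), DNA3Bit.str2bindict['A'])
        returns 3.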
104 | """ 105 | if char_bin not in DNA3Bit.bin2strdict.keys(): 106 | raise ValueError("DNA3Bit.count was called with an invalid char code - " 107 | "{}".format(char_bin)) 108 | res = 0 109 | while seq > 0: 110 | if seq & 0b111 == char_bin: 111 | res += 1 112 | seq >>= 3 113 | return res 114 | 115 | 116 | # TODO: this was written for tests, not sure it's being used anymore 117 | # @staticmethod 118 | # def gc_content(i: int) -> float: 119 | # """ 120 | # calculates percentage of nucleotides in i that is G or C# 121 | # 122 | # :param i: int, encoded sequence 123 | # """ 124 | # gc = 0 125 | # length = 0 126 | # while i > 0: 127 | # length += 1 128 | # masked = i & 111 129 | # if masked == 0b100 or masked == 0b100: 130 | # gc += 1 131 | # i >>= 3 132 | # return gc / length 133 | -------------------------------------------------------------------------------- /src/seqc/sequence/fastq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from seqc import reader 4 | 5 | 6 | class FastqRecord: 7 | """Fastq record object 8 | 9 | Defines several properties for accessing fastq record information: 10 | :property name: name field 11 | :property sequence: sequence field 12 | :property name2: second name field 13 | :property quality: quality field 14 | 15 | Also defines several methods for accessing SEQC annotation fields: 16 | :property annotations: list of annotations 17 | :property metadata: dictionary of read metadata (if any present) 18 | :property average_quality: return the mean quality of FastqRecord 19 | """ 20 | 21 | __slots__ = ['_data'] 22 | 23 | def __init__(self, record: [bytes, bytes, bytes, bytes]): 24 | self._data = list(record) 25 | 26 | @property 27 | def name(self) -> bytes: 28 | return self._data[0] 29 | 30 | @name.setter 31 | def name(self, value: bytes): 32 | self._data[0] = value 33 | 34 | @property 35 | def sequence(self) -> bytes: 36 | return self._data[1] 37 | 38 | @sequence.setter 39 | def sequence(self, value: bytes): 40 | self._data[1] = value 41 | 42 | @property 43 | def name2(self) -> bytes: 44 | return self._data[2] 45 | 46 | @name2.setter 47 | def name2(self, value: bytes): 48 | self._data[2] = value 49 | 50 | @property 51 | def quality(self) -> bytes: 52 | return self._data[3] 53 | 54 | @quality.setter 55 | def quality(self, value: bytes): 56 | self._data[3] = value 57 | 58 | def __bytes__(self) -> bytes: 59 | return b''.join(self._data) 60 | 61 | def __str__(self) -> str: 62 | return bytes(self).decode() 63 | 64 | def __len__(self) -> int: 65 | return len(self.sequence) 66 | 67 | @property 68 | def annotations(self) -> list: 69 | """ 70 | returns: 71 | -------- 72 | list of annotations present in the fastq header 73 | """ 74 | try: 75 | end = self.name.index(b';') 76 | return self.name[:end].split(b':') 77 | except ValueError: 78 | return [] 79 | 80 | @property 81 | def metadata(self) -> dict: 82 | """ 83 | returns: 84 | -------- 85 | dictionary of annotations and fields, if any are present""" 86 | try: 87 | start = self.name.rindex(b'|') 88 | except ValueError: 89 | return {} 90 | fields = {} 91 | for field in self.name[start + 1:].split(b':'): 92 | k, v = field.split(b'=') 93 | fields[k] = v 94 | return fields 95 | 96 | def add_annotation(self, values) -> None: 97 | """prepends a list of annotations to the name field of self.name 98 | :param values: 99 | """ 100 | self._data[0] = b'@' + b':'.join(values) + b';' + self.name[1:] 101 | 102 | def add_metadata(self, values) -> None: 103 | """appends 
a list of metadata fields to the name field of self.name
104 |         :param values:
105 |         """
106 |         self.name += b'|' + b':'.join(k + b'=' + v for k, v in values.items())
107 | 
108 |     def average_quality(self) -> int:
109 |         """return the mean PHRED quality score of the record, rounded to an int"""
110 |         return np.mean(np.frombuffer(self.quality, dtype=np.int8, count=len(self)))\
111 |             .astype(int) - 33
112 | 
113 | 
114 | class Reader(reader.Reader):
115 |     """
116 |     Fastq Reader, defines some special methods for reading and summarizing fastq data:
117 | 
118 |     :method __iter__: Iterator over fastq Record objects
119 |     :method __len__: return number of records in file
120 |     :method estimate_sequence_length: estimate the length of fastq sequences in file
121 |     """
122 | 
123 |     @staticmethod
124 |     def record_grouper(iterable):
125 |         args = [iter(iterable)] * 4
126 |         return zip(*args)
127 | 
128 |     def __iter__(self):
129 |         for record in self.record_grouper(super().__iter__()):
130 |             yield FastqRecord(record)
131 | 
132 |     def __len__(self):
133 |         """
134 |         return the number of records in the Reader object. This depends on the
135 |         implementation of self.__iter__(); it does not necessarily represent the
136 |         length of the file in lines.
137 |         """
138 |         return sum(1 for _ in self)
139 | 
140 |     def estimate_sequence_length(self):
141 |         """
142 |         estimate the sequence length of a fastq file from the first 10000 records of
143 |         the file.
144 | 
145 |         :return: float mean, float standard deviation, (np.ndarray: observed lengths,
146 |          np.ndarray: counts per length)
147 |         """
148 |         i = 0
149 |         records = iter(self)
150 |         data = np.empty(10000, dtype=int)
151 |         while i < 10000:
152 |             try:
153 |                 seq = next(records).sequence
154 |             except StopIteration:  # for fastq files shorter than 10000 records
155 |                 data = data[:i]
156 |                 break
157 |             data[i] = len(seq) - 1  # last character is a newline
158 |             i += 1
159 |         return np.mean(data), np.std(data), np.unique(data, return_counts=True)
160 | 
161 | 
162 | def merge_paired(merge_function, fout, genomic, barcode=None) -> str:
163 |     """
164 |     General function to annotate genomic fastq with barcode information from reverse read.
165 |     Takes a merge_function which indicates which kind of platform was used to generate
166 |     the data, and specifies how the merging should be done.
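    A merge_function receives the genomic FastqRecord (and, for paired platforms,
    the barcode FastqRecord) and returns the annotated genomic record. A minimal
    sketch of the expected shape (hypothetical example; the barcode positions are
    illustrative and do not correspond to any real platform):

        def example_merge(g, b):
            # copy the cell barcode and RMT from the barcode read into the
            # genomic read's name field
            g.add_annotation([b.sequence[:12], b.sequence[12:20]])
            return g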
167 | 168 | :param merge_function: function from merge_functions.py 169 | :param fout: merged output file name 170 | :param genomic: fastq containing genomic data 171 | :param barcode: fastq containing barcode data 172 | :return str fout, filename of merged fastq file 173 | 174 | """ 175 | directory, filename = os.path.split(fout) 176 | if directory and not os.path.isdir(directory): 177 | os.makedirs(directory, exist_ok=True) 178 | genomic = Reader(genomic) 179 | if barcode: 180 | barcode = Reader(barcode) 181 | with open(fout, 'wb') as f: 182 | for g, b in zip(genomic, barcode): 183 | r = merge_function(g, b) 184 | f.write(bytes(r)) 185 | else: 186 | with open(fout, 'wb') as f: 187 | for g in genomic: 188 | r = merge_function(g) 189 | f.write(bytes(r)) 190 | 191 | return fout 192 | 193 | 194 | def truncate(fastq_file, lengths): 195 | """ 196 | 197 | :param str fastq_file: the input fastq file 198 | :param [int] lengths: a list of integer lengths to truncate the input fastq file 199 | :return: 200 | """ 201 | # get sequence length of input file 202 | r = Reader(fastq_file) 203 | length = None 204 | for record in r: 205 | length = len(record.sequence) 206 | break 207 | 208 | print('sequence length in file is %d' % length) 209 | 210 | # remove any lengths longer than sequence length of file 211 | lengths = sorted([l for l in lengths if l < length])[::-1] # largest to smallest 212 | 213 | # open a bunch of files 214 | files = [] 215 | for l in lengths: 216 | name = fastq_file.replace('.gz', '').replace('.fastq', '') + '_%d_' % l + '.fastq' 217 | files.append(open(name, 'wb')) 218 | 219 | i = 0 220 | indices = list(range(len(lengths))) 221 | for record in r: 222 | if i > 10e6: 223 | break 224 | for j in indices: 225 | record.sequence = record.sequence[:-1][:lengths[j]] + b'\n' 226 | record.quality = record.quality[:-1][:lengths[j]] + b'\n' 227 | files[j].write(bytes(record)) 228 | i += 1 229 | 230 | for f in files: 231 | f.close() 232 | -------------------------------------------------------------------------------- /src/seqc/sparse_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from collections import OrderedDict 5 | from seqc.sequence.gtf import create_gene_id_to_official_gene_symbol_map 6 | from seqc.sequence.gtf import ensembl_gene_id_to_official_gene_symbol 7 | 8 | 9 | class SparseFrame: 10 | 11 | def __init__(self, data, index, columns): 12 | """ 13 | lightweight wrapper of scipy.stats.coo_matrix to provide pd.DataFrame-like access 14 | to index, column, and shape properties. 
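    A minimal construction sketch (hypothetical values, for illustration only):

        >>> counts = coo_matrix(np.ones((2, 3), dtype=int))
        >>> sf = SparseFrame(counts, index=np.array([101, 102]),
        ...                  columns=np.array([7, 8, 9]))
        >>> sf.shape
        (2, 3)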
15 | 16 | :param data: scipy.stats.coo_matrix 17 | :param index: np.ndarray: row index 18 | :param columns: np.ndarray: column index 19 | 20 | :property data: scipy.stats.coo_matrix 21 | :property index: np.ndarray row index 22 | :property columns: np.ndarray column index 23 | :property shape: (int, int), number of rows and columns 24 | :method sum: wrapper of np.sum() 25 | """ 26 | 27 | if not isinstance(data, coo_matrix): 28 | raise TypeError('data must be type coo_matrix') 29 | if not isinstance(index, np.ndarray): 30 | raise TypeError('index must be type np.ndarray') 31 | if not isinstance(columns, np.ndarray): 32 | raise TypeError('columns must be type np.ndarray') 33 | 34 | self._data = data 35 | self._index = index 36 | self._columns = columns 37 | 38 | @property 39 | def data(self): 40 | return self._data 41 | 42 | @data.setter 43 | def data(self, item): 44 | if not isinstance(item, coo_matrix): 45 | raise TypeError('data must be type coo_matrix') 46 | self._data = item 47 | 48 | @property 49 | def index(self): 50 | return self._index 51 | 52 | @index.setter 53 | def index(self, item): 54 | try: 55 | self._index = np.array(item) 56 | except: 57 | raise TypeError('self.index must be convertible into a np.array object') 58 | 59 | @property 60 | def columns(self): 61 | return self._columns 62 | 63 | @columns.setter 64 | def columns(self, item): 65 | try: 66 | self._columns = np.array(item) 67 | except: 68 | raise TypeError('self.columns must be convertible into a np.array object') 69 | 70 | @property 71 | def shape(self): 72 | return len(self.index), len(self.columns) 73 | 74 | def sum(self, axis=0): 75 | """ 76 | sum over provided axis 77 | 78 | :param axis: options: 0 (rows) or 1 (columns) 79 | :return: np.ndarray vector of column or row sums 80 | """ 81 | return self.data.sum(axis=axis) 82 | 83 | @classmethod 84 | def from_dict(cls, dictionary, genes_to_symbols=False): 85 | """create a SparseFrame from a dictionary 86 | 87 | :param dict dictionary: dictionary in form (cell, gene) -> count 88 | :param str|bool genes_to_symbols: convert genes into symbols. If not False, user 89 | must provide the location of a .gtf file to carry out conversion. 
Otherwise the 90 | column index will retain the original integer ids 91 | :return SparseFrame: SparseFrame containing dictionary data 92 | """ 93 | 94 | # todo this throws an uninformative error in the case that there are no active 95 | # reads in the ReadArray 96 | i, j = (np.array(v, dtype=int) for v in zip(*dictionary.keys())) 97 | data = np.fromiter(dictionary.values(), dtype=int) 98 | 99 | # map cells to small values 100 | uniq_i = np.unique(i) 101 | imap = OrderedDict(zip(uniq_i, np.arange(uniq_i.shape[0]))) 102 | 103 | uniq_j = np.unique(j) 104 | jmap = OrderedDict(zip(uniq_j, np.arange(uniq_j.shape[0]))) 105 | 106 | i_inds = np.fromiter((imap[v] for v in i), dtype=int) 107 | j_inds = np.fromiter((jmap[v] for v in j), dtype=int) 108 | 109 | coo = coo_matrix((data, (i_inds, j_inds)), shape=(len(imap), len(jmap)), 110 | dtype=np.int32) 111 | 112 | index = np.fromiter(imap.keys(), dtype=int) 113 | columns = np.fromiter(jmap.keys(), dtype=int) 114 | 115 | if genes_to_symbols: 116 | if not os.path.isfile(genes_to_symbols): 117 | raise ValueError('genes_to_symbols argument %s is not a valid annotation ' 118 | 'file' % repr(genes_to_symbols)) 119 | gmap = create_gene_id_to_official_gene_symbol_map(genes_to_symbols) 120 | columns = np.array(ensembl_gene_id_to_official_gene_symbol( 121 | columns, gene_id_map=gmap)) 122 | 123 | return cls(coo, index, columns) 124 | -------------------------------------------------------------------------------- /src/seqc/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from .ttest import bootstrap_t as ttest 2 | from .gsea import GSEA as gsea 3 | from .correlation import correlation 4 | from .anova import ANOVA as anova 5 | from .graph_diffusion import GraphDiffusion as graph_diffusion 6 | from .smoothing import smoothing 7 | from .tree import Tree as tree 8 | from .pca import PCA as pca 9 | from .tsne import TSNE as tsne 10 | from .g_test import g_test 11 | from .mast import run_mast 12 | from .resampled_nonparametric import mannwhitneyu, kruskalwallis -------------------------------------------------------------------------------- /src/seqc/stats/anova.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import namedtuple 3 | import numpy as np 4 | import pandas as pd 5 | from functools import partial 6 | from scipy.stats.mstats import kruskalwallis, rankdata 7 | from scipy.stats import t 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | class ANOVA: 11 | 12 | def __init__(self, data, group_assignments, alpha=0.05): 13 | """ 14 | Carry out ANOVA between the groups of data 15 | 16 | :param data: n cells x k genes 2d array 17 | :param group_assignments: n cells 1d vector 18 | :param alpha: float (0, 1], acceptable type I error 19 | """ 20 | # make sure group_assignments and data have the same length 21 | warnings.warn('DeprecationWarning: This function is deprecated.') 22 | if not data.shape[0] == group_assignments.shape[0]: 23 | raise ValueError( 24 | 'Group assignments shape ({!s}) must equal the number of rows in data ' 25 | '({!s}).'.format(group_assignments.shape[0], data.shape[0])) 26 | 27 | # todo 28 | # may want to verify that each group has at least two observations 29 | # (else variance won't work) 30 | 31 | # store index if both data and group_assignments are pandas objects 32 | if isinstance(data, pd.DataFrame) and isinstance(group_assignments, pd.Series): 33 | # ensure assignments and data 
indices are aligned 34 | try: 35 | ordered_assignments = group_assignments[data.index] 36 | if not len(ordered_assignments) == data.shape[0]: 37 | raise ValueError( 38 | 'Index mismatch between data and group_assignments detected when ' 39 | 'aligning indices. check for duplicates.') 40 | except: 41 | raise ValueError('Index mismatch between data and group_assignments.') 42 | 43 | # sort data by cluster assignment 44 | idx = np.argsort(ordered_assignments.values) 45 | self.data = data.iloc[idx, :].values 46 | ordered_assignments = ordered_assignments.iloc[idx] 47 | self.group_assignments = ordered_assignments.values 48 | self.index = data.columns 49 | 50 | else: # get arrays from input values 51 | self.index = None # inputs were not all indexed pandas objects 52 | 53 | try: 54 | data = np.array(data) 55 | except: 56 | raise ValueError('data must be convertible to a np.ndarray') 57 | 58 | try: 59 | group_assignments = np.array(group_assignments) 60 | except: 61 | raise ValueError('group_assignments must be convertible to a np.ndarray') 62 | 63 | idx = np.argsort(group_assignments) 64 | self.data = data[idx, :] 65 | self.group_assignments = group_assignments[idx] 66 | 67 | self.post_hoc = None 68 | self.groups = np.unique(group_assignments) 69 | 70 | # get points to split the array, create slicers for each group 71 | self.split_indices = np.where(np.diff(self.group_assignments))[0] + 1 72 | # todo is this a faster way of calculating the below anova? 73 | # self.array_views = np.array_split(self.data, self.split_indices, axis=0) 74 | 75 | if not 0 < alpha <= 1: 76 | raise ValueError('Parameter alpha must fall within the interval (0, 1].') 77 | self.alpha = alpha 78 | 79 | self._anova = None 80 | 81 | def anova(self, min_mean_expr=None): 82 | """ 83 | carry out non-parametric ANOVA across the groups of self. 84 | 85 | :param min_mean_expr: minimum average gene expression value that must be reached 86 | in at least one cluster for the gene to be considered 87 | :return: 88 | """ 89 | if self._anova is not None: 90 | return self._anova 91 | 92 | # run anova 93 | f = lambda v: kruskalwallis(*np.split(v, self.split_indices))[1] 94 | pvals = np.apply_along_axis(f, 0, self.data) # todo could shunt to a multiprocessing pool 95 | 96 | # correct the pvals 97 | _, pval_corrected, _, _ = multipletests(pvals, self.alpha, method='fdr_tsbh') 98 | 99 | # store data & return 100 | if self.index is not None: 101 | self._anova = pd.Series(pval_corrected, index=self.index) 102 | else: 103 | self._anova = pval_corrected 104 | return self._anova 105 | 106 | def post_hoc_tests(self): 107 | """ 108 | carries out post-hoc tests between genes with significant ANOVA results using 109 | Welch's U-test on ranked data. 110 | """ 111 | if self._anova is None: 112 | self.anova() 113 | 114 | anova_significant = np.array(self._anova) < 1 # call array in case it is a Series 115 | 116 | # limit to significant data, convert to column-wise ranks. 
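        # What follows is a vectorized, Welch-style test on rank-transformed data:
        # expression is ranked per gene, group means and variances are computed on
        # those ranks, and broadcasting builds genes x groups x groups arrays so
        # that every pair of groups is compared in a single pass.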
117 | data = self.data[:, anova_significant] 118 | rank_data = np.apply_along_axis(rankdata, 0, data) 119 | # assignments = self.group_assignments[anova_significant] 120 | 121 | split_indices = np.where(np.diff(self.group_assignments))[0] + 1 122 | array_views = np.array_split(rank_data, split_indices, axis=0) 123 | 124 | # get mean and standard deviations of each 125 | fmean = partial(np.mean, axis=0) 126 | fvar = partial(np.var, axis=0) 127 | mu = np.vstack(list(map(fmean, array_views))).T # transpose to get gene rows 128 | n = np.array(list(map(lambda x: x.shape[0], array_views))) 129 | s = np.vstack(list(map(fvar, array_views))).T 130 | s_norm = s / n # transpose to get gene rows 131 | 132 | # calculate T 133 | numerator = mu[:, np.newaxis, :] - mu[:, :, np.newaxis] 134 | denominator = np.sqrt(s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) 135 | statistic = numerator / denominator 136 | 137 | # calculate df 138 | s_norm2 = s**2 / (n**2 * n-1) 139 | numerator = (s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) ** 2 140 | denominator = (s_norm2[:, np.newaxis, :] + s_norm2[:, :, np.newaxis]) 141 | df = np.floor(numerator / denominator) 142 | 143 | # get significance 144 | p = t.cdf(np.abs(statistic), df) # note, two tailed test 145 | 146 | # calculate fdr correction; because above uses 2-tails, alpha here is halved 147 | # because each test is evaluated twice due to the symmetry of vectorization. 148 | p_adj = multipletests(np.ravel(p), alpha=self.alpha, method='fdr_tsbh')[1] 149 | p_adj = p_adj.reshape(*p.shape) 150 | 151 | phr = namedtuple('PostHocResults', ['p_adj', 'statistic', 'mu']) 152 | self.post_hoc = phr(p_adj, statistic, mu) 153 | 154 | if self.index is not None: 155 | p_adj = pd.Panel( 156 | p_adj, items=self.index[anova_significant], major_axis=self.groups, 157 | minor_axis=self.groups) 158 | statistic = pd.Panel( 159 | statistic, items=self.index[anova_significant], major_axis=self.groups, 160 | minor_axis=self.groups) 161 | mu = pd.DataFrame(mu, self.index[anova_significant], columns=self.groups) 162 | 163 | return p_adj, statistic, mu 164 | 165 | def population_markers(self, p_crit=0.0): 166 | """ 167 | Return markers that are significantly differentially expressed in one 168 | population vs all others 169 | 170 | :param p_crit: float, fraction populations that may be indistinguishable from the 171 | highest expressing population for each gene. If zero, each marker gene is 172 | significantly higher expressed in one population relative to all others. 173 | If 0.1, 10% of populations may share high expression of a gene, and those 174 | populations will be marked as expressing that gene. 175 | 176 | """ 177 | if self.post_hoc is None: 178 | self.post_hoc_tests() 179 | 180 | # get highest mean for each gene 181 | top_gene_idx = np.argmax(self.post_hoc.mu, axis=1) 182 | 183 | # index p_adj first dimension with each sample, will reduce to 2d genes x samples 184 | top_gene_sig = self.post_hoc.p_adj[:, top_gene_idx, :] 185 | 186 | # for each gene, count the number of non-significant DE results. 187 | sig = np.array(top_gene_sig < self.alpha) 188 | num_sig = np.sum(sig, axis=2) 189 | 190 | # if this is greater than N - 1 * p_crit, discard the gene. 
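        # p_crit controls how strict the marker definition is: with p_crit=0.0 the
        # top-expressing population must be distinguishable from every other
        # population, while e.g. p_crit=0.1 tolerates ~10% of populations sharing
        # the high expression (see the docstring above).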
191 | n = self.post_hoc.p_adj.shape[2] - 1 # number of genes, sub 1 for self 192 | idx_marker_genes = np.where(num_sig < n * (1 - p_crit)) 193 | marker_genes = sig[idx_marker_genes, :] 194 | 195 | # correctly index these genes 196 | if self.index: 197 | pass # todo fix this 198 | 199 | return marker_genes 200 | -------------------------------------------------------------------------------- /src/seqc/stats/correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class correlation: 6 | """Fast vectorized correlation methods 7 | 8 | :method vector(x, y): correlate each column in y with a vector in x 9 | :method map(x, y): correlate each column of x with each column in y 10 | :method eigv(evec, data): get pairwise correlations of eigenvectors 11 | with columns of data 12 | """ 13 | 14 | @staticmethod 15 | def vector(x: np.array, y: np.array): 16 | """ 17 | Correlate each column in y with a vector x 18 | 19 | :param x: np.ndarray vector of length n 20 | :param y: np.ndarray matrix of shape (n, k) 21 | :returns: vector of length n 22 | """ 23 | # x = x[:, np.newaxis] # for working with matrices 24 | mu_x = x.mean() # cells 25 | mu_y = y.mean(axis=0) # cells by gene --> cells by genes 26 | sigma_x = x.std() 27 | sigma_y = y.std(axis=0) 28 | 29 | return ((y * x).mean(axis=0) - mu_y * mu_x) / (sigma_y * sigma_x) 30 | 31 | @staticmethod 32 | def map(x: np.ndarray, y: np.ndarray): 33 | """Correlate each row of x with each row of y 34 | 35 | :param x: np.array; shape N x T. 36 | :param y: np.array; shape M x T. 37 | :returns: np.array; shape N x M in which each element is a correlation 38 | coefficient. 39 | """ 40 | assert(x.shape[1] == y.shape[1]) 41 | n = x.shape[1] 42 | x_diff = x - x.mean(axis=-1)[:, None] 43 | y_diff = y - y.mean(axis=-1)[:, None] 44 | x_std = x.std(axis=-1) 45 | y_std = y.std(axis=-1) 46 | return np.dot(x_diff, y_diff.T) / (n * x_std[:, np.newaxis] * y_std) 47 | 48 | @staticmethod 49 | def eigv(evec, data, components=tuple(), knn=10): 50 | """ 51 | get pairwise correlations of eigenvectors with columns in data 52 | 53 | :param evec: eigenvectors 54 | :param data: np.ndarray genes x cells data matrix 55 | :param components: which eigenvectors to select 56 | :param knn: number of neighbors to smooth gene expression values over 57 | :return: 58 | """ 59 | if isinstance(data, pd.DataFrame): 60 | D = data.values 61 | elif isinstance(data, np.ndarray): 62 | D = data 63 | else: 64 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 65 | 66 | # set components, remove zero if it was specified 67 | if not components: 68 | components = np.arange(evec.shape[1]) 69 | else: 70 | components = np.array(components) 71 | components = components[components != 0] 72 | 73 | eigv_corr = np.empty((D.shape[1], evec.shape[1]), dtype=np.float) 74 | 75 | for component_index in components: 76 | component_data = evec[:, component_index] 77 | 78 | order = np.argsort(component_data) 79 | x = pd.DataFrame(component_data[order]).rolling( 80 | window=knn, center=False).mean()[knn:].values 81 | # this fancy indexing will copy self.molecules 82 | vals = pd.DataFrame(D[order, :]).rolling( 83 | window=knn, center=False, axis=0).mean()[knn:].values 84 | eigv_corr[:, component_index] = correlation.vector(x, vals) 85 | 86 | # this is sorted by order, need it in original order (reverse the sort) 87 | eigv_corr = eigv_corr[:, components] 88 | if isinstance(data, pd.DataFrame): 89 | eigv_corr = 
pd.DataFrame(eigv_corr, index=data.columns, columns=components) 90 | return eigv_corr 91 | -------------------------------------------------------------------------------- /src/seqc/stats/experimental_yield.py: -------------------------------------------------------------------------------- 1 | class ExperimentalYield: 2 | 3 | output = ( 4 | '{divide}\nINPUT\n{divide}\n' 5 | 'Total input reads:\t{n_fastq}\n' 6 | '{divide}\nALIGNMENT (% FROM INPUT)\n{divide}\n' 7 | 'Total reads aligned:\t{n_sam} ({prop_al}%)\n' 8 | ' - Genomic alignments:\t{genomic} ({prop_gen}%)\n' 9 | ' - PhiX alignments:\t{phi_x} ({prop_phix}%)\n' 10 | ' - Transcriptome alignments:\t{trans} ({prop_trans}%)\n' 11 | '{divide}\nFILTERING (% FROM ALIGNMENT)\n{divide}\n' 12 | 'Genomic alignments:\t{genomic} ({bad_gen}%)\n' 13 | 'PhiX alignments:\t{phi_x} ({bad_phi}%)\n' 14 | 'Incorrect barcodes:\t{wrong_cb} ({bad_cb}%)\n' 15 | 'Missing cell barcodes/RMT:\t{no_cell} ({bad_cell}%)\n' 16 | 'N present in RMT:\t{rmt_N} ({bad_rmtN}%)\n' 17 | 'N present in CB:\t{cell_N} ({bad_cellN}%)\n' 18 | 'Insufficient poly(T):\t{poly_t} ({bad_polyt}%)\n' 19 | 'High dust score:\t{dust} ({bad_dust}%)\n' 20 | '{divide}\nCELL/MOLECULE COUNT DISTRIBUTION\n{divide}\n' 21 | 'Total molecules:\t\t{tot_mc}\n' 22 | 'Molecules lost:\t{mols_lost}\n' 23 | 'Cells lost:\t{cells_lost}\n' 24 | 'Cell description:\n{cell_desc}\n' 25 | '{divide}\nSUMMARY\n{divide}\n' 26 | 'Total retained reads:\t{n_good} ({prop_good}%)\n' 27 | 'Total reads unaligned:\t{lost_al} ({prop_un}%)\n' 28 | 'Total reads filtered:\t{n_bad} ({prop_bad}%)\n' 29 | '{divide}\n') 30 | 31 | @classmethod 32 | def construct_run_summary(cls, summary: dict): 33 | """ 34 | calculates basic loss statistics and constructs a summary 35 | that will be sent to the user after the SEQC run has completed. 
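        The summary dict is expected to provide at least the keys read below
        (n_fastq, n_sam, gene_0, phi_x, cell_0, rmt_N, cell_N, dust, poly_t,
        total_mc, mols_lost, cells_lost, cell_desc) and, for platforms with
        barcode correction, cb_wrong.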
36 | 37 | :param summary: dictionary constructed during error correction 38 | :return: output of basic summary statistics 39 | """ 40 | if not summary: 41 | return 42 | 43 | # obtain values from summary 44 | n_fastq = summary['n_fastq'] 45 | n_sam = summary['n_sam'] 46 | genomic = summary['gene_0'] 47 | phix = summary['phi_x'] 48 | no_cell = summary['cell_0'] 49 | # no_rmt = summary['rmt_0'] 50 | rmt_N = summary['rmt_N'] 51 | cell_N = summary['cell_N'] 52 | dust = summary['dust'] 53 | poly_t = summary['poly_t'] 54 | tot_mc = summary['total_mc'] 55 | mols_lost = list(summary['mols_lost'].items()) 56 | cells_lost = list(summary['cells_lost'].items()) 57 | cell_desc = summary['cell_desc'].to_string() 58 | divide = '-' * 40 59 | 60 | # run summary will not be calculated if user started SEQC midway 61 | if n_fastq == 'NA' or n_sam == 'NA': 62 | return 63 | 64 | # calculate summary statistics 65 | trans = n_sam - genomic - phix 66 | prop_al = round((n_sam/n_fastq) * 100, 1) 67 | prop_gen = round((genomic/n_sam) * 100, 1) 68 | prop_phix = round((phix/n_sam) * 100, 1) 69 | prop_trans = round((trans/n_sam) * 100, 1) 70 | lost_al = n_fastq - n_sam 71 | prop_un = round(100 - prop_al, 1) 72 | n_bad = genomic + phix + no_cell + rmt_N + cell_N + poly_t + dust 73 | # n_bad = genomic + phix + no_cell + no_rmt + rmt_N + poly_t 74 | # wrong_cb does not apply to drop-seq 75 | try: 76 | wrong_cb = summary['cb_wrong'] 77 | n_bad += wrong_cb 78 | bad_cb = round((wrong_cb/n_bad) * 100, 1) 79 | except KeyError: 80 | wrong_cb = 0 81 | bad_cb = 0 82 | # continue with calculations 83 | n_good = n_sam - n_bad 84 | bad_gen = round((genomic/n_bad) * 100, 1) 85 | bad_phi = round((phix/n_bad) * 100, 1) 86 | bad_cell = round((no_cell/n_bad) * 100, 1) 87 | # bad_rmt = round((no_rmt/n_bad) * 100, 1) 88 | bad_rmtN = round((rmt_N/n_bad) * 100, 1) 89 | bad_cellN = round((cell_N/n_bad) * 100, 1) 90 | bad_polyt = round((poly_t/n_bad) * 100, 1) 91 | bad_dust = round((dust/n_bad) * 100, 1) 92 | prop_bad = round((n_bad/n_fastq) * 100, 1) 93 | prop_good = round((n_good/n_fastq) * 100, 1) 94 | 95 | # format output 96 | output = cls.output.format( 97 | n_fastq=n_fastq, n_sam=n_sam, genomic=genomic, phi_x=phix, no_cell=no_cell, 98 | wrong_cb=wrong_cb, rmt_N=rmt_N, poly_t=poly_t, divide=divide, 99 | prop_al=prop_al, prop_gen=prop_gen, prop_phix=prop_phix, lost_al=lost_al, 100 | n_bad=n_bad, n_good=n_good, prop_good=prop_good, prop_bad=prop_bad, 101 | prop_un=prop_un, bad_gen=bad_gen, bad_phi=bad_phi, bad_cb=bad_cb, 102 | bad_cell=bad_cell, bad_rmtN=bad_rmtN, bad_polyt=bad_polyt, trans=trans, 103 | cell_N=cell_N, bad_cellN=bad_cellN, dust=dust, bad_dust=bad_dust, 104 | prop_trans=prop_trans, tot_mc=tot_mc, mols_lost=mols_lost, 105 | cells_lost=cells_lost, cell_desc=cell_desc) 106 | return output 107 | -------------------------------------------------------------------------------- /src/seqc/stats/g_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from contextlib import closing 4 | from multiprocessing import Pool 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def _assign(d): 9 | """ 10 | 11 | :param np.ndarray d: 1d vector of scaled differences 12 | :return np.ndarray: 1d boolean gene-enrichment assignment vector 13 | """ 14 | km = KMeans(n_clusters=2) 15 | km.fit(d[:, np.newaxis]) 16 | assignments = km.labels_.astype(bool) 17 | if np.argmax(km.cluster_centers_) == 0: 18 | return assignments 19 | else: 20 | return ~assignments 21 | 22 | 23 | 
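# Usage sketch (hypothetical objects, for illustration): given a cells x genes
# DataFrame `counts` and a per-cell cluster label vector `clusters`,
#
#     g, assignments = g_test(counts, clusters, log=True)
#
# returns the per-gene G statistic and a clusters x genes boolean frame marking
# the clusters in which each gene is enriched.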
def g_test(data, labels, log=False): 24 | """ 25 | 26 | :param pd.DataFrame data: 27 | :param labels: 28 | :param log: 29 | :return: 30 | """ 31 | 32 | if log: 33 | data = np.log(data + 1) 34 | 35 | data = pd.DataFrame(data.values / data.values.sum(axis=1)[:, np.newaxis], 36 | index=labels, columns=data.columns) 37 | 38 | # calculate data that are useful for determining observed and expected values 39 | gene_sums = data.sum(axis=0) 40 | grouped = data.groupby(axis=0, level=0) # group only once 41 | category_sizes = grouped.size() 42 | category_fractions = category_sizes / category_sizes.sum() # normalize 43 | 44 | # get observed, expected 45 | expected = pd.DataFrame( 46 | data=np.dot(category_fractions.values[:, np.newaxis], 47 | gene_sums.values[np.newaxis, :]), 48 | index=category_sizes.index, 49 | columns=gene_sums.index) 50 | observed = grouped.sum() 51 | 52 | # scaled ratios are used in both g-test, and partitioning of expressed vs. not 53 | logratio = np.log(observed / expected) 54 | logratio.values[~np.isfinite(logratio.values)] = 0 55 | scaled_diff = observed * logratio 56 | 57 | g = 2 * np.sum(scaled_diff, axis=0) # g-test 58 | 59 | # todo only assign significant values 60 | # todo calculate significance 61 | with closing(Pool()) as pool: 62 | assignments = pool.map(_assign, scaled_diff.values.T) 63 | 64 | assignments = pd.DataFrame( 65 | data=np.vstack(assignments).T, 66 | index=category_sizes.index, 67 | columns=data.columns 68 | ) 69 | 70 | return g, assignments 71 | -------------------------------------------------------------------------------- /src/seqc/stats/graph_diffusion.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from scipy.sparse.linalg import eigs 4 | from numpy.linalg import norm 5 | from scipy.sparse import csr_matrix, find 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | class GraphDiffusion: 10 | def __init__(self, knn=10, normalization='smarkov', epsilon=1, 11 | n_diffusion_components=10): 12 | """ 13 | Run diffusion maps on the data. This implementation is based on the 14 | diffusion geometry library in Matlab: 15 | https://services.math.duke.edu/~mauro/code.html#DiffusionGeom and was implemented 16 | by Pooja Kathail 17 | 18 | :param knn: Number of neighbors for graph construction to determine distances 19 | between cells 20 | :param normalization: method for normalizing the matrix of weights 21 | 'bimarkov' force row and column sums to be 1 22 | 'markov' force row sums to be 1 23 | 'smarkov' symmetric conjugate to markov 24 | 'beltrami' Laplace-Beltrami normalization ala Coifman-Lafon 25 | 'sbeltrami' symmetric conjugate to beltrami 26 | 'FokkerPlanck' Fokker-Planck normalization 27 | 'sFokkerPlanck' symmetric conjugate to Fokker-Planck normalization 28 | :param epsilon: Gaussian standard deviation for converting distances to affinities 29 | :param n_diffusion_components: Number of diffusion components to generate 30 | """ 31 | if normalization not in ['bimarkov', 'smarkov', 'markov', 'sbeltrami', 'beltrami', 32 | 'FokkerPlanck', 'sFokkerPlanck']: 33 | raise ValueError( 34 | 'Unsupported normalization. 
Please refer to the docstring for the ' 35 | 'supported methods') 36 | 37 | self.knn = knn 38 | self.normalization = normalization 39 | self.epsilon = epsilon 40 | self.n_diffusion_components = n_diffusion_components 41 | self.eigenvectors = None 42 | self.eigenvalues = None 43 | self.diffusion_operator = None 44 | self.weights = None 45 | 46 | @staticmethod 47 | def keigs(T, k, P, take_diagonal=0): 48 | """ return k largest magnitude eigenvalues for the matrix T. 49 | :param T: Matrix to find eigen values/vectors of 50 | :param k: number of eigen values/vectors to return 51 | :param P: in the case of symmetric normalizations, 52 | this is the NxN diagonal matrix which relates the nonsymmetric 53 | version to the symmetric form via conjugation 54 | :param take_diagonal: if 1, returns the eigenvalues as a vector rather than as a 55 | diagonal matrix. 56 | """ 57 | D, V = eigs(T, k, tol=1e-4, maxiter=1000) 58 | D = np.real(D) 59 | V = np.real(V) 60 | inds = np.argsort(D)[::-1] 61 | D = D[inds] 62 | V = V[:, inds] 63 | if P is not None: 64 | V = P.dot(V) 65 | 66 | # Normalize 67 | for i in range(V.shape[1]): 68 | V[:, i] = V[:, i] / norm(V[:, i]) 69 | V = np.round(V, 10) 70 | 71 | if take_diagonal == 0: 72 | D = np.diag(D) 73 | 74 | return V, D 75 | 76 | @staticmethod # todo fix; what is S? 77 | def bimarkov(W, max_iters=100, abs_error=0.00001, **kwargs): 78 | """normalization method for GraphDiffusion""" 79 | 80 | if W.size == 0: 81 | return 82 | 83 | # process input 84 | if W.shape[0] != W.shape[1]: 85 | raise ValueError('Bimarkov.py: kernel must be NxN\n') 86 | 87 | N = W.shape[0] 88 | 89 | # initialize 90 | p = np.ones(N) 91 | 92 | # iterative 93 | for i in range(max_iters): 94 | 95 | S = np.ravel(W.sum(axis=1)) 96 | err = np.max(np.absolute(1.0 - np.max(S)), np.absolute(1.0 - np.min(S))) 97 | 98 | if err < abs_error: 99 | break 100 | 101 | D = csr_matrix((np.divide(1, np.sqrt(S)), (range(N), range(N))), shape=[N, N]) 102 | p *= S 103 | W = D.dot(W).dot(D) 104 | 105 | # iron out numerical errors 106 | T = (W + W.T) / 2 107 | return T, p 108 | 109 | @staticmethod 110 | def smarkov(D, N, W): 111 | """normalization method for GraphDiffusion""" 112 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 113 | P = D 114 | T = D.dot(W).dot(D) 115 | T = (T + T.T) / 2 116 | return T, P 117 | 118 | @staticmethod 119 | def markov(D, N, W): 120 | """normalization method for GraphDiffusion""" 121 | T = csr_matrix((D, (range(N), range(N))), shape=[N, N]).dot(W) 122 | return T, None 123 | 124 | @staticmethod 125 | def sbeltrami(D, N, W): 126 | """normalization method for GraphDiffusion""" 127 | P = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 128 | K = P.dot(W).dot(P) 129 | 130 | D = np.ravel(K.sum(axis=1)) 131 | D[D != 0] = 1 / D[D != 0] 132 | 133 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 134 | P = D 135 | T = D.dot(K).dot(D) 136 | 137 | T = (T + T.T) / 2 138 | return T, P 139 | 140 | @staticmethod 141 | def beltrami(D, N, W): 142 | """normalization method for GraphDiffusion""" 143 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 144 | K = D.dot(W).dot(D) 145 | 146 | D = np.ravel(K.sum(axis=1)) 147 | D[D != 0] = 1 / D[D != 0] 148 | 149 | V = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 150 | T = V.dot(K) 151 | return T, None 152 | 153 | @staticmethod 154 | def FokkerPlanck(D, N, W): 155 | """normalization method for GraphDiffusion""" 156 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 157 | K = D.dot(W).dot(D) 158 | 159 | D = 
np.ravel(K.sum(axis=1)) 160 | D[D != 0] = 1 / D[D != 0] 161 | 162 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 163 | T = D.dot(K) 164 | return T, None 165 | 166 | @staticmethod 167 | def sFokkerPlanck(D, N, W): 168 | """normalization method for GraphDiffusion""" 169 | print('(sFokkerPlanck) ... ') 170 | 171 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 172 | K = D.dot(W).dot(D) 173 | 174 | D = np.ravel(K.sum(axis=1)) 175 | D[D != 0] = 1 / D[D != 0] 176 | 177 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 178 | P = D 179 | T = D.dot(K).dot(D) 180 | 181 | T = (T + T.T) / 2 182 | return T, P 183 | 184 | def fit(self, data, verbose=True): 185 | """ 186 | :param data: Data matrix of samples X features 187 | :param verbose: print progress report 188 | 189 | :return: Dictionary containing diffusion operator, weight matrix, 190 | diffusion eigen vectors, and diffusion eigen values 191 | """ 192 | if verbose: 193 | print('Running Diffusion maps with the following parameters:') 194 | print('Normalization: %s' % self.normalization) 195 | print('Number of nearest neighbors k: %d' % self.knn) 196 | print('Epsilon: %.4f' % self.epsilon) 197 | 198 | # Nearest neighbors 199 | start = time.process_time() 200 | N = data.shape[0] 201 | nbrs = NearestNeighbors(n_neighbors=self.knn).fit(data) 202 | distances, indices = nbrs.kneighbors(data) 203 | 204 | # Adjacency matrix 205 | rows = np.zeros(N * self.knn, dtype=np.int32) 206 | cols = np.zeros(N * self.knn, dtype=np.int32) 207 | dists = np.zeros(N * self.knn) 208 | location = 0 209 | for i in range(N): 210 | inds = range(location, location + self.knn) 211 | rows[inds] = indices[i, :] 212 | cols[inds] = i 213 | dists[inds] = distances[i, :] 214 | location += self.knn 215 | W = csr_matrix((dists, (rows, cols)), shape=[N, N]) 216 | 217 | # Symmetrize W 218 | W = W + W.T 219 | 220 | # Convert to affinity (with selfloops) 221 | rows, cols, dists = find(W) 222 | rows = np.append(rows, range(N)) 223 | cols = np.append(cols, range(N)) 224 | dists = np.append(dists / (self.epsilon ** 2), np.zeros(N)) 225 | W = csr_matrix((np.exp(-dists), (rows, cols)), shape=[N, N]) 226 | 227 | # Create D 228 | D = np.ravel(W.sum(axis=1)) 229 | D[D != 0] = 1 / D[D != 0] 230 | 231 | # Go through the various normalizations 232 | fnorm = getattr(self, self.normalization) 233 | T, P = fnorm(D=D, N=N, W=W) 234 | 235 | if self.normalization != 'bimarkov' and verbose: 236 | print('%.2f seconds' % (time.process_time() - start)) 237 | 238 | # Eigen value decomposition 239 | V, D = GraphDiffusion.keigs(T, self.n_diffusion_components, P, take_diagonal=1) 240 | self.eigenvectors = V 241 | self.eigenvalues = D 242 | self.diffusion_operator = T 243 | self.weights = W 244 | return {'operator': T, 'eigval': D, 'eigvec': V, 'weights': W} 245 | -------------------------------------------------------------------------------- /src/seqc/stats/gsea.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import shlex 4 | import glob 5 | import re 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.special import expit 9 | 10 | 11 | class GSEA: 12 | 13 | def __init__(self, correlations, output_stem=None): 14 | """initialize a gsea object 15 | :param pd.Series correlations: correlations in the range of [-1, 1] whose index 16 | contains gene names 17 | :param str output_stem: the filestem for the output data 18 | 19 | :method linear_scale: method to linearly scale a vector to lie 
on the interval 20 | [-1, 1] 21 | :method logisitc_scale: method to scale a vector by the logistic function to lie 22 | on the interval [-1, 1] 23 | :method run: run GSEA on these correlations 24 | """ 25 | if not isinstance(correlations, pd.Series): 26 | raise TypeError('correlations must be a pandas series') 27 | if not ((np.min(correlations) >= -1) & (np.max(correlations) <= 1)): 28 | raise RuntimeError( 29 | 'input correlations were not contained within the interval [-1, 1]. ' 30 | 'Please use JavaGSEA.linear_scale() or JavaGSEA.logistic_scale() to ' 31 | 'scale values to this interval before running.') 32 | self._correlations = correlations.sort_values() 33 | self._rnk = None 34 | if output_stem is None: 35 | self._output_stem = os.environ['TMPDIR'] + 'gsea_corr_{!s}'.format( 36 | np.random.randint(0, 1000000)) 37 | elif not isinstance(output_stem, str): 38 | raise TypeError('output stem must be a str reference to a file prefix') 39 | elif output_stem.find('-') > -1: 40 | raise ValueError('output_stem cannot contain the dash (-) character.') 41 | else: 42 | self._output_stem = output_stem 43 | self._results = {} 44 | 45 | @property 46 | def correlations(self): 47 | return self._correlations 48 | 49 | @correlations.setter 50 | def correlations(self): 51 | raise RuntimeError('Please create a new object to compare different correlations') 52 | 53 | @property 54 | def results(self): 55 | return self._results 56 | 57 | @staticmethod 58 | def linear_scale(data: pd.Series) -> pd.Series: 59 | """scale input vector to interval [-1, 1] using a linear scaling 60 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 61 | """ 62 | data = data.copy() 63 | data -= np.min(data, axis=0) 64 | data /= np.max(data, axis=0) / 2 65 | data -= 1 66 | return data 67 | 68 | @staticmethod 69 | def logistic_scale(data: pd.Series) -> pd.Series: 70 | """scale input vector to interval [-1, 1] using a sigmoid scaling 71 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 72 | """ 73 | return pd.Series((expit(data.values) * 2) - 1, index=data.index) 74 | 75 | def _save_rank_file(self) -> None: 76 | """save the correlations to a .rnk file""" 77 | self._rnk = self._output_stem + '.rnk' 78 | df = pd.DataFrame(self._correlations).fillna(0) 79 | df.to_csv(self._rnk, sep='\t', header=False) 80 | 81 | @staticmethod 82 | def _gmt_options(): 83 | """ 84 | Private method. identifies GMT files available for mouse or human genomes 85 | :return: str, file options 86 | """ 87 | 88 | mouse_options = os.listdir(os.path.expanduser('~/.seqc/tools/mouse')) 89 | human_options = os.listdir(os.path.expanduser('~/.seqc/tools/human')) 90 | print('Available GSEA .gmt files:\n\nmouse:\n{m}\n\nhuman:\n{h}\n'.format( 91 | m='\n'.join(mouse_options), 92 | h='\n'.join(human_options))) 93 | print('Please specify the gmt_file parameter as gmt_file=(organism, filename)') 94 | 95 | def run(self, gmt_file): 96 | """ 97 | Helper function. Run GSEA on an already-ranked list of corrleations. 
To see 98 | available files, leave gmt_file parameter empty 99 | 100 | :param (str, str) gmt_file: organism and filename of gmt file to use 101 | :return (pd.DataFrame, pd.DataFrame): positive and negative GSEA enrichments 102 | """ 103 | out_dir, out_prefix = os.path.split(self._output_stem) 104 | os.makedirs(out_dir, exist_ok=True) 105 | 106 | if self._rnk is None: 107 | self._save_rank_file() 108 | 109 | if not gmt_file: 110 | self._gmt_options() 111 | return 112 | else: 113 | if not len(gmt_file) == 2: 114 | raise ValueError('gmt_file should be a tuple of (organism, filename).') 115 | else: 116 | gmt_file = os.path.expanduser('~/.seqc/tools/{}/{}').format(*gmt_file) 117 | 118 | # Construct the GSEA call 119 | cmd = shlex.split( 120 | 'java -cp {user}/.seqc/tools/gsea2-2.2.1.jar -Xmx1g ' 121 | 'xtools.gsea.GseaPreranked -collapse false -mode Max_probe -norm meandiv ' 122 | '-nperm 1000 -include_only_symbols true -make_sets true -plot_top_x 0 ' 123 | '-set_max 500 -set_min 50 -zip_report false -gui false -rnk {rnk} ' 124 | '-rpt_label {out_prefix} -out {out_dir}/ -gmx {gmt_file}' 125 | ''.format(user=os.path.expanduser('~'), rnk=self._rnk, out_prefix=out_prefix, 126 | out_dir=out_dir, gmt_file=gmt_file)) 127 | 128 | # Call GSEA 129 | p = subprocess.Popen(cmd, stderr=subprocess.PIPE) 130 | _, err = p.communicate() 131 | 132 | # find the file that GSEA created 133 | if err: 134 | print(err.decode()) 135 | return 136 | else: 137 | pattern = '{p}.GseaPreranked.[0-9]*'.format(p=out_prefix) 138 | files = os.listdir(out_dir) 139 | folder = None 140 | for f in files: 141 | mo = re.match(pattern, f) 142 | if mo: 143 | folder = out_dir + '/' + mo.group(0) 144 | if folder is None: 145 | raise RuntimeError( 146 | 'seqc.JavaGSEA was not able to recover the output of the Java ' 147 | 'executable. This likely represents a bug.') 148 | 149 | # recover information from run 150 | names = ['size', 'es', 'nes', 'p', 'fdr_q', 'fwer_p', 'rank_at_max', 151 | 'leading_edge'] 152 | pos = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*pos*xls')[0], 153 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 154 | pos.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 155 | neg = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*neg*xls')[0], 156 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 157 | neg.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 158 | pos.columns, neg.columns = names, names 159 | self._results[gmt_file] = {'positive': pos, 'negative': neg} 160 | return list(self._results[gmt_file].values()) 161 | -------------------------------------------------------------------------------- /src/seqc/stats/mast.py: -------------------------------------------------------------------------------- 1 | import math 2 | import subprocess 3 | import imp 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def run_mast(counts_filtered, clustering_communities, output_prefix): 10 | # Differentially Expression Analysis using MAST 11 | log_counts = (counts_filtered + 1.0).applymap(math.log2) 12 | de_results = [] # array containing the differentially expression analysis for each cluster 13 | for c in range(np.max(clustering_communities) + 1): 14 | tmp_input_file = output_prefix + "_cluster_" + str(c) + "_mast_input.csv" 15 | tmp_output_file = output_prefix + "_cluster_" + str(c) + "_mast_results.csv" 16 | reduced_tdf1 = log_counts.iloc[np.where(clustering_communities == c)[0]] 17 | reduced_tdf2 = log_counts.iloc[np.where(clustering_communities != c)[0]] 18 | reduced_df = pd.concat([reduced_tdf1, reduced_tdf2]) 19 | reduced_df.index = pd.Index([1 if i < len(reduced_tdf1.index) else 0 for i in range(len(reduced_tdf1.index) + len(reduced_tdf2.index))]) 20 | reduced_df.to_csv(tmp_input_file) 21 | 22 | path_to_run_mast = imp.find_module('seqc')[1] 23 | args = 'Rscript {p} {i} {o}'.format(p=os.path.join(path_to_run_mast, 'run_mast.R'), i=tmp_input_file, o=tmp_output_file) 24 | with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as p: 25 | out, err = p.communicate() 26 | if os.path.isfile(tmp_output_file): 27 | de_gene_df = pd.read_csv(tmp_output_file) 28 | if len(de_gene_df.index) > 0: 29 | de_results.append(de_gene_df) 30 | else: # if no differentially expressed genes 31 | de_results.append(None) 32 | else: 33 | de_results.append(None) 34 | 35 | de_gene_list_file = output_prefix + "_de_gene_list.txt" 36 | with open(de_gene_list_file, "w") as f: 37 | f.write("Differential Expression Analysis Using MAST\n\n") 38 | c = 1 39 | for de_result in de_results: 40 | if de_result is not None: 41 | f.write("Differentially expressed genes for cluster %d:\n" % (c)) 42 | f.write("%-10s %-10s %-10s %-10s\n" % ("Gene", "p", "p.fdr", "logFC")) 43 | 44 | for i in range(len(de_result)): 45 | p_v = "%.2e" % de_result.loc[i][1] 46 | p_fdr = "%.2e" % de_result.loc[i][2] 47 | logFC = "%.2f" % de_result.loc[i][3] 48 | f.write("%-10s %-10s %-10s %-10s\n" % (de_result.loc[i][0], p_v, p_fdr, logFC)) 49 | else: 50 | f.write("No differentially expressed genes has been found for cluster %d.\n" % (c)) 51 | c += 1 52 | f.write("\n") 53 | f.close() 54 | return de_gene_list_file -------------------------------------------------------------------------------- /src/seqc/stats/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class PCA: 6 | 7 | def __init__(self, n_components=30): 8 | """ 9 | construct a model for Principle Component Analysis 10 | 11 | :param n_components: number of principle components to retain 12 | 13 | :property eigenvalues: stores the eigenvalues computed by fit() 14 | :property loadings: stores the eigenvectors of the pca decomposition computed by 15 | fit() 16 | :method fit: fit the model to the data 17 | :method transform: project the data onto a 
subset of the principle components 18 | (default: all components other than the first) 19 | :method fit_transform: fit and transform the data, returning the projected result 20 | """ 21 | self.n_components = n_components 22 | self.loadings = None 23 | self.eigenvalues = None 24 | 25 | def fit(self, data: np.ndarray or pd.DataFrame, fillna=0): 26 | """ 27 | Fit the model to data 28 | 29 | :param data: n observation x k feature data array 30 | :param fillna: fill np.NaN values with this value. If None, will not fill. 31 | :return: None 32 | """ 33 | 34 | if isinstance(data, pd.DataFrame): 35 | X = data.values 36 | elif isinstance(data, np.ndarray): 37 | X = data 38 | else: 39 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 40 | 41 | if fillna is not None: 42 | X[np.where(np.isnan(X))] = fillna 43 | X[np.where(np.isinf(X))] = fillna 44 | 45 | # Compute covariance matrix 46 | if X.shape[1] < X.shape[0]: 47 | C = np.cov(X, rowvar=False) 48 | # if N > D, we better use this matrix for the eigendecomposition 49 | else: 50 | C = np.multiply((1 / X.shape[0]), np.dot(X, X.T)) 51 | 52 | # Perform eigendecomposition of C 53 | C[np.where(np.isnan(C))] = 0 54 | C[np.where(np.isinf(C))] = 0 55 | l, M = np.linalg.eig(C) 56 | 57 | # Sort eigenvectors in descending order 58 | ind = np.argsort(l)[::-1] 59 | l = l[ind] 60 | if self.n_components < 1: 61 | self.n_components = ( 62 | np.where(np.cumsum(np.divide(l, np.sum(l)), axis=0) >= 63 | self.n_components)[0][0] + 1) 64 | print('Embedding into ' + str(self.n_components) + ' dimensions.') 65 | elif self.n_components > M.shape[1]: 66 | self.n_components = M.shape[1] 67 | print('Target dimensionality reduced to ' + str(self.n_components) + '.') 68 | 69 | M = M[:, ind[:self.n_components]] 70 | l = l[:self.n_components] 71 | 72 | # Apply mapping on the data 73 | if X.shape[1] >= X.shape[0]: 74 | M = np.multiply(np.dot(X.T, M), (1 / np.sqrt(X.shape[0] * l)).T) 75 | 76 | self.loadings = M 77 | self.eigenvalues = l 78 | 79 | def transform(self, data, components=None) -> np.ndarray or pd.DataFrame: 80 | """ 81 | Transform data using the fit PCA model. 
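        A hedged usage sketch (X is a hypothetical observations x features array):

            model = PCA(n_components=10)
            model.fit(X)
            reduced = model.transform(X)  # by default omits the first component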
82 | 83 | :param data: n observation x k feature data array 84 | :param components: components to retain when transforming 85 | data, if None, uses all components except for the first 86 | :return: np.ndarray containing transformed data 87 | """ 88 | 89 | if components is None: 90 | components = np.arange(1, self.n_components) 91 | 92 | projected = np.dot(data, self.loadings[:, components]) 93 | if isinstance(data, pd.DataFrame): 94 | return pd.DataFrame(projected, index=data.index, columns=components) 95 | else: 96 | return projected 97 | 98 | def fit_transform(self, data: np.ndarray or pd.DataFrame, n_components=None) -> \ 99 | np.ndarray or pd.DataFrame: 100 | """ 101 | Fit the model to data and transform the data using the fit model 102 | 103 | :param data: n observation x k feature data array 104 | :param n_components: number of components to retain when transforming 105 | data 106 | :return np.ndarray or pd.DataFrame: transformed data 107 | """ 108 | 109 | self.fit(data) 110 | return self.transform(data, components=n_components) 111 | -------------------------------------------------------------------------------- /src/seqc/stats/resampled_nonparametric.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | from multiprocessing import Pool 4 | from contextlib import closing 5 | from itertools import repeat 6 | import numpy as np 7 | import numpy.ma as ma 8 | import pandas as pd 9 | from scipy.stats.mstats import count_tied_groups, rankdata 10 | from scipy.stats.mstats import kruskalwallis as _kruskalwallis 11 | from scipy.special import erfc 12 | from statsmodels.sandbox.stats.multicomp import multipletests 13 | 14 | 15 | def get_memory(): 16 | """ 17 | """ 18 | return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 ** 3) 19 | 20 | 21 | def _mannwhitneyu(x, y, use_continuity=True): 22 | """ 23 | Computes the Mann-Whitney statistic 24 | Missing values in `x` and/or `y` are discarded. 25 | Parameters 26 | ---------- 27 | x : ndarray, 28 | Input, vector or observations x features matrix 29 | y : ndarray, 30 | Input, vector or observations x features matrix. If matrix, must have 31 | same number of features as x 32 | use_continuity : {True, False}, optional 33 | Whether a continuity correction (1/2.) should be taken into account. 34 | Returns 35 | ------- 36 | statistic : float 37 | The Mann-Whitney statistic 38 | approx z : float 39 | The normal-approximated z-score for U. 40 | pvalue : float 41 | Approximate p-value assuming a normal distribution. 42 | """ 43 | if x.ndim == 1 and y.ndim == 1: 44 | x, y = x[:, np.newaxis], y[:, np.newaxis] 45 | ranks = rankdata(np.concatenate([x, y]), axis=0) 46 | nx, ny = x.shape[0], y.shape[0] 47 | nt = nx + ny 48 | U = ranks[:nx].sum(0) - nx * (nx + 1) / 2. 49 | 50 | mu = (nx * ny) / 2. 51 | u = np.amin([U, nx*ny - U], axis=0) # get smaller U by convention 52 | 53 | sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12. 54 | 55 | for i in np.arange(len(sigsq)): 56 | ties = count_tied_groups(ranks[:, i]) 57 | sigsq[i] -= np.sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12. 58 | sigsq *= nx * ny / float(nt * (nt - 1)) 59 | 60 | if use_continuity: 61 | z = (U - 1 / 2. 
- mu) / np.sqrt(sigsq) 62 | else: 63 | z = (U - mu) / np.sqrt(sigsq) 64 | 65 | prob = erfc(abs(z) / np.sqrt(2)) 66 | return np.vstack([u, z, prob]).T 67 | 68 | 69 | def find_sampling_value(group_data, percentile): 70 | """ 71 | 72 | :param group_data: 73 | :param int percentile: 74 | :return: 75 | """ 76 | return min(np.percentile(g.sum(axis=1), percentile) for g in group_data) 77 | 78 | 79 | def normalize(data, downsample_value, upsample=False, labels=None): 80 | """ 81 | :param data: 82 | :param downsample_value: value to normalize cell counts to. In current implementation, 83 | a small number of cells (10%) are upsampled to this value. 84 | :param upsample: if False, all observations with size < downsample_value are excluded. 85 | if True, those cells are upsampled to downsample_value. 86 | :return: 87 | """ 88 | obs_size = data.sum(axis=1) 89 | if not upsample: 90 | keep = obs_size >= downsample_value 91 | data = data[keep, :] 92 | if labels is not None: 93 | labels = labels[keep] 94 | norm = (data * downsample_value) / data.sum(axis=1)[:, np.newaxis] 95 | if labels is not None: 96 | return norm, labels 97 | else: 98 | return norm 99 | 100 | 101 | def _draw_sample(normalized_data, n): 102 | """ 103 | :param normalized_data: 104 | :param n: 105 | """ 106 | np.random.seed() 107 | idx = np.random.randint(0, normalized_data.shape[0], n) 108 | sample = normalized_data[idx, :] 109 | p = np.random.sample(sample.shape) # round samples probabilistically 110 | 111 | return np.floor(sample) + (sample % 1 > p).astype(int) 112 | 113 | 114 | def _mw_sampling_function(norm_data, n_cell): 115 | """ 116 | :param norm_data: 117 | :param n_cell: 118 | :return: 119 | """ 120 | a, b = (_draw_sample(d, n_cell) for d in norm_data) 121 | return _mannwhitneyu(a, b) # dim = (n_genes, 3) 122 | 123 | 124 | def confidence_interval(z): 125 | """ 126 | 127 | :param z: 128 | :return: 129 | """ 130 | return np.percentile(z, [2.5, 97.5], axis=0).T 131 | 132 | 133 | def mannwhitneyu( 134 | x, y, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 135 | upsample=False): 136 | """ 137 | :param x: observations by features array or DataFrame (ndim must be 2, although there 138 | needn't be more than one feature) 139 | :param y: observations by features array or DataFrama. Features must be the same as x 140 | :param n_iter: number of times to sample x and y 141 | :param sampling_percentile: percentile to downsample to. observations with row sums 142 | lower than this value will be excluded 143 | :param alpha: significance threshold for FDR correction 144 | :param verbose: if True, report number of cells sampled in each iteration and the 145 | integer value to which cells are downsampled 146 | :param upsample: if False, cells with size lower than sampling_percentile are 147 | discarded. If True, those cells are upsampled. 148 | :return pd.DataFrame: DataFrame with columns: 149 | U: median u-statistic over the n_iter iterations of the test 150 | z_approx: median approximate tie-corrected z-score for the mann-whitney U-test 151 | z_lo: lower bound, 95% confidence interval over z 152 | z_hi: upper bound, 95% confidence interval over z 153 | p: p-value for z_approx 154 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 
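        A hedged usage sketch (cluster_a and cluster_b are hypothetical cells x
        genes DataFrames sharing the same gene columns):

            results = mannwhitneyu(cluster_a, cluster_b, n_iter=50)

        The returned frame contains one row per gene and is sorted by q-value.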
155 | """ 156 | 157 | # do some sanity checks on input data 158 | if isinstance(x, pd.DataFrame) and isinstance(y, pd.DataFrame): 159 | assert np.array_equal(x.columns, y.columns) 160 | labels = x.columns 161 | x = x.values 162 | y = y.values 163 | elif x.ndim > 1: 164 | assert x.shape[1] == y.shape[1] 165 | labels = None 166 | else: 167 | labels = None 168 | 169 | # calculate sampling values 170 | v = find_sampling_value([x, y], sampling_percentile) 171 | norm_data = [normalize(d, v, upsample) for d in [x, y]] 172 | n_cell = min(d.shape[0] for d in norm_data) 173 | sampling_function = partial(_mw_sampling_function, n_cell=n_cell) 174 | 175 | if verbose: # report sampling values 176 | print('sampling %d cells (with replacement) per iteration' % n_cell) 177 | print('sampling %d molecules per cell' % v) 178 | 179 | with closing(Pool()) as pool: 180 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 181 | 182 | results = np.stack(results) # u, z, p 183 | 184 | ci = confidence_interval(results[:, :, 1]) 185 | results = pd.DataFrame( 186 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 187 | index=labels, 188 | columns=['U', 'z_approx', 'p', 'z_lo', 'z_hi']) 189 | 190 | # add multiple-testing correction 191 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 192 | 193 | # remove low-value genes whose median sampling value is -inf 194 | neginf = np.isneginf(results['z_approx']) 195 | results.ix[neginf, 'z_lo'] = np.nan 196 | results.ix[neginf, 'z_approx'] = 0 197 | results.ix[neginf, ['p', 'q']] = 1. 198 | 199 | results = results[['U', 'z_approx', 'z_lo', 'z_hi', 'p', 'q']].sort_values('q') 200 | results.iloc[:, 1:4] = np.round(results.iloc[:, 1:4], 2) 201 | 202 | return results 203 | 204 | 205 | def _kw_sampling_function(data, splits, n_cell): 206 | data = [_draw_sample(d, n_cell) for d in np.split(data, splits)] 207 | return _kruskal(data) 208 | 209 | 210 | def _kruskal(data): 211 | """ 212 | Compute the Kruskal-Wallis H-test for independent samples 213 | Parameters 214 | ---------- 215 | sample1, sample2, ... : array_like 216 | Two or more arrays with the sample measurements can be given as 217 | arguments. 218 | Returns 219 | ------- 220 | statistic : float 221 | The Kruskal-Wallis H statistic, corrected for ties 222 | pvalue : float 223 | The p-value for the test using the assumption that H has a chi 224 | square distribution 225 | Notes 226 | ----- 227 | For more details on `kruskal`, see `stats.kruskal`. 228 | """ 229 | results = [] 230 | for i in np.arange(data[0].shape[1]): 231 | args = [d[:, i] for d in data] 232 | try: 233 | results.append(_kruskalwallis(*args)) 234 | except ValueError: 235 | results.append([0, 1.]) 236 | return results 237 | 238 | 239 | def category_to_numeric(labels): 240 | """transform categorical labels to a numeric array""" 241 | labels = np.array(labels) 242 | if np.issubdtype(labels.dtype, np.integer): 243 | return labels 244 | else: 245 | cats = np.unique(labels) 246 | map_ = dict(zip(cats, np.arange(cats.shape[0]))) 247 | return np.array([map_[i] for i in labels]) 248 | 249 | 250 | def kruskalwallis( 251 | data, labels, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 252 | upsample=False): 253 | """ 254 | :param data: np.ndarray or pd.DataFrame of observations x features 255 | :param labels: observation labels for categories to be compared 256 | :param n_iter: number of times to sample x and y 257 | :param sampling_percentile: percentile to downsample to. 
observations with row sums 258 | lower than this value will be excluded 259 | :param alpha: significance threshold for FDR correction 260 | :param verbose: if True, report number of cells sampled in each iteration and the 261 | integer value to which cells are downsampled 262 | :param upsample: if False, cells with size lower than sampling_percentile are 263 | discarded. If True, those cells are upsampled. 264 | :return pd.DataFrame: DataFrame with columns: 265 | H: median H-statistic over the n_iter iterations of the test 266 | H_lo: lower bound, 95% confidence interval over H 267 | H_hi: upper bound, 95% confidence interval over H 268 | p: p-value for the median H-statistic, using the chi-square approximation to 269 | the distribution of H 270 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 271 | """ 272 | 273 | if isinstance(data, pd.DataFrame): 274 | features = data.columns 275 | data = data.values 276 | elif isinstance(data, np.ndarray): 277 | features = None 278 | else: 279 | raise ValueError('data must be a np.ndarray or pd.DataFrame, not %s' % 280 | repr(type(data))) 281 | 282 | # if labels are not numeric, transform to numeric categories 283 | labels = category_to_numeric(labels) 284 | if not labels.shape[0] == data.shape[0]: 285 | raise ValueError('labels (shape=%s) must match dimension 0 of data (shape=%s)' % 286 | (repr(labels.shape), repr(data.shape))) 287 | 288 | idx = np.argsort(labels) 289 | data = data[idx, :] # will copy 290 | labels = labels[idx] 291 | 292 | splits = np.where(np.diff(labels))[0] + 1 293 | 294 | # calculate sampling values and downsample data 295 | v = find_sampling_value(np.split(data, splits), sampling_percentile) 296 | norm_data, labels = normalize(data, v, upsample, labels) 297 | 298 | splits = np.where(np.diff(labels))[0] + 1 # recompute; normalize() may have dropped cells 299 | 300 | n_cell = min(d.shape[0] for d in np.split(norm_data, splits)) 301 | sampling_function = partial(_kw_sampling_function, n_cell=n_cell, splits=splits) 302 | 303 | if verbose: # report sampling values 304 | print('sampling %d cells (with replacement) per iteration' % n_cell) 305 | print('sampling %d molecules per cell' % v) 306 | 307 | with closing(Pool()) as pool: 308 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 309 | 310 | results = np.stack(results) # H, p 311 | 312 | ci = confidence_interval(results[:, :, 0]) # around H 313 | results = pd.DataFrame( 314 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 315 | index=features, 316 | columns=['H', 'p', 'H_lo', 'H_hi']) 317 | 318 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 319 | results = results[['H', 'H_lo', 'H_hi', 'p', 'q']] 320 | return results 321 | 322 | -------------------------------------------------------------------------------- /src/seqc/stats/smoothing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import multiprocessing 4 | from sklearn.neighbors import NearestNeighbors 5 | 6 | 7 | class smoothing: 8 | """Data smoothing kernels 9 | 10 | :method kneighbors: transforms each observation (row) of data by setting it 11 | equal to the average of its k-nearest neighbors 12 | """ 13 | 14 | @staticmethod 15 | def kneighbors(data: np.ndarray or pd.DataFrame, n_neighbors=50, pca=None, **kwargs): 16 | """ 17 | Smooth gene expression values by setting the expression of each gene in each 18 | cell equal to the mean value of itself and its
n_neighbors nearest neighbors 19 | 20 | :param data: np.ndarray | pd.DataFrame; cells x genes array (observations in rows) 21 | :param n_neighbors: int; number of neighbors to smooth over 22 | :param pca: dimensionality reduced matrix, knn will be run on this and applied 23 | to data (runs much faster) 24 | :param kwargs: keyword arguments to pass to sklearn.neighbors.NearestNeighbors 25 | :return: np.ndarray | pd.DataFrame; same as input 26 | """ 27 | 28 | if isinstance(data, pd.DataFrame): 29 | data_ = data.values 30 | elif isinstance(data, np.ndarray): 31 | data_ = data 32 | else: 33 | raise TypeError("data must be a pd.DataFrame or np.ndarray") 34 | 35 | knn = NearestNeighbors( 36 | n_neighbors=n_neighbors, 37 | n_jobs=multiprocessing.cpu_count() - 1, 38 | **kwargs) 39 | 40 | if pca is not None: 41 | knn.fit(pca) 42 | inds = knn.kneighbors(pca, return_distance=False) 43 | else: 44 | knn.fit(data_) 45 | inds = knn.kneighbors(data_, return_distance=False) 46 | 47 | # smoothing creates large intermediates; break up to avoid memory errors 48 | pieces = [] 49 | num_partitions = int(np.round(data_.shape[0] / 2000)) + 1 50 | if num_partitions > 2: # 2 partitions produces start + end, need a third to split 51 | sep = np.linspace(0, data_.shape[0] + 1, num_partitions, dtype=int) 52 | for start, end in zip(sep, sep[1:]): 53 | pieces.append(data_[inds[start:end, :], :].mean(axis=1)) 54 | res = np.vstack(pieces) 55 | else: 56 | res = data_[inds, :].mean(axis=1) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | res = pd.DataFrame(res, index=data.index, columns=data.columns) 60 | 61 | return res 62 | -------------------------------------------------------------------------------- /src/seqc/stats/tree.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Tree: 4 | 5 | def __init__(self, id, left=None, right=None, dist=None): 6 | self.id = id 7 | self.left = left 8 | self.right = right 9 | self.dist = dist 10 | 11 | def __repr__(self): 12 | return '<Tree: id=%s, left=%s, right=%s, dist=%s>' % ( 13 | self.id, 14 | self.left.id if self.left is not None else None, 15 | self.right.id if self.right is not None else None, 16 | self.dist if self.dist is not None else None) 17 | 18 | @classmethod 19 | def from_linkage(cls, Z): 20 | current_id = Z.shape[0] * 2 21 | tree = {} 22 | for (left, right, dist, n_children) in Z[::-1]: 23 | tree[left] = Tree(id=left) 24 | tree[right] = Tree(id=right) 25 | if current_id not in tree: 26 | tree[current_id] = Tree(id=current_id, left=tree[left], right=tree[right], dist=dist) 27 | else: 28 | tree[current_id].left = tree[left] 29 | tree[current_id].right = tree[right] 30 | tree[current_id].dist = dist 31 | current_id -= 1 32 | return tree[max(tree.keys())] 33 | 34 | def is_leaf(self): 35 | return True if self.left is None and self.right is None else False 36 | 37 | @staticmethod 38 | def nodes2labels(nodes): 39 | return [n.id for n in nodes] 40 | 41 | def get_daughter(self, id_): 42 | for daughter in self.dfs(): 43 | if daughter.id == id_: 44 | return daughter 45 | return None 46 | 47 | def has_daughter(self, id_): 48 | for daughter in self.dfs(): 49 | if daughter.id == id_: 50 | return True 51 | return False 52 | 53 | def dfs(self): 54 | visited, stack = [], [self] 55 | while stack: 56 | vertex = stack.pop() 57 | yield vertex 58 | if vertex not in visited: 59 | visited.append(vertex) 60 | if vertex.left is not None: 61 | stack.append(vertex.left) 62 | if vertex.right is not None: 63 | stack.append(vertex.right) 64 | 65 | def bfs(self): 66 | visited, queue = [], [self] 67 | while queue: 68 | vertex = queue.pop(0) 69 | yield
vertex 70 | if vertex not in visited: 71 | visited.append(vertex) 72 | if vertex.left is not None: 73 | queue.append(vertex.left) 74 | if vertex.right is not None: 75 | queue.append(vertex.right) 76 | -------------------------------------------------------------------------------- /src/seqc/stats/tsne.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import bhtsne 4 | from seqc.stats.pca import PCA 5 | 6 | class TSNE: 7 | 8 | def __init__(self, n_components: int=2, run_pca: bool=False, 9 | n_pca_components: int=20, fillna: float=None, **kwargs): 10 | """ 11 | t-stochastic neighbor embedding 12 | 13 | 14 | :param normalize: if True, scales features to unit size 15 | :param run_pca: if True, runs PCA on the input data and runs tSNE on the 16 | components retained by PCA. 17 | :param n_components: number of tSNE components to return 18 | :param n_pca_components: number of components to which data should be projected, 19 | if run_pca is True 20 | :param fillna: fills np.nan values with this float value 21 | :param kwargs: additional keyword arguments to pass tsne 22 | 23 | :method fit_transform: fits the tSNE model to data and returns the transformed 24 | result 25 | 26 | """ 27 | 28 | self.run_pca = run_pca 29 | self.n_components = n_components 30 | self.n_pca_components = n_pca_components 31 | self.kwargs = kwargs 32 | self.tsne = None 33 | self.pca = None 34 | self.fillna = fillna 35 | 36 | def fit_transform(self, data: np.ndarray or pd.DataFrame) -> None: 37 | """ 38 | fit the tSNE model to data given the parameters provided during 39 | initialization and transform the output 40 | 41 | :param data: n observation x k feature data array 42 | :return np.ndarray or pd.DataFrame: tsne results 43 | """ 44 | if isinstance(data, pd.DataFrame): 45 | data_ = data.values 46 | else: 47 | data_ = data 48 | 49 | if self.fillna is not None: 50 | data_[np.where(np.isnan(data_))] = self.fillna 51 | data_[np.where(np.isinf(data_))] = self.fillna 52 | if self.run_pca: 53 | self.pca = PCA(n_components=self.n_pca_components) 54 | data_ = self.pca.fit_transform(data_) 55 | 56 | res = bhtsne.tsne(data_.astype(float), dimensions=self.n_components, **self.kwargs) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | self.tsne = pd.DataFrame(res, index=data.index) 60 | else: 61 | self.tsne = res 62 | return self.tsne 63 | -------------------------------------------------------------------------------- /src/seqc/stats/ttest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections.abc import Callable 3 | from multiprocessing import Pool, cpu_count 4 | from functools import partial 5 | from contextlib import closing 6 | from scipy.stats import t 7 | import pandas as pd 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | 11 | def estimate_multinomial(x): 12 | """estimate empirical multinomial expectation for a set of cells with each cell 13 | normalized to contribute equally to the expectation. 
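Concretely, each row (cell) of x is first converted to fractions of its row sum, and those fractions are then averaged across cells, i.e. theta_j = mean over cells i of (x_ij / sum_k x_ik), so that large cells do not dominate the estimate.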
14 | 15 | :param np.ndarray x: cell x gene array containing expression data 16 | :return np.ndarray: multinomial expectation over genes of x 17 | """ 18 | return (x / x.sum(axis=1)[:, np.newaxis]).mean(axis=0) 19 | 20 | 21 | def assert_input_non_negative(*args): 22 | """ 23 | :param [np.ndarray] args: input numpy arrays 24 | :return None: 25 | """ 26 | if any(np.any(np.less(a, 0)) for a in args): 27 | raise ValueError('input data must be non-negative') 28 | 29 | 30 | def _sampling_function(n_iter, n_molecules, theta, n_cells): 31 | """ 32 | sample mean and variance of n_cells multinomial draws, repeated n_iter times 33 | :param n_iter: number of mean/variance pairs to generate in this process 34 | :param n_molecules: number of molecules to draw for each sampled cell 35 | :param theta: multinomial probability vector over genes 36 | :param n_cells: number of cells to draw per iteration 37 | :return: (res_mu, res_var), each n_iter x genes; res_var is the variance of the sampled mean 38 | """ 39 | 40 | def online_mean_var(nb, mu_b, var_b, na, mu_a, var_a): # pool batch moments with running moments 41 | nx = na + nb 42 | delta = mu_b - mu_a 43 | mu_x_ = mu_a + delta * nb / nx 44 | var_x_ = (na * (var_a + mu_a ** 2) + nb * (var_b + mu_b ** 2)) / nx - mu_x_ ** 2 45 | return nx, mu_x_, var_x_ 46 | 47 | res_mu = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 48 | res_var = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 49 | n_cells //= 10 50 | for i in np.arange(n_iter): 51 | # break sampling (n_cells) into 10 pieces 52 | obs = np.random.multinomial(n_molecules, theta, n_cells) 53 | mu_x = np.mean(obs, axis=0) 54 | var_x = np.var(obs, axis=0) 55 | n_x = obs.shape[0] 56 | for _ in np.arange(9): 57 | obs = np.random.multinomial(n_molecules, theta, n_cells) 58 | mu = np.mean(obs, axis=0) 59 | var = np.var(obs, axis=0) 60 | n = obs.shape[0] 61 | n_x, mu_x, var_x = online_mean_var(n, mu, var, n_x, mu_x, var_x) 62 | res_mu[i, :] = mu_x 63 | res_var[i, :] = var_x / n_x 64 | return res_mu, res_var 65 | 66 | 67 | def sample_moments(mult_probability, n_samples, n_cells, n_molecules): 68 | """sample mean and variance of n_cells, each containing n_molecules. n_samples mean/ 69 | variance pairs are sampled on each call. 70 | 71 | :param mult_probability: multinomial probability vector over genes (see estimate_multinomial) 72 | :param n_samples: total number of mean/variance pairs to sample 73 | :param n_cells: number of cells drawn for each sample 74 | :param n_molecules: number of molecules drawn per sampled cell 75 | :return: (mu, var) arrays, each n_samples x genes 76 | """ 77 | 78 | # partition iterations among available compute cores 79 | ncpu = cpu_count() 80 | if n_samples > ncpu: 81 | samples_per_process = np.array([n_samples // ncpu] * ncpu) 82 | samples_per_process[:n_samples % ncpu] += 1 83 | else: 84 | samples_per_process = np.ones((n_samples,), dtype=int) 85 | 86 | # map iterations across compute cores 87 | sampler = partial( 88 | _sampling_function, n_molecules=n_molecules, theta=mult_probability, 89 | n_cells=n_cells) 90 | with closing(Pool(ncpu)) as pool: 91 | results = pool.map(sampler, samples_per_process) 92 | mu, var = (np.vstack(mats) for mats in zip(*results)) 93 | 94 | # all means should be finite 95 | assert np.sum(np.isnan(mu)) == 0 96 | 97 | # in cases where variance is np.nan, we can safely set the variance to zero since the 98 | # mean for that tissue will also be zero; this will eliminate singularities caused by 99 | # one tissue never expressing a protein.
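# (a gene that is entirely absent from one group then contributes zero variance for that group, and the Welch denominator is driven by the other group alone)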
100 | var[np.isnan(var)] = 0 101 | 102 | return mu, var 103 | 104 | 105 | def whelch_satterthwaite_df(a_var, b_var, a_n, b_n): 106 | t1 = a_var.mean(axis=0) 107 | t2 = b_var.mean(axis=0) 108 | numerator = (t1 / a_n + t2 / b_n) ** 2 109 | denominator = t1 ** 2 / (a_n ** 2 * (a_n - 1)) + t2 ** 2 / (b_n ** 2 * (b_n - 1)) 110 | df = numerator / denominator 111 | return df 112 | 113 | 114 | def whelchs_t(a_mu, a_var, b_mu, b_var, a_n, b_n): 115 | """ 116 | 117 | :param np.ndarray a_mu: 118 | :param np.ndarray a_var: 119 | :param np.ndarray b_mu: 120 | :param np.ndarray b_var: 121 | :param int a_n: 122 | :param int b_n: 123 | :return float, float: statistic and p-value 124 | """ 125 | df = whelch_satterthwaite_df(a_var, b_var, a_n, b_n) 126 | numerator = a_mu - b_mu # (samples, genes) 127 | denominator = np.sqrt(a_var + b_var) # (samples, genes) 128 | statistic = numerator / denominator # (samples, genes) 129 | 130 | # statistic has NaNs where there are no observations of a or b (DivideByZeroError) 131 | statistic[np.isnan(statistic)] = 0 132 | median_statistic = np.median(np.abs(statistic), axis=0) 133 | p = (1 - t.cdf(median_statistic, df)) * 2 # p-value 134 | ci_95 = np.percentile(np.abs(statistic), [2.5, 97.5], axis=0).T 135 | 136 | return median_statistic, p, ci_95 137 | 138 | 139 | def bootstrap_t(a, b, n_samples=100, n_cells=None, alpha=0.05, 140 | downsample_value_function=np.median, labels=None): 141 | """ 142 | 143 | :param np.ndarray a: 144 | :param np.ndarray b: 145 | :param int n_samples: 146 | :param int n_cells: 147 | :param float alpha: acceptable type-I error (default = 0.05) 148 | :param Callable downsample_value_function: function that identifies the number of 149 | molecules n to sample from a and b. the sampling number will be the minimum of the 150 | result across a and b. default = np.median. Other values include np.mean and np.max. 151 | :param labels: feature labels for columns of a & b 152 | :return (int, int) statistic, q_val: 153 | """ 154 | assert_input_non_negative(a, b) 155 | mult_a = estimate_multinomial(a) 156 | mult_b = estimate_multinomial(b) 157 | 158 | # get number of molecules to sample 159 | a_sizes = a.sum(axis=1) 160 | b_sizes = b.sum(axis=1) 161 | n_molecules = min( 162 | map(lambda x: downsample_value_function(x).astype(int), [a_sizes, b_sizes])) 163 | 164 | # set n_cells to the smaller of the two passed samples (e.g. if comparing two sets, 165 | # one with 130 cells, and one with 1902 cells, n_cells = 130). 
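# (drawing the same number of synthetic cells from both groups keeps the precision of the two sampled moment estimates comparable)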
166 | if n_cells is None: 167 | n_cells = min(a.shape[0], b.shape[0]) 168 | 169 | a_mu, a_var = sample_moments(mult_a, n_samples, n_cells, n_molecules) 170 | b_mu, b_var = sample_moments(mult_b, n_samples, n_cells, n_molecules) 171 | 172 | statistic, p, ci_95 = whelchs_t(a_mu, a_var, b_mu, b_var, a.shape[0], b.shape[0]) 173 | 174 | q = multipletests(p, alpha=alpha, method='fdr_tsbh')[1] 175 | 176 | results = pd.DataFrame( 177 | data=np.vstack([statistic, ci_95.T, p, q]).T, 178 | index=labels, 179 | columns=['t', 't_ci95_low', 't_ci95_high', 'p', 'q']) 180 | 181 | return results 182 | -------------------------------------------------------------------------------- /src/seqc/summary/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/css/simple-sidebar.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Start Bootstrap - Simple Sidebar (http://startbootstrap.com/) 3 | * Copyright 2013-2016 Start Bootstrap 4 | * Licensed under MIT (https://github.com/BlackrockDigital/startbootstrap/blob/gh-pages/LICENSE) 5 | */ 6 | 7 | body { 8 | overflow-x: hidden; 9 | } 10 | 11 | /* Toggle Styles */ 12 | 13 | #wrapper { 14 | padding-left: 0; 15 | -webkit-transition: all 0.5s ease; 16 | -moz-transition: all 0.5s ease; 17 | -o-transition: all 0.5s ease; 18 | transition: all 0.5s ease; 19 | } 20 | 21 | #wrapper.toggled { 22 | padding-left: 250px; 23 | } 24 | 25 | #sidebar-wrapper { 26 | z-index: 1000; 27 | position: fixed; 28 | left: 250px; 29 | width: 0; 30 | height: 100%; 31 | margin-left: -250px; 32 | overflow-y: auto; 33 | background: #000; 34 | -webkit-transition: all 0.5s ease; 35 | -moz-transition: all 0.5s ease; 36 | -o-transition: all 0.5s ease; 37 | transition: all 0.5s ease; 38 | } 39 | 40 | #wrapper.toggled #sidebar-wrapper { 41 | width: 250px; 42 | } 43 | 44 | #page-content-wrapper { 45 | width: 100%; 46 | position: absolute; 47 | padding: 15px; 48 | } 49 | 50 | #wrapper.toggled #page-content-wrapper { 51 | position: absolute; 52 | margin-right: -250px; 53 | } 54 | 55 | /* Sidebar Styles */ 56 | 57 | .sidebar-nav { 58 | position: absolute; 59 | top: 0; 60 | width: 250px; 61 | margin: 0; 62 | padding: 0; 63 | list-style: none; 64 | } 65 | 66 | .sidebar-nav li { 67 | text-indent: 20px; 68 | line-height: 40px; 69 | } 70 | 71 | .sidebar-nav li a { 72 | display: block; 73 | text-decoration: none; 74 | color: #999999; 75 | } 76 | 77 | .sidebar-nav li a:hover { 78 | text-decoration: none; 79 | color: #fff; 80 | background: rgba(255,255,255,0.2); 81 | } 82 | 83 | .sidebar-nav li a:active, 84 | .sidebar-nav li a:focus { 85 | text-decoration: none; 86 | } 87 | 88 | .sidebar-nav > .sidebar-brand { 89 | height: 65px; 90 | font-size: 18px; 91 | line-height: 60px; 92 | } 93 | 94 | .sidebar-nav > .sidebar-brand a { 95 | color: #999999; 96 | } 97 | 98 | .sidebar-nav > .sidebar-brand a:hover { 99 | color: #fff; 100 | background: none; 101 | } 102 | 103 | @media(min-width:768px) { 104 | #wrapper { 105 | padding-left: 250px; 106 | } 107 | 108 | #wrapper.toggled { 109 | padding-left: 0; 110 | } 111 | 112 | #sidebar-wrapper { 113 | width: 250px; 114 | } 115 | 116 | #wrapper.toggled #sidebar-wrapper { 117 | width: 0; 118 | } 119 | 120 | #page-content-wrapper { 121 | padding: 20px; 122 | position: relative; 123 | } 
124 | 125 | #wrapper.toggled #page-content-wrapper { 126 | position: relative; 127 | margin-right: 0; 128 | } 129 | } -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/seqc/summary/html_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/html_/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/img/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/img/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/static/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ambrosejcarr/seqc/21ef6736638a5f05b263876dcc23012faa157100/src/seqc/summary/static/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/templates/mini_summary_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{output_prefix}} Mini Summary 4 | 5 | 6 | 7 |

{{output_prefix}} Mini Summary

8 |

Overall Statistics

9 | 10 | 11 | {% if mini_summary_d['uniqmapped_pct'] == 'N/A' %} 12 | 13 | 14 | 15 | 16 | {% else %} 17 | 18 | 19 | 20 | 21 | {% endif %} 22 | 23 | 24 | 25 | 26 | 27 | 28 | {% if 'mt_rna_fraction' in mini_summary_d %} 29 | 30 | {% endif %} 31 |
# Reads:{{mini_summary_d['n_reads']}}
% of uniquely mapped reads:N/A
% of multi-mapped reads:N/A
% of unmapped reads:N/A
% of filtered reads mapping to genome:N/A
% of uniquely mapped reads:{{'%.2f%%' % mini_summary_d['uniqmapped_pct']}}
% of multi-mapped reads:{{'%.2f%%' % mini_summary_d['multimapped_pct']}}
% of unmapped reads:{{'%.2f%%' % mini_summary_d['unmapped_pct']}}
% of filtered reads mapping to genome:{{'%.2f%%' % mini_summary_d['genomic_read_pct']}}
Sequencing saturation rate:{{'%.2f%%' % mini_summary_d['seq_sat_rate']}}
 
# Cells:{{'%d' % mini_summary_d['n_cells']}}
Median molecules per cell:{{'%d' % mini_summary_d['med_molcs_per_cell']}}
Average reads per cell:{{'%d' % mini_summary_d['avg_reads_per_cell']}}
Average reads per molecule:{{'%.2f' % mini_summary_d['avg_reads_per_molc']}}
% of cells filtered by high mt-RNA content:{{'%.2f%%' % mini_summary_d['mt_rna_fraction']}}
32 | 33 |

Cell Size Distribution

34 |
35 | 36 |
37 |

Filtering

38 | Indian red indicates cells that have been filtered
39 |
40 | 41 | 42 |
43 |

PCA Components

44 |
45 | 46 |
47 |

Phenograph Clustering

48 | Library size has been regressed out of all PCA components. We ran the Phenograph clustering algorithm on the dataset with the revised PCA components and 80 nearest neighbors.

49 |
50 | 51 |

Warnings

52 | 53 | {% for w,m in warning_d.items() %} 54 | 55 | {% endfor %} 56 |
{{w}}:{{m}}
57 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | SEQC report 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | 46 | 47 | 48 | 49 |
50 | {% block content %}{% endblock %} 51 |
52 | 53 |
54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_content.html: -------------------------------------------------------------------------------- 1 | {% extends "section_base.html" %} 2 | {% block content %} 3 |

{{section.name}}

4 | 5 |
6 | {% for name, c in section.content.items() %} 7 | 8 |

{{name}}

9 | 10 | {% if c.keys is defined %} 11 |
12 | {% for k in c.keys %} 13 | {{k}}
14 | {% endfor %} 15 |
16 |
17 | {% for v in c.values %} 18 | {{v}}
19 | {% endfor %} 20 |
21 | {% elif c.text is defined %} 22 |
23 | {{c.text}} 24 |
25 | {% elif c.image is defined %} 26 |
27 | {{c.caption}} 28 |
29 |
30 | {{c.legend}} 31 |
32 | {% endif %} 33 | 34 | {% endfor %} 35 |
36 | {% endblock %} -------------------------------------------------------------------------------- /src/seqc/summary/test.py: -------------------------------------------------------------------------------- 1 | import nose2 2 | import unittest 3 | from seqc.summary import summary 4 | from collections import OrderedDict 5 | 6 | 7 | class TestSummary(unittest.TestCase): 8 | 9 | def test_render_section(self): 10 | s1 = summary.Section.from_alignment_summary( 11 | '/var/folders/y3/ysxvl2w921d881nfpvx5ypvh0000gn/T/seqc/test_no_aws_in_drop_v2' 12 | '/alignment_summary.txt') 13 | s1.render('./src/seqc/summary/test_summary.html') 14 | 15 | if __name__ == "__main__": 16 | nose2.main() 17 | -------------------------------------------------------------------------------- /src/seqc/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" 2 | --------------------------------------------------------------------------------