├── .circleci └── config.yml ├── .github ├── ISSUE_TEMPLATE │ └── need-some-support.md └── workflows │ └── python-app.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── README.md ├── install-SUSE.md ├── install-dev.md └── run-test.md ├── nose2.cfg ├── repackage.py ├── requirements.txt ├── setup.py └── src ├── scripts └── SEQC └── seqc ├── __init__.py ├── alignment ├── __init__.py ├── sam.py └── star.py ├── barcode_correction.py ├── core ├── __init__.py ├── download.py ├── index.py ├── instances.py ├── main.py ├── notebook.py ├── parser.py ├── progress.py ├── run.py ├── start.py ├── terminate.py └── verify.py ├── distance.py ├── ec2.py ├── email_.py ├── exceptions.py ├── filter.py ├── gene_info.py ├── h5.py ├── io.py ├── log.py ├── multialignment.py ├── notebooks ├── __init__.py ├── analysis_template.json ├── notebooks.py └── test_notebooks.py ├── platforms.py ├── plot.py ├── read_array.py ├── reader.py ├── rmt_correction.py ├── run_mast.R ├── sequence ├── __init__.py ├── barcodes.py ├── encodings.py ├── fastq.py ├── gtf.py └── index.py ├── sparse_frame.py ├── stats ├── __init__.py ├── anova.py ├── correlation.py ├── experimental_yield.py ├── g_test.py ├── graph_diffusion.py ├── gsea.py ├── mast.py ├── pca.py ├── resampled_nonparametric.py ├── smoothing.py ├── tree.py ├── tsne.py └── ttest.py ├── summary ├── __init__.py ├── css │ ├── bootstrap.css │ ├── bootstrap.min.css │ └── simple-sidebar.css ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 ├── html_ │ └── __init__.py ├── img │ └── __init__.py ├── js │ ├── bootstrap.js │ ├── bootstrap.min.js │ └── jquery.js ├── static │ └── __init__.py ├── summary.py ├── templates │ ├── mini_summary_base.html │ ├── section_base.html │ └── section_content.html └── test.py ├── tests ├── __init__.py ├── test_args.py ├── test_dataset.py ├── test_index.py ├── test_run_e2e_local.py ├── test_run_e2e_remote.py ├── test_run_gtf.py ├── test_run_readarray.py └── test_run_rmt_correction.py └── version.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | - python/load-cache 12 | - run: 13 | name: Install cython/numpy/bhtsne 14 | command: | 15 | pip install Cython 16 | pip install numpy 17 | pip install bhtsne 18 | - python/install-deps 19 | - python/save-cache 20 | - run: 21 | name: Install seqc 22 | command: pip install . 23 | - run: 24 | name: Test 25 | command: | 26 | export TMPDIR="/tmp" 27 | python -m nose2 -s src/seqc/tests test_run_rmt_correction 28 | 29 | 30 | workflows: 31 | main: 32 | jobs: 33 | - build-and-test 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/need-some-support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Need some support 3 | about: Ask for help or create a bug report 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. ... 16 | 2. ... 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 
20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Assay** 25 | - 10x v2/v3, Drop-seq, In-drop, ... 26 | 27 | **Runtime Environment** 28 | - SEQC Version [e.g. v0.2.2] 29 | - HPC/AWS/GCP/Desktop 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install flake8 pytest 23 | pip install Cython 24 | pip install numpy 25 | pip install bhtsne 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | - name: Lint with flake8 28 | run: | 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 32 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 33 | - name: Install SEQC 34 | run: pip install . 35 | - name: Test with nose2 36 | run: | 37 | export TMPDIR="/tmp" 38 | nose2 -s src/seqc/tests test_run_rmt_correction 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg* 2 | *.idea* 3 | *__pycache__* 4 | .idea* 5 | testfiles* 6 | *.DS_Store* 7 | *seqc.log 8 | build/* 9 | dist/* 10 | .project 11 | .pydevproject 12 | .c9/ 13 | test-data/ 14 | dask-worker-space/ 15 | 16 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/seqc/summary/*/*.css 2 | include src/seqc/summary/fonts/* 3 | include src/seqc/summary/*/*.py 4 | include src/seqc/summary/*/*.js 5 | include src/seqc/summary/*/*.html 6 | include src/seqc/notebooks/*.json 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SEquence Quality Control (SEQC -- /sek-si:/) 2 | 3 | ## Overview: 4 | 5 | SEQC is a Python package that processes single-cell sequencing data in the cloud and analyzes it interactively on your local machine. 6 | 7 | To facilitate easy installation and use, we have made available Amazon Machine Images (AMIs) that come with all of SEQC's dependencies pre-installed. In addition, we have uploaded common genome indices (`-i/--index` parameter) and barcode data (`--barcode-files`) to public Amazon S3 repositories. These links can be provided to SEQC and it will automatically fetch them prior to initiating an analysis run. Finally, it can fetch input data directly from BaseSpace or Amazon S3 for analysis.
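For example, input can be pulled directly from BaseSpace by passing a sample identifier together with an OAuth token via the `--basespace` and `--basespace-token` options (a sketch only: the sample id, token, and results bucket below are placeholders, and the remaining options mirror the worked examples later in this README):

```bash
# sketch: <basespace-sample-id>, <oauth-token>, and s3://your-bucket/ are placeholders
SEQC run ten_x_v2 \
    --basespace <basespace-sample-id> \
    --basespace-token <oauth-token> \
    --index s3://seqc-public/genomes/hg38_long_polya/ \
    --barcode-files s3://seqc-public/barcodes/ten_x_v2/flat/ \
    --upload-prefix s3://your-bucket/seqc-results/ \
    --output-prefix PBMC
```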
8 | 9 | For users with access to in-house compute clusters, SEQC can be installed on your systems and run using the `--local` parameter. 10 | 11 | ## Dependencies: 12 | 13 | ### Python 3 14 | 15 | Python3 must be installed on your local machine to run SEQC. We recommend installing Python3 through Miniconda (https://docs.conda.io/en/latest/miniconda.html). 16 | 17 | ### Python 3 Libraries 18 | 19 | We recommend creating a virtual environment before installing anything: 20 | 21 | ```bash 22 | conda create -n seqc python=3.7.7 pip 23 | conda activate seqc 24 | ``` 25 | 26 | ```bash 27 | pip install Cython 28 | pip install numpy 29 | pip install bhtsne 30 | ``` 31 | 32 | ### STAR, Samtools, and HDF5 33 | 34 | To process data locally using SEQC, you must install the STAR Aligner, Samtools, and hdf5. If you only intend to use SEQC to trigger remote processing on AWS, these dependencies are optional. We recommend installing samtools and hdf5 through your package manager, if possible. 35 | 36 | ## SEQC Installation 37 | 38 | Once all dependencies have been installed, SEQC can be installed by running: 39 | 40 | ```bash 41 | export SEQC_VERSION="0.2.11" 42 | wget https://github.com/hisplan/seqc/archive/v${SEQC_VERSION}.tar.gz 43 | tar xvzf v${SEQC_VERSION}.tar.gz 44 | cd seqc-${SEQC_VERSION} 45 | pip install . 46 | ``` 47 | 48 | ## Hardware Requirements: 49 | 50 | For processing a single lane (~200M reads) against human- and mouse-scale genomes, SEQC requires 30GB RAM, approximately 200GB free hard drive space, and scales linearly with additional compute cores. If running on AWS (see below), jobs are automatically scaled up or down according to the size of the input. There are no hardware requirements for the computer used to launch remote instances. 51 | 52 | ## Running SEQC on Local Machine: 53 | 54 | Download an example dataset (1k PBMCs from a healthy donor; freely available at 10x Genomics https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_1k_v3): 55 | 56 | ```bash 57 | wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_fastqs.tar 58 | tar xvf pbmc_1k_v3_fastqs.tar 59 | ``` 60 | 61 | Move R1 FASTQ files to the `barcode` folder and R2 FASTQ files to the `genomic` folder: 62 | 63 | ```bash 64 | mkdir barcode 65 | mkdir genomic 66 | mv ./pbmc_1k_v3_fastqs/*R1*.fastq.gz barcode/ 67 | mv ./pbmc_1k_v3_fastqs/*R2*.fastq.gz genomic/ 68 | ``` 69 | 70 | Download the 10x barcode whitelist file: 71 | 72 | ```bash 73 | mkdir whitelist 74 | wget https://seqc-public.s3.amazonaws.com/barcodes/ten_x_v3/flat/3M-february-2018.txt 75 | mv 3M-february-2018.txt ./whitelist/ 76 | ``` 77 | 78 | The resulting directory structure should look something like this: 79 | 80 | ``` 81 | . 
82 | ├── barcode 83 | │   ├── pbmc_1k_v3_S1_L001_R1_001.fastq.gz 84 | │   └── pbmc_1k_v3_S1_L002_R1_001.fastq.gz 85 | ├── genomic 86 | │   ├── pbmc_1k_v3_S1_L001_R2_001.fastq.gz 87 | │   └── pbmc_1k_v3_S1_L002_R2_001.fastq.gz 88 | ├── pbmc_1k_v3_fastqs 89 | │   ├── pbmc_1k_v3_S1_L001_I1_001.fastq.gz 90 | │   └── pbmc_1k_v3_S1_L002_I1_001.fastq.gz 91 | ├── pbmc_1k_v3_fastqs.tar 92 | └── whitelist 93 | └── 3M-february-2018.txt 94 | ``` 95 | 96 | Create a reference package (STAR index + gene annotation): 97 | 98 | ```bash 99 | SEQC index \ 100 | --organism homo_sapiens \ 101 | --ensemble-release 93 \ 102 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 103 | --read-length 101 \ 104 | --folder index \ 105 | --local 106 | ``` 107 | 108 | Run SEQC: 109 | 110 | ```bash 111 | export AWS_DEFAULT_REGION=us-east-1 112 | export SEQC_MAX_WORKERS=7 113 | 114 | SEQC run ten_x_v3 \ 115 | --index ./index/ \ 116 | --barcode-files ./whitelist/ \ 117 | --barcode-fastq ./barcode/ \ 118 | --genomic-fastq ./genomic/ \ 119 | --output-prefix PBMC \ 120 | --no-filter-low-coverage \ 121 | --min-poly-t 0 \ 122 | --star-args runRNGseed=0 \ 123 | --local 124 | ``` 125 | 126 | ## Running SEQC on Amazon Web Services: 127 | 128 | SEQC can be run on any unix-based operating system, however it also features the ability to automatically spawn Amazon Web Services instances to process your data. 129 | 130 | 1. Set up an AWS account 131 | 2. Install and configure AWS CLI 132 | 3. Create and upload an rsa-key for AWS 133 | 134 | Run SEQC: 135 | 136 | ```bash 137 | SEQC run ten_x_v2 \ 138 | --ami-id ami-08652ee2477761403 \ 139 | --user-tags Job:Test,Project:PBMC-Test,Sample:pbmc_1k_v3 \ 140 | --index s3://seqc-public/genomes/hg38_long_polya/ \ 141 | --barcode-files s3://seqc-public/barcodes/ten_x_v2/flat/ \ 142 | --genomic-fastq s3://.../genomic/ \ 143 | --barcode-fastq s3://.../barcode/ \ 144 | --upload-prefix s3://.../seqc-results/ \ 145 | --output-prefix PBMC \ 146 | --no-filter-low-coverage \ 147 | --min-poly-t 0 \ 148 | --star-args runRNGseed=0 149 | ``` 150 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # docs 2 | 3 | ## Developers 4 | 5 | - [Environment setup for development](./install-dev.md) 6 | - [Running test](./run-test.md) 7 | 8 | 9 | ## Generating Reference Packages 10 | 11 | This generates a reference package (STAR index and GTF) using SEQC. 
12 | 13 | - Ensembl 85 14 | - Gene annotation file that contains only the reference chromosomes (no scaffolds, no patches) 15 | - Only these biotypes: 'protein_coding', 'lincRNA', 'IG_V_gene', 'IG_C_gene', 'IG_J_gene', 'TR_C_gene', 'TR_J_gene', 'TR_V_gene', 'TR_D_gene', 'IG_D_gene' 16 | - Not passing anything to `--additional-id-types` 17 | - Setting the read length to 101 (internally, this becomes 100) 18 | 19 | ### Local 20 | 21 | ```bash 22 | SEQC index \ 23 | -o homo_sapiens \ 24 | -f homo_sapiens \ 25 | --ensemble-release 85 \ 26 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 27 | --read-length 101 \ 28 | --folder ./test-data/index/ \ 29 | --local 30 | ``` 31 | 32 | ### AWS 33 | 34 | ```bash 35 | SEQC index \ 36 | -o homo_sapiens \ 37 | -f homo_sapiens \ 38 | --ensemble-release 85 \ 39 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 40 | --read-length 101 \ 41 | --upload-prefix s3://dp-lab-test/seqc/index/86/ \ 42 | --rsa-key ~/dpeerlab-chunj.pem \ 43 | --ami-id ami-037cc8c1417e197c1 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/install-SUSE.md: -------------------------------------------------------------------------------- 1 | # Installation for SUSE 2 | 3 | This was tested with AWS SUSE Linux Enterprise Server 15 SP1 (HVM). 4 | 5 | ## Install gcc & c++ 6 | 7 | ```bash 8 | sudo zypper in gcc-c++ 9 | ``` 10 | 11 | ## Install Miniconda 12 | 13 | ```bash 14 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 15 | bash Miniconda3-latest-Linux-x86_64.sh 16 | ``` 17 | 18 | For more information: 19 | - https://docs.conda.io/en/latest/miniconda.html 20 | - https://conda.io/projects/conda/en/latest/user-guide/install/linux.html#install-linux-silent 21 | 22 | Log out log back in. 23 | 24 | ## Create a Virtual Environment 25 | 26 | ```bash 27 | conda create -n seqc python=3.7.7 pip 28 | conda activate seqc 29 | ``` 30 | 31 | ## Install dependencies 32 | 33 | ``` 34 | pip install Cython 35 | pip install numpy 36 | pip install bhtsne 37 | 38 | conda install -c anaconda hdf5 39 | conda install -c bioconda samtools 40 | conda install -c bioconda star 41 | ``` 42 | 43 | ## Install SEQC 44 | 45 | ``` 46 | wget https://github.com/dpeerlab/seqc/archive/v0.2.11.tar.gz 47 | tar xvzf v0.2.11.tar.gz 48 | cd seqc-0.2.11/ 49 | pip install . 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/install-dev.md: -------------------------------------------------------------------------------- 1 | # Setup for Development 2 | 3 | Last verified: Jun 4, 2020 4 | 5 | ## Create Conda Environment 6 | 7 | ```bash 8 | conda create -n seqc-dev python=3.7.7 pip 9 | conda activate seqc-dev 10 | ``` 11 | 12 | ## Install Dependencies 13 | 14 | ```bash 15 | pip install Cython 16 | pip install numpy 17 | pip install bhtsne 18 | ``` 19 | 20 | For Mac (Mojave 10.14.6), install the following additional components. You must have `brew` to install. 21 | 22 | ``` 23 | brew install cairo 24 | brew install pango 25 | ``` 26 | 27 | ## Install SEQC (editable mode) 28 | 29 | ```bash 30 | pip install --editable . 
31 | ``` 32 | 33 | ## Install STAR 34 | 35 | ```bash 36 | curl -OL https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz 37 | tar -xf 2.5.3a.tar.gz 38 | cp STAR-2.5.3a/bin/MacOSX_x86_64/STAR /usr/local/bin/ 39 | ``` 40 | 41 | ## Install samtools 42 | 43 | ```bash 44 | conda install -c bioconda samtools=1.3.1 45 | ``` 46 | 47 | ## Install Packages for Testing 48 | 49 | ```bash 50 | pip install nose 51 | ``` 52 | 53 | ## Install Packages for Linting and Formating 54 | 55 | ```bash 56 | pip install pylint 57 | pip install autopep8 58 | pip install black 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/run-test.md: -------------------------------------------------------------------------------- 1 | # Running Test 2 | 3 | ## Setup 4 | 5 | Set the following environment variables: 6 | 7 | ```bash 8 | export SEQC_TEST_RSA_KEY=/Users/chunj/dpeerlab-chunj.pem 9 | export SEQC_TEST_EMAIL=jaeyoung.chun@gmail.com 10 | export SEQC_TEST_AMI_ID=ami-037cc8c1417e197c1 11 | ``` 12 | 13 | For local test, download test data in S3 to your test machine: 14 | 15 | ``` 16 | aws s3 sync s3://seqc-public/test/ten_x_v2/ ./test-data/datasets/ten_x_v2/ 17 | aws s3 sync s3://seqc-public/barcodes/ten_x_v2/ ./test-data/datasets/barcodes/ten_x_v2/ 18 | aws s3 sync s3://seqc-public/genomes/hg38_chr19/ ./test-data/datasets/genomes/hg38_chr19/ 19 | ``` 20 | 21 | ## Test Everything 22 | 23 | Runs tests based on `nose2.cfg`: 24 | 25 | ```bash 26 | nose2 27 | ``` 28 | 29 | ## SEQC index 30 | 31 | ```bash 32 | nose2 -s src/seqc/tests test_index 33 | ``` 34 | 35 | Besides the nose2 test results, actual SEQC output files can be found here, for example: 36 | 37 | ``` 38 | s3://dp-lab-cicd/seqc/index-ciona_intestinalis-0d19e818-7623-4a1d-bac3-a8c9e3be1e3e/ 39 | ``` 40 | 41 | ## SEQC run 42 | 43 | ### Local 44 | 45 | SEQC will run with `--local`. 46 | 47 | ```bash 48 | nose2 -s src/seqc/tests test_run_e2e_local 49 | ``` 50 | 51 | ### Remote 52 | 53 | SEQC will run on AWS. 54 | 55 | The following will generate a package that can be uploaded to AWS EC2 for testing: 56 | 57 | ```bash 58 | python repackage.py 59 | ``` 60 | 61 | ```bash 62 | nose2 -s src/seqc/tests test_run_e2e_remote 63 | ``` 64 | 65 | Besides the nose2 test results, actual SEQC output files can be found here, for example: 66 | 67 | ``` 68 | s3://dp-lab-cicd/seqc/run-in_drop_v2-a997b408-f883-4ba2-9941-8b541e319850/ 69 | ``` 70 | 71 | ### Clean Up 72 | 73 | ```bash 74 | aws s3 rm s3://dp-lab-cicd/seqc/ --recursive 75 | ``` 76 | -------------------------------------------------------------------------------- /nose2.cfg: -------------------------------------------------------------------------------- 1 | [unittest] 2 | start-dir = src/seqc/tests 3 | test-file-pattern = test_*.py 4 | test-method-prefix = test 5 | -------------------------------------------------------------------------------- /repackage.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | def ignore_test_and_tools(dir_, files): 8 | """Filter files to be moved by shutil.copytree. Ignore any hidden file and the 9 | test and tools directories, which are not needed by the remote instance. 10 | :param dir_: dummy variable, must be present to be passed to shutil.copytree() 11 | :param files: output of os.listdir(), files to be subjected to filtering 12 | :return list: list of files that should be filtered, and not copied. 
13 | """ 14 | return [ 15 | f 16 | for f in files 17 | if (f == "test" or f == "test-data" or f == "__pycache__" or f.startswith(".")) 18 | ] 19 | 20 | 21 | setup_dir = os.path.dirname(os.path.realpath(__file__)) 22 | seqc_dir = os.path.expanduser("~/.seqc/seqc") 23 | 24 | print("setup_dir: {}".format(setup_dir)) 25 | print("seqc_dir: {}".format(seqc_dir)) 26 | 27 | # delete the existing one 28 | if os.path.isdir(seqc_dir): 29 | shutil.rmtree(seqc_dir) 30 | 31 | # copy the SEQC files in the working directory to ~/.seqc/seqc 32 | shutil.copytree(setup_dir, seqc_dir, ignore=ignore_test_and_tools) 33 | 34 | # create .tar.gz of ~/.seqc/seqc/* 35 | shutil.make_archive(base_name=seqc_dir, format="gztar", root_dir=seqc_dir) 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>0.14 2 | numpy>=1.10.0 3 | bhtsne 4 | wikipedia 5 | awscli 6 | numexpr>=2.4 7 | pandas>=1.0.4 8 | paramiko>=2.0.2 9 | regex 10 | requests 11 | nose2 12 | scipy>=1.5.1 13 | boto3 14 | intervaltree 15 | matplotlib 16 | tinydb 17 | tables 18 | fastcluster 19 | statsmodels==0.11.1 20 | ecdsa 21 | jupyter 22 | jinja2 23 | pycrypto 24 | cairocffi==0.8.0 25 | weasyprint==0.42.2 26 | scikit_learn>=0.17 27 | tqdm 28 | pendulum 29 | dask>=2.25.0 30 | distributed>=2.25.0 31 | dill>=0.3.2 32 | bokeh>=2.1.1 33 | numba~=0.51.2 34 | PhenoGraph>=1.5.7 35 | magic@https://github.com/dpeerlab/magic/archive/v0.1.1.tar.gz 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from subprocess import call 5 | from setuptools import setup 6 | from warnings import warn 7 | import py_compile 8 | from pathlib import Path 9 | 10 | 11 | # Replace py_compile.compile with a function that calls it with doraise=True 12 | # so stop when there is a syntax error 13 | orig_py_compile = py_compile.compile 14 | 15 | 16 | def doraise_py_compile(file, cfile=None, dfile=None, doraise=False): 17 | orig_py_compile(file, cfile=cfile, dfile=dfile, doraise=True) 18 | 19 | 20 | py_compile.compile = doraise_py_compile 21 | 22 | if sys.version_info.major != 3: 23 | raise RuntimeError("SEQC requires Python 3") 24 | if sys.version_info.minor < 5: 25 | warn("Multiprocessing analysis methods may not function on Python versions < 3.5") 26 | 27 | main_ns = {} 28 | 29 | # get version 30 | with open("src/seqc/version.py") as f: 31 | exec(f.read(), main_ns) 32 | 33 | setup( 34 | name="seqc", 35 | version=main_ns["__version__"], 36 | description="Single Cell Sequencing Processing and QC Suite", 37 | author="Ambrose J. Carr", 38 | author_email="mail@ambrosejcarr.com", 39 | package_dir={"": "src"}, 40 | package_data={"": ["*.r", "*.R"]}, 41 | packages=[ 42 | "seqc", 43 | "seqc.sequence", 44 | "seqc.alignment", 45 | "seqc.core", 46 | "seqc.stats", 47 | "seqc.summary", 48 | "seqc.notebooks", 49 | ], 50 | install_requires=[ 51 | dep.strip() for dep in Path("requirements.txt").read_text("utf-8").splitlines() 52 | ], 53 | scripts=["src/scripts/SEQC"], 54 | extras_require={"GSEA_XML": ["html5lib", "lxml", "BeautifulSoup4"]}, 55 | include_package_data=True, 56 | ) 57 | 58 | # look for star 59 | if not shutil.which("STAR"): 60 | warn("SEQC: STAR is not installed. 
SEQC will not be able to align files.") 61 | 62 | # get location of setup.py 63 | setup_dir = os.path.dirname(os.path.realpath(__file__)) 64 | seqc_dir = os.path.expanduser("~/.seqc/seqc") 65 | 66 | print("setup_dir: {}".format(setup_dir)) 67 | print("seqc_dir: {}".format(seqc_dir)) 68 | 69 | if os.path.isdir(seqc_dir): 70 | shutil.rmtree(seqc_dir) 71 | 72 | 73 | def ignore_test_and_tools(dir_, files): 74 | """Filter files to be moved by shutil.copytree. Ignore any hidden file and the 75 | test and tools directories, which are not needed by the remote instance. 76 | :param dir_: dummy variable, must be present to be passed to shutil.copytree() 77 | :param files: output of os.listdir(), files to be subjected to filtering 78 | :return list: list of files that should be filtered, and not copied. 79 | """ 80 | return [f for f in files if (f == "test" or f.startswith("."))] 81 | 82 | 83 | # install tools and a local copy of seqc. 84 | # copy seqc repository 85 | shutil.copytree(setup_dir, seqc_dir, ignore=ignore_test_and_tools) 86 | shutil.make_archive(base_name=seqc_dir, format="gztar", root_dir=seqc_dir) 87 | -------------------------------------------------------------------------------- /src/scripts/SEQC: -------------------------------------------------------------------------------- 1 | #!/usr/local/python3 2 | 3 | import sys 4 | from seqc.core.main import main 5 | 6 | if __name__ == "__main__": 7 | main(sys.argv[1:]) 8 | -------------------------------------------------------------------------------- /src/seqc/__init__.py: -------------------------------------------------------------------------------- 1 | from .h5 import H5 2 | from .version import __version__ 3 | from . import stats 4 | # from . import plot 5 | -------------------------------------------------------------------------------- /src/seqc/alignment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/alignment/__init__.py -------------------------------------------------------------------------------- /src/seqc/alignment/sam.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from subprocess import Popen, PIPE 3 | import shutil 4 | import gzip 5 | 6 | 7 | def get_version(): 8 | 9 | proc = Popen(["samtools", "--version"], stderr=PIPE, stdout=PIPE) 10 | out, err = proc.communicate() 11 | if err: 12 | raise ChildProcessError(err) 13 | 14 | # e.g. 15 | # samtools 1.9 16 | # Using htslib 1.9 17 | # Copyright (C) 2018 Genome Research Ltd. 
18 | # --> 'samtools 1.9' 19 | version = out.decode().strip().split("\n")[0] 20 | 21 | # --> '1.9' 22 | version = version.split(" ")[1] 23 | 24 | return version 25 | 26 | 27 | class SamRecord: 28 | """Simple record object allowing access to Sam record properties""" 29 | 30 | __slots__ = ["_record", "_parsed_name_field"] 31 | 32 | NameField = namedtuple("NameField", ["pool", "cell", "rmt", "poly_t", "name"]) 33 | 34 | def __init__(self, record): 35 | self._record = record 36 | self._parsed_name_field = None 37 | 38 | def __repr__(self): 39 | return "".format("\t".join(self._record)) 40 | 41 | def __bytes__(self): 42 | return "\t".join(self._record) + "\n" 43 | 44 | @property 45 | def qname(self) -> str: 46 | return self._record[0] 47 | 48 | @property 49 | def flag(self) -> int: 50 | return int(self._record[1]) 51 | 52 | @property 53 | def rname(self) -> str: 54 | return self._record[2] 55 | 56 | @property 57 | def pos(self) -> int: 58 | return int(self._record[3]) 59 | 60 | @property 61 | def mapq(self) -> int: 62 | return int(self._record[4]) 63 | 64 | @property 65 | def cigar(self) -> str: 66 | return self._record[5] 67 | 68 | @property 69 | def rnext(self) -> str: 70 | return self._record[6] 71 | 72 | @property 73 | def pnext(self) -> int: 74 | return int(self._record[7]) 75 | 76 | @property 77 | def tlen(self) -> int: 78 | return int(self._record[8]) 79 | 80 | @property 81 | def seq(self) -> str: 82 | return self._record[9] 83 | 84 | @property 85 | def qual(self) -> str: 86 | return self._record[10] 87 | 88 | @property 89 | def optional_fields(self): 90 | flags_ = {} 91 | for f in self._record[11:]: 92 | k, _, v = f.split(":") 93 | flags_[k] = int(v) 94 | return flags_ 95 | 96 | def _parse_name_field(self): 97 | fields, name = self.qname.split(";") 98 | processed_fields = fields.split(":") 99 | processed_fields.append(name) 100 | self._parsed_name_field = self.NameField(*processed_fields) 101 | 102 | @property 103 | def pool(self) -> str: 104 | try: 105 | return self._parsed_name_field.pool 106 | except AttributeError: 107 | self._parse_name_field() 108 | return self._parsed_name_field.pool 109 | 110 | @property 111 | def rmt(self) -> str: 112 | try: 113 | return self._parsed_name_field.rmt 114 | except AttributeError: 115 | self._parse_name_field() 116 | return self._parsed_name_field.rmt 117 | 118 | @property 119 | def cell(self) -> str: 120 | try: 121 | return self._parsed_name_field.cell 122 | except AttributeError: 123 | self._parse_name_field() 124 | return self._parsed_name_field.cell 125 | 126 | @property 127 | def poly_t(self) -> str: 128 | try: 129 | return self._parsed_name_field.poly_t 130 | except AttributeError: 131 | self._parse_name_field() 132 | return self._parsed_name_field.poly_t 133 | 134 | @property 135 | def name(self): 136 | try: 137 | return self._parsed_name_field.name 138 | except AttributeError: 139 | self._parse_name_field() 140 | return self._parsed_name_field.name 141 | 142 | @property 143 | def is_mapped(self): 144 | return False if (int(self.flag) & 4) else True 145 | 146 | @property 147 | def is_unmapped(self): 148 | return not self.is_mapped 149 | 150 | @property 151 | def is_multimapped(self): 152 | return True if self.optional_fields["NH"] > 1 else False 153 | 154 | @property 155 | def is_uniquely_mapped(self): 156 | return True if self.optional_fields["NH"] == 1 else False 157 | 158 | @property 159 | def strand(self): 160 | minus_strand = int(self.flag) & 16 161 | return "-" if minus_strand else "+" 162 | 163 | # # todo this takes up 66% of the 
processing time for parsing the sam record 164 | # @property 165 | # def dust_low_complexity_score(self) -> int: 166 | # 167 | # # Counts of 3-mers in the sequence 168 | # counts = {} 169 | # for i in range(len(self.seq) - 2): 170 | # kmer = self.seq[i:i + 3] 171 | # counts[kmer] = counts.get(kmer, 0) + 1 172 | # 173 | # # Calculate dust score # todo this is 30% faster when vectorized 174 | # score = sum([i * (i - 1) / 2 for i in counts.values()]) / (len(self.seq) - 3) 175 | # 176 | # # Scale score (Max score possible is no. of 3mers/2) 177 | # score = int(score / ((len(self.seq) - 2) / 2) * 100) 178 | # 179 | # return score 180 | 181 | 182 | class Reader: 183 | """Simple sam reader, optimized for utility rather than speed""" 184 | 185 | def __init__(self, samfile: str): 186 | """ 187 | :param samfile: str, location of a .sam file 188 | 189 | usage: 190 | if rd = Reader(samfile) 191 | :method __iter__: iterate over the .sam file's records (also usable in for loop) 192 | :method __len__: return the number of alignments in the file 193 | :method itermultialignments: return tuples of multiple alignments, all from the 194 | same fastq record 195 | """ 196 | 197 | self._samfile = samfile 198 | try: 199 | samfile_iterator = iter(self) 200 | next(samfile_iterator) 201 | except RuntimeError as ex: 202 | raise ex 203 | except: 204 | raise ValueError( 205 | "%s is an invalid samfile. Please check file formatting." % samfile 206 | ) 207 | 208 | @property 209 | def samfile(self): 210 | return self._samfile 211 | 212 | def _open(self): 213 | """ 214 | seamlessly open self._samfile, whether gzipped or uncompressed 215 | :returns: open file object 216 | """ 217 | if self.samfile.endswith(".gz"): 218 | fobj = gzip.open(self.samfile, "rb") 219 | elif self.samfile.endswith(".bam"): 220 | if not shutil.which("samtools"): 221 | raise RuntimeError("samtools utility must be installed to run bamfiles") 222 | p = Popen(["samtools", "view", self.samfile], stdout=PIPE) 223 | fobj = p.stdout 224 | else: 225 | fobj = open(self.samfile, "rb") 226 | return fobj 227 | 228 | def __len__(self): 229 | return sum(1 for _ in self) 230 | 231 | def __iter__(self): 232 | """return an iterator over all non-header records in samfile""" 233 | fobj = self._open() 234 | try: 235 | for line in fobj: 236 | line = line.decode() 237 | # todo move this if statement to execute only until header is exhausted 238 | if line.startswith("@"): 239 | continue 240 | yield SamRecord(line.strip().split("\t")) 241 | finally: 242 | fobj.close() 243 | 244 | def iter_multialignments(self): 245 | """yields tuples of all alignments for each fastq record""" 246 | sam_iter = iter(self) 247 | fq = [next(sam_iter)] 248 | for record in sam_iter: 249 | if record.qname == fq[0].qname: 250 | fq.append(record) 251 | else: 252 | yield tuple(fq) 253 | fq = [record] 254 | yield tuple(fq) 255 | -------------------------------------------------------------------------------- /src/seqc/alignment/star.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from multiprocessing import cpu_count 3 | from os import makedirs 4 | import shlex 5 | 6 | 7 | def get_version(): 8 | 9 | proc = Popen(["STAR", "--version"], stderr=PIPE, stdout=PIPE) 10 | out, err = proc.communicate() 11 | if err: 12 | raise ChildProcessError(err) 13 | 14 | version = out.decode().strip() 15 | 16 | if version.startswith("STAR_"): 17 | # e.g. 
STAR_2.5.3a 18 | # --> 2.5.3a 19 | return out.decode().strip().split("_")[1] 20 | else: 21 | # e.g. 2.7.3a 22 | return version 23 | 24 | 25 | def default_alignment_args( 26 | fastq_records: str, n_threads: int or str, index: str, output_dir: str 27 | ) -> dict: 28 | """default arguments for STAR alignment 29 | 30 | To report unaligned reads, add '--outSAMunmapped': 'Within', 31 | 32 | :param fastq_records: str, name of fastq file 33 | :param n_threads: int or str, number of threads to allocate when calling STAR 34 | :param index: str, location of the STAR index 35 | :param output_dir: str, prefix for output files 36 | :return: dict, default alignment arguments 37 | """ 38 | default_align_args = { 39 | "--runMode": "alignReads", 40 | "--runThreadN": str(n_threads), 41 | "--genomeDir": index, 42 | "--outFilterType": "BySJout", 43 | "--outFilterMultimapNmax": "10", # require unique alignments 44 | "--limitOutSJcollapsed": "2000000", # deal with many splice variants 45 | "--alignSJDBoverhangMin": "8", 46 | "--outFilterMismatchNoverLmax": "0.04", 47 | "--alignIntronMin": "20", 48 | "--alignIntronMax": "1000000", 49 | "--readFilesIn": fastq_records, 50 | "--outSAMprimaryFlag": "AllBestScore", # all equal-scoring reads are primary 51 | "--outSAMtype": "BAM Unsorted", 52 | "--outFileNamePrefix": output_dir, 53 | } 54 | if fastq_records.endswith(".gz"): 55 | default_align_args["--readFilesCommand"] = "gunzip -c" 56 | if fastq_records.endswith(".bz2"): 57 | default_align_args["--readFilesCommand"] = "bunzip2 -c" 58 | return default_align_args 59 | 60 | 61 | def align( 62 | fastq_file: str, 63 | index: str, 64 | n_threads: int, 65 | alignment_dir: str, 66 | reverse_fastq_file: str or bool = None, 67 | **kwargs 68 | ) -> str: 69 | """align a fastq file, or a paired set of fastq files 70 | 71 | :param fastq_file: str, location of a fastq file 72 | :param index: str, folder containing the STAR index 73 | :param n_threads: int, number of parallel alignment processes to spawn 74 | :param alignment_dir: str, directory for output data 75 | :param reverse_fastq_file: optional, location of reverse paired-end fastq file 76 | :param kwargs: additional kwargs for STAR, passed without the leading '--' 77 | :return: str, .sam file location 78 | """ 79 | 80 | runtime_args = default_alignment_args(fastq_file, n_threads, index, alignment_dir) 81 | 82 | for k, v in kwargs.items(): # overwrite or add any arguments passed from cmdline 83 | if not isinstance(k, str): 84 | try: 85 | k = str(k) 86 | except ValueError: 87 | raise ValueError("arguments passed to STAR must be strings") 88 | if not isinstance(v, str): 89 | try: 90 | v = str(v) 91 | except ValueError: 92 | raise ValueError("arguments passed to STAR must be strings") 93 | runtime_args["--" + k] = v 94 | 95 | # construct command line arguments for STAR 96 | cmd = ["STAR"] 97 | if reverse_fastq_file: 98 | for key, value in runtime_args.items(): 99 | if key == "--readFilesIn": 100 | cmd.extend((key, value)) 101 | cmd.append(reverse_fastq_file) 102 | else: 103 | cmd.extend((key, value)) 104 | else: 105 | for pair in runtime_args.items(): 106 | cmd.extend(pair) 107 | 108 | cmd = shlex.split(" ".join(cmd)) 109 | aln = Popen(cmd, stderr=PIPE, stdout=PIPE) 110 | _, err = aln.communicate() 111 | if err: 112 | raise ChildProcessError(err) 113 | 114 | return alignment_dir + "Aligned.out.bam" 115 | 116 | 117 | def create_index( 118 | fasta: str, gtf: str, genome_dir: str, read_length: int = 75, **kwargs 119 | ) -> None: 120 | """Create a new STAR index 121 | 122 | :param 
fasta: complete filepath to fasta file 123 | :param gtf: complete filepath to gtf file 124 | :param genome_dir: directory in which new index should be constructed 125 | :param read_length: length of reads that will be aligned against this index 126 | :param kwargs: additional keyword arguments to pass to the genome construction call. 127 | to pass --sjdbFileChrStartEnd filename, pass sjdbFileChrStartEnd=filename (no --) 128 | :return: None 129 | """ 130 | ncpu = str(cpu_count()) 131 | makedirs(genome_dir, exist_ok=True) 132 | overhang = str(read_length - 1) 133 | 134 | # Popen is hard to work as far as process substitution is concerned. 135 | # let's just gunzip it before passing to STAR. 136 | if fasta.endswith(".gz"): 137 | proc_gunzip = Popen(["gunzip", fasta]) 138 | out, err = proc_gunzip.communicate() 139 | if err: 140 | raise ChildProcessError(err) 141 | fasta = fasta.replace(".gz", "") 142 | 143 | cmd = [ 144 | "STAR", 145 | "--runMode", 146 | "genomeGenerate", 147 | "--runThreadN", 148 | ncpu, 149 | "--genomeDir", 150 | genome_dir, 151 | "--genomeFastaFiles", 152 | fasta, 153 | "--sjdbGTFfile", 154 | gtf, 155 | "--sjdbOverhang", 156 | overhang, 157 | ] 158 | 159 | for k, v in kwargs.items(): 160 | cmd.append("--{}".format(k)) 161 | cmd.append(v) 162 | 163 | p = Popen(cmd, stderr=PIPE, stdout=PIPE) 164 | out, err = p.communicate() 165 | if err: 166 | raise ChildProcessError(err) 167 | -------------------------------------------------------------------------------- /src/seqc/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .progress import progress 2 | from .run import run 3 | from .index import index 4 | from .instances import instances 5 | from .terminate import terminate 6 | from .start import start 7 | from .notebook import notebook -------------------------------------------------------------------------------- /src/seqc/core/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from seqc import io 3 | 4 | 5 | def s3_data(files_or_links, output_prefix): 6 | """downloads any data provided by s3 links, otherwise gets list of files. 7 | 8 | :param list files_or_links: str files or str s3 links to files 9 | :param str output_prefix: prefix to prepend files 10 | :returns list files: filename(s) of downloaded files 11 | """ 12 | files = [] 13 | for f in files_or_links: 14 | if not f.startswith("s3://"): 15 | if f.endswith("/"): 16 | files.extend(f + subfile for subfile in os.listdir(f)) 17 | else: 18 | files.append(f) 19 | else: 20 | recursive = True if f.endswith("/") else False 21 | files.extend( 22 | io.S3.download(f, output_prefix, overwrite=True, recursive=recursive) 23 | ) 24 | return files 25 | -------------------------------------------------------------------------------- /src/seqc/core/index.py: -------------------------------------------------------------------------------- 1 | def index(args): 2 | """create an index for SEQC. 3 | 4 | :param args: parsed arguments. 
This function is only called if subprocess_name is 5 | 'index' 6 | """ 7 | 8 | # functions to be pickled and run remotely must import all their own modules 9 | import sys 10 | import logging 11 | from seqc import ec2, log, io 12 | from seqc.sequence.index import Index 13 | from seqc.alignment import star 14 | from seqc import version 15 | 16 | logging.basicConfig( 17 | level=logging.DEBUG, 18 | handlers=[ 19 | logging.FileHandler(args.log_name), 20 | logging.StreamHandler(sys.stdout), 21 | ], 22 | ) 23 | 24 | log.info("SEQC=v{}".format(version.__version__)) 25 | log.info("STAR=v{}".format(star.get_version())) 26 | log.args(args) 27 | 28 | with ec2.instance_clean_up( 29 | email=args.email, 30 | upload=args.upload_prefix, 31 | log_name=args.log_name, 32 | debug=args.debug, 33 | terminate=args.terminate, 34 | running_remote=args.remote, 35 | ): 36 | 37 | idx = Index(args.organism, args.ids, args.folder) 38 | idx.create_index( 39 | s3_location=args.upload_prefix, 40 | ensemble_release=args.ensemble_release, 41 | read_length=args.read_length, 42 | valid_biotypes=args.valid_biotypes, 43 | ) 44 | 45 | # upload the log file (seqc_log.txt, nohup.log, Log.out) 46 | if args.upload_prefix: 47 | bucket, key = io.S3.split_link(args.upload_prefix) 48 | for item in [args.log_name, "./nohup.log", "./Log.out"]: 49 | try: 50 | ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key) 51 | log.info( 52 | "Successfully uploaded {} to {}".format( 53 | item, args.upload_prefix 54 | ) 55 | ) 56 | except FileNotFoundError: 57 | log.notify( 58 | "Item {} was not found! Continuing with upload...".format(item) 59 | ) 60 | 61 | log.info("DONE.") 62 | -------------------------------------------------------------------------------- /src/seqc/core/instances.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | 4 | 5 | def instances(args): 6 | """list instances and return 7 | 8 | :param args: namespace object from argparse, must contain args.rsa_key, the path to 9 | the rsa-key used to start the instances you want to list 10 | :return None: 11 | """ 12 | 13 | if args.rsa_key is None: 14 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 15 | if not os.path.isfile(args.rsa_key): 16 | raise ValueError('-k/--rsa-key does not point to a valid file object. 
') 17 | 18 | keyname = args.rsa_key.rpartition('.')[0].rpartition('/')[-1] 19 | 20 | ec2 = boto3.resource('ec2') 21 | all_instances = ec2.instances.filter( 22 | Filters=[ 23 | {'Name': 'key-name', 24 | 'Values': [keyname]}]) 25 | for i in all_instances.all(): 26 | print('id: %s, type: %s, launch-time: %s, state: %s, ip %s' % ( 27 | i.id, i.instance_type, str(i.launch_time), i.state, i.public_ip_address)) 28 | -------------------------------------------------------------------------------- /src/seqc/core/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import sys 4 | from seqc import core 5 | from seqc.core import parser, verify 6 | from seqc import ec2 7 | import boto3 8 | 9 | 10 | def clean_up_security_groups(): 11 | """ 12 | Clean up all the unused security groups that were created/started using SEQC, 13 | once the number of unused ones reaches 100 14 | """ 15 | ec2 = boto3.resource("ec2") 16 | sgs = list(ec2.security_groups.all()) 17 | insts = list(ec2.instances.all()) 18 | 19 | all_sgs = set([sg.group_name for sg in sgs]) # get all security groups 20 | all_inst_sgs = set( 21 | [sg["GroupName"] for inst in insts for sg in inst.security_groups] 22 | ) # get security groups associated with instances 23 | unused_sgs = all_sgs - all_inst_sgs # get ones without instance association 24 | 25 | if len(unused_sgs) >= 100: 26 | print("Cleaning up the unused security groups:") 27 | client = boto3.client("ec2") 28 | for g in unused_sgs: 29 | all_inst_sgs = set( 30 | [sg["GroupName"] for inst in insts for sg in inst.security_groups] 31 | ) # deletion takes a while, so re-check which groups are still in use 32 | if g.startswith("SEQC") and ( 33 | g not in all_inst_sgs 34 | ): # only delete groups that were created by SEQC and are still unused 35 | client.delete_security_group(GroupName=g) 36 | print(g + " deleted") 37 | 38 | 39 | def main(argv): 40 | """Check arguments, then call the appropriate sub-module 41 | 42 | Created to allow the main pipeline to be tested from the earliest entry point 43 | (command-line arguments). 44 | 45 | :param argv: output of sys.argv[1:] 46 | """ 47 | arguments = parser.parse_args(argv) 48 | 49 | func = getattr(core, arguments.subparser_name) 50 | assert func is not None 51 | 52 | # notebooks execute locally 53 | if arguments.subparser_name == "notebook": 54 | return func(arguments) 55 | 56 | if arguments.remote: 57 | # todo improve how verification works; it's not really necessary, what is needed 58 | # is a method to determine volume size for remote.
59 | verification_func = getattr(verify, arguments.subparser_name) 60 | verified_args = verification_func(arguments) 61 | remote_args = { 62 | k: getattr(verified_args, k) 63 | for k in ( 64 | "rsa_key", 65 | "instance_type", 66 | "spot_bid", 67 | "volume_size", 68 | "user_tags", 69 | "remote_update", 70 | "ami_id", 71 | ) 72 | if getattr(verified_args, k) 73 | } 74 | 75 | # store the command-line arguments supplied by the user 76 | # the same arguments will be used to run SEQC on EC2 77 | remote_args["argv"] = argv 78 | 79 | # clean up AWS security groups 80 | clean_up_security_groups() 81 | 82 | # start EC2 instance and run the function 83 | ec2.AWSInstance(synchronous=False, **remote_args)(func)(verified_args) 84 | else: 85 | # run the function locally 86 | func(arguments) 87 | 88 | 89 | if __name__ == "__main__": 90 | main(sys.argv[1:]) 91 | -------------------------------------------------------------------------------- /src/seqc/core/notebook.py: -------------------------------------------------------------------------------- 1 | from seqc.notebooks.notebooks import Notebook 2 | from seqc import log 3 | 4 | 5 | def notebook(args): 6 | if args.subsubparser_name == 'merge': 7 | # need to also take an output directory because this step writes files. 8 | # then merge the things 9 | # then return? 10 | n = Notebook(args.output_filename, *args.input_data) 11 | n.merge_data(merged_sample_name=args.output_filename) 12 | log.info('Merged samples written to %s' % args.input_data) 13 | elif args.subsubparser_name == 'generate': 14 | n = Notebook(args.output_stem, args.input_count_matrix) 15 | n.write_template() 16 | log.info('Notebook Template written to %s' % n.notebook_path) 17 | n.run_notebook() 18 | log.info('Notebook Run and written to %s' % n.notebook_path) 19 | 20 | -------------------------------------------------------------------------------- /src/seqc/core/progress.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from seqc import ec2 3 | from paramiko.ssh_exception import AuthenticationException 4 | from botocore.exceptions import ClientError 5 | 6 | 7 | def progress(args): 8 | """print progress of requested seqc run(s) to less 9 | 10 | :param args: namespace object from argparse, must include rsa-key and instance-id 11 | :return None: 12 | """ 13 | if args.rsa_key is None: 14 | raise ValueError('User must supply -k/--rsa-key or set the environment variable ' 15 | 'AWS_RSA_KEY') 16 | 17 | if args.instance_ids is None: 18 | raise ValueError('No instances specified. Please supply an instance using the -i ' 19 | 'parameter.') 20 | 21 | for id_ in args.instance_ids: 22 | connection = ec2.SSHConnection(id_, args.rsa_key) 23 | try: 24 | out, err = connection.execute('cat ./seqc_log.txt') 25 | except AuthenticationException: 26 | raise ValueError('instance %s cannot be found.' % repr(id_)) 27 | except ClientError: 28 | raise ValueError('instance %s cannot be found.' % repr(id_)) 29 | p = Popen(['less'], stdin=PIPE) 30 | p.communicate(input='\n'.join(out).encode()) 31 | -------------------------------------------------------------------------------- /src/seqc/core/start.py: -------------------------------------------------------------------------------- 1 | from seqc import ec2 2 | import os 3 | 4 | 5 | def start(args): 6 | """start an aws instance""" 7 | 8 | if args.rsa_key is None: 9 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 10 | if not os.path.isfile(args.rsa_key): 11 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 12 | 13 | instance = ec2.AWSInstance( 14 | rsa_key=args.rsa_key, 15 | instance_type=args.instance_type, 16 | spot_bid=args.spot_bid, 17 | volume_size=args.volume_size, 18 | ami_id=args.ami_id, 19 | ) 20 | 21 | instance.start() 22 | -------------------------------------------------------------------------------- /src/seqc/core/terminate.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.exceptions import ClientError 3 | 4 | 5 | def terminate(args): 6 | """terminate the requested ec2 instance(s) 7 | 8 | :param args: namespace object from argparse, must include instance-ids 9 | :return None: 10 | """ 11 | ec2 = boto3.resource("ec2") 12 | for id_ in args.instance_ids: 13 | instance = ec2.Instance(id=id_) 14 | try: 15 | response = instance.terminate() 16 | print("termination signal sent:\n%s" % response) 17 | except ClientError: 18 | print("instance %s does not exist" % id_) 19 | -------------------------------------------------------------------------------- /src/seqc/core/verify.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import inspect 4 | from math import ceil 5 | from seqc import io, platforms, ec2 6 | 7 | 8 | def filesize(filename): 9 | """return filesize of filename in bytes 10 | 11 | :param str filename: full path to file 12 | :return int: number of bytes in filename 13 | """ 14 | return os.stat(filename).st_size 15 | 16 | 17 | def validate_and_return_size(filename): 18 | """return the size of the file or directory that a filepath or s3 link points to 19 | 20 | :param str filename: filepath or s3 link 21 | :return int: size in bytes; raises errors if the path or link is invalid. 22 | """ 23 | if filename.startswith("s3://"): 24 | io.S3.check_links([filename]) 25 | return io.S3.obtain_size(filename) 26 | else: 27 | if os.path.isfile(filename): 28 | return filesize(filename) 29 | elif os.path.isdir(filename.rstrip("/")): 30 | return sum(filesize(filename + f) for f in os.listdir(filename)) 31 | else: 32 | print(filename) 33 | raise ValueError("%s does not point to a valid file" % filename) 34 | 35 | 36 | def estimate_required_volume_size(args): 37 | """estimate the size of volume that should be attached to an aws instance to run SEQC 38 | 39 | :param args: namespace object containing filepaths or download links to input data 40 | :return int: size of volume in gb 41 | """ 42 | # using worst-case estimates to make sure we don't run out of space, 35 = genome index 43 | total = (35 * 1e10) + sum(validate_and_return_size(f) for f in args.barcode_files) 44 | 45 | # todo stopped here; remove aws dependency 46 | if args.barcode_fastq and args.genomic_fastq: 47 | total += ( 48 | sum(validate_and_return_size(f) for f in args.barcode_fastq) * 14 + 9e10 49 | ) 50 | total += ( 51 | sum(validate_and_return_size(f) for f in args.genomic_fastq) * 14 + 9e10 52 | ) 53 | total += validate_and_return_size(args.index) 54 | 55 | elif args.alignment_file: 56 | total += (validate_and_return_size(args.alignment_file) * 2) + 4e10 57 | total += validate_and_return_size(args.index) 58 | 59 | elif args.merged_fastq: 60 | total += (validate_and_return_size(args.merged_fastq) * 13) + 9e10 61 | total += validate_and_return_size(args.index) 62 | 63 | elif args.read_array: 64 | total += validate_and_return_size(args.read_array) 65 | 66 | if args.basespace: 67 | if not args.basespace_token or args.basespace_token == "None": 68 | raise ValueError( 69 | "If the --basespace argument is used, the basespace token must be " 70 | "specified in the seqc config file or passed as --basespace-token" 71 | ) 72 | 73 | io.BaseSpace.check_sample(args.basespace, args.basespace_token) 74 | total += ( 75 | io.BaseSpace.check_size(args.basespace, args.basespace_token) * 14 + 9e10 76 | ) 77 | 78 | return ceil(total * 1e-9) 79 | 80 | 81 | def run(args): 82 | """ 83 | verifies data input through the command line arguments, fixes minor issues, and 84 | throws exceptions if invalid parameters are encountered 85 | 86 | additionally, this function obtains a rough estimate of how much 87 | volume storage is needed for a remote run. 88 | 89 | :param Namespace args: Namespace object, output from ArgumentParser.parse_args() 90 | :returns args: Namespace object with volume_size (in GB) set for a remote run. 91 | """ 92 | 93 | if args.rsa_key is None: 94 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 95 | if not os.path.isfile(args.rsa_key): 96 | raise ValueError("-k/--rsa-key does not point to a valid file object. 
") 97 | 98 | if args.output_prefix.endswith("/"): 99 | raise ValueError("output_stem should not be a directory.") 100 | if not args.index.endswith("/"): 101 | raise ValueError('index must be a directory, and must end with "/"') 102 | 103 | # check platform name; raises ValueError if invalid 104 | platform_name(args.platform) 105 | 106 | # check to make sure that --email-status is passed with remote run 107 | if args.remote and not args.email: 108 | raise ValueError("Please supply the --email-status flag for a remote SEQC run.") 109 | # if args.instance_type not in ['c3', 'c4', 'r3']: # todo fix this instance check 110 | # raise ValueError('All AWS instance types must be either c3, c4, or r3.') 111 | # if args.terminate not in ['True', 'true', 'False', 'false', 'on-success']: 112 | # raise ValueError('the --no-terminate flag must be either True, False, ' 113 | # 'or on-success.') 114 | 115 | # make sure at least one input has been passed 116 | valid_inputs = ( 117 | args.barcode_fastq, 118 | args.genomic_fastq, 119 | args.merged_fastq, 120 | args.alignment_file, 121 | args.basespace, 122 | args.read_array, 123 | ) 124 | if not any(valid_inputs): 125 | raise ValueError( 126 | "At least one input argument (-b/-g, -m, -s, -r, --basespace) must be passed " 127 | "to SEQC." 128 | ) 129 | if not args.barcode_files: # todo clean this up and fold into platform somehow 130 | if args.platform != "drop_seq": 131 | raise ValueError("--barcode-files is required for this platform.") 132 | 133 | # make sure at most one input type has been passed 134 | num_inputs = 0 135 | if args.barcode_fastq or args.genomic_fastq: 136 | if not all((args.barcode_fastq, args.genomic_fastq)): 137 | raise ValueError( 138 | "if either genomic or barcode fastq are provided, both must be provided" 139 | ) 140 | num_inputs += 1 141 | num_inputs += sum( 142 | 1 143 | for i in ( 144 | args.merged_fastq, 145 | args.alignment_file, 146 | args.basespace, 147 | args.read_array, 148 | ) 149 | if i 150 | ) 151 | if num_inputs > 1: 152 | raise ValueError( 153 | "user should provide at most one input argument (-b/-g, -m, -s, -r, " 154 | "--basespace" 155 | ) 156 | 157 | # if basespace is being used, make sure there is a valid basespace token 158 | if args.basespace and not hasattr(args, "basespace_token"): 159 | raise RuntimeError( 160 | "if --basespace input is selected, user must provide an OAuth " 161 | "token using the --basespace-token parameter." 162 | ) 163 | 164 | # check that spot-bid is correct 165 | if args.spot_bid is not None: 166 | if args.spot_bid < 0: 167 | raise ValueError("bid %f must be a non-negative float." % args.spot_bid) 168 | 169 | if args.upload_prefix and not args.upload_prefix.startswith("s3://"): 170 | raise ValueError("upload_prefix should be an s3 address beginning with s3://") 171 | 172 | if args.upload_prefix.startswith("s3://"): 173 | ec2.check_bucket(args.upload_prefix) 174 | 175 | if args.volume_size is None: 176 | setattr(args, "volume_size", estimate_required_volume_size(args)) 177 | 178 | return args 179 | 180 | 181 | def index(args): 182 | """add a default volume_size if it was not otherwise passed to seqc. 183 | 184 | :param args: namespace object from argparse 185 | :return: updated namespace object with volume_size set. 186 | """ 187 | if args.volume_size is None: 188 | setattr(args, "volume_size", 100) 189 | return args 190 | 191 | 192 | def executables(*execs): 193 | """ 194 | checks whether executables are installed on the machine of the 195 | current seqc run. 
196 | 197 | :param execs: Tuple of executables to check 198 | :returns : Tuple of boolean (True if a specific executable is installed). 199 | """ 200 | return tuple(map(lambda exe: shutil.which(exe) is not None, execs)) 201 | 202 | 203 | def platform_name(name: str): 204 | """ 205 | checks whether the platform name supplied by the user is supported by the current 206 | iteration of seqc. 207 | :param name: string of platform name to check 208 | :return: name (if supported by seqc). 209 | """ 210 | choices = [ 211 | x[0] 212 | for x in inspect.getmembers(platforms, inspect.isclass) 213 | if issubclass(x[1], platforms.AbstractPlatform) 214 | ][1:] 215 | if name not in choices: 216 | raise ValueError( 217 | "Please specify a valid platform name for SEQC. The available " 218 | "options are: {}".format(choices) 219 | ) 220 | # throw error for mars1_seq since we don't have the appropriate primer length yet 221 | if name == "mars1_seq": 222 | raise ValueError("Mars1-seq is currently not stable in this version of SEQC.") 223 | return name 224 | -------------------------------------------------------------------------------- /src/seqc/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def jsd(p, q) -> float: 5 | """Jensen Shannon distance of two variables normalized variables p and q 6 | 7 | Note that if p and q are not normalized, this function will not return a proper 8 | distance, so matrices should be normalized prior to use 9 | 10 | use with sklearn.NearestNeighbors: 11 | 12 | >>> from sklearn.neighbors import NearestNeighbors 13 | >>> # set some dummy variables 14 | >>> data = np.random.random((100, 100)) 15 | >>> data = data / data.sum(axis=1)[:, np.newaxis] # norm rows 16 | >>> assert(np.all(np.array(data.sum(axis=1) == 1)))3 17 | >>> k = 10 18 | >>> 19 | >>> nn = NearestNeighbors(k=k, metric='pyfunc', algorithm='ball_tree', 20 | >>> metric_params={'func': jsd}) 21 | >>> nn.fit(data) 22 | 23 | Parameters 24 | ---------- 25 | p, q : np.array 26 | 27 | Returns 28 | ------- 29 | float : kl divergence between p and q 30 | """ 31 | idx = np.logical_or(p != 0, q != 0) 32 | p = p[idx] 33 | q = q[idx] 34 | m = (p + q) / 2 35 | return np.sqrt((.5 * kldiv(p, m)) + (.5 * kldiv(q, m))) 36 | 37 | 38 | def kldiv(x: np.ndarray, m: np.ndarray) -> float: 39 | """Modified Kullback-Liebler divergence of two variables x and m. 40 | 41 | depends upon normalization done by jsd parent function, namely that (1) there are no 42 | zero-valued entries in m, and (2) both x and m are probability distributions that 43 | sum to 1 44 | 45 | Parameters 46 | ---------- 47 | x, m : normalized probability vectors 48 | 49 | Returns 50 | ------- 51 | float : kl divergence between p and q 52 | """ 53 | return np.nansum(x * np.log2(x / m)) 54 | -------------------------------------------------------------------------------- /src/seqc/email_.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | import os 3 | 4 | 5 | def email_user(attachment: str, email_body: str, email_address: str) -> None: 6 | """ 7 | sends an email to email address with text contents of email_body and attachment 8 | attached. 
Email will come from "ec2-User@ 9 | 10 | :param attachment: the file location of the attachment to append to the email 11 | :param email_body: text to send in the body of the email 12 | :param email_address: the address to which the email should be sent""" 13 | 14 | # todo if remote is sending double emails, add quotes around attachment. 15 | if isinstance(email_body, str): 16 | email_body = email_body.encode() 17 | email_args = ['mutt', '-e', 'set content_type="text/html"', '-a', attachment, '-s', 18 | 'Remote Process', '--', email_address] 19 | email_process = Popen(email_args, stdin=PIPE) 20 | email_process.communicate(email_body) 21 | -------------------------------------------------------------------------------- /src/seqc/exceptions.py: -------------------------------------------------------------------------------- 1 | class RetryLimitExceeded(Exception): 2 | pass 3 | 4 | 5 | class InstanceNotRunningError(Exception): 6 | pass 7 | 8 | 9 | class EC2RuntimeError(Exception): 10 | pass 11 | 12 | 13 | class ConfigurationError(Exception): 14 | pass 15 | 16 | 17 | class ArgumentParserError(Exception): 18 | pass 19 | 20 | 21 | class EmptyMatrixError(Exception): 22 | pass 23 | -------------------------------------------------------------------------------- /src/seqc/h5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | 6 | class H5: 7 | 8 | def __init__(self, archive_name: str): 9 | """Wrapper for the pandas HDFStore class which ensures that all interactions with 10 | the archive result in a closed, flushed archive. 11 | 12 | In order to ensure data usability, all data must be submitted in DataFrame format. 13 | This decision was made to encourage users to pair metadata with sequencing data, 14 | and reduce the incidence of unexpected data permutation. 15 | 16 | :param archive_name: name of the h5 archive to open. If the archive does not exist 17 | it will be created using a blosc5 filter 18 | 19 | :method ls: list contents of the archive 20 | :method save: save an object to the h5 archive 21 | :method load: load an object from the archive 22 | :method remove: remove a DataFrame from the archive 23 | :method is_open: returns True if the h5 archive is open, else False 24 | """ 25 | if os.path.isfile(archive_name): 26 | self._archive = pd.HDFStore(archive_name, mode='a') 27 | self._archive.close() 28 | else: 29 | self._archive = pd.HDFStore( 30 | archive_name, mode='a', complib='blosc', complevel=5) 31 | self._archive.close() 32 | 33 | def __repr__(self): 34 | self._archive.open() 35 | try: 36 | return repr(self._archive) 37 | finally: 38 | self._archive.close() 39 | 40 | def save(self, data: pd.DataFrame, location: str) -> None: 41 | """Save DataFrame data to the h5 archive in location. 42 | 43 | :param data: DataFrame object to store 44 | :param location: filepath to save the object in the h5 hierarchy 45 | """ 46 | if not isinstance(data, pd.DataFrame): 47 | if isinstance(data, np.ndarray): 48 | res = input('np.ndarray class detected. Save as pd.DataFrame with ' 49 | 'ascending integer indices? [y/n] ') 50 | if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 51 | data = pd.DataFrame(data) 52 | else: 53 | print('User elected not to save DataFrame, archive is unmodified.') 54 | return 55 | else: 56 | raise TypeError('only pd.DataFrame objects can be saved using this ' 57 | 'class. 
To save np.ndarray objects please see the tables ' 58 | 'package.') 59 | self._archive.open() 60 | try: 61 | self._archive[location] = data 62 | finally: 63 | self._archive.close() 64 | 65 | def load(self, location: str) -> None: 66 | """Load and return the dataframe found at location in the archive. 67 | 68 | :param location: str, location of object to retrieve from h5 69 | :return: pd.DataFrame, object found at location 70 | """ 71 | self._archive.open() 72 | try: 73 | return self._archive[location] 74 | finally: 75 | self._archive.close() 76 | 77 | def ls(self) -> None: 78 | """list archive contents""" 79 | try: 80 | self._archive.open() 81 | print(self._archive) 82 | finally: 83 | self._archive.close() 84 | 85 | def remove(self, location: str) -> None: 86 | """remove the DataFrame at location from the archive 87 | 88 | Note: removing a dataframe at a branch node will remove all leaves sharing this 89 | prefix. e.g. in an archive containing: 90 | 91 | /data 92 | /data/filtered 93 | /data/metadata 94 | /new_data/data 95 | 96 | removing /data would remove the first three DataFrame objects from the archive. 97 | 98 | :param location: location of DataFrame to remove 99 | :return: None 100 | """ 101 | 102 | self._archive.open() 103 | try: 104 | if location not in self._archive.keys(): 105 | raise ValueError( 106 | '{} not contained in archive, nothing to remove.'.format(location)) 107 | else: 108 | removed = [k for k in self._archive.keys() 109 | if k.startswith(location + '/')] 110 | if len(removed) != 0: 111 | res = input( 112 | 'Removing branch node {}, which is a prefix for {!a} will remove ' 113 | 'all listed DataFrames. Continue with removal? [y/n] '.format( 114 | location, removed)) 115 | if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 116 | print('returned without deletion.') 117 | return 118 | self._archive.remove(location) 119 | finally: 120 | self._archive.close() 121 | 122 | @property 123 | def is_open(self) -> bool: 124 | return self._archive.is_open 125 | -------------------------------------------------------------------------------- /src/seqc/multialignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools 3 | import time 4 | import seqc 5 | 6 | 7 | class UnionFind: 8 | """Union-find data structure. 9 | 10 | Each unionFind instance X maintains a family of disjoint sets of 11 | hashable objects, supporting the following two methods: 12 | 13 | - X[item] returns a name for the set containing the given item. 14 | Each set is named by an arbitrarily-chosen one of its members; as 15 | long as the set remains unchanged it will keep the same name. If 16 | the item is not yet part of a set in X, a new singleton set is 17 | created for it. 18 | 19 | - X.union(item1, item2, ...) merges the sets containing each item 20 | into a single larger set. If any item is not yet part of a set 21 | in X, it is added to X as one of the members of the merged set. 
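# --- editor's illustrative sketch (not part of multialignment.py) ------------
# The two operations described above, shown on plain integers.
from seqc.multialignment import UnionFind

uf = UnionFind()
uf.union(1, 2)          # 1 and 2 now share a set
uf.union(2, 3)          # 3 joins the same set
assert uf[1] == uf[3]   # same arbitrarily-chosen representative
assert uf[4] != uf[1]   # 4 is placed in its own new singleton set
# ------------------------------------------------------------------------------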
22 | """ 23 | 24 | def __init__(self): 25 | """Create a new empty union-find structure.""" 26 | self.weights = {} 27 | self.parents = {} 28 | 29 | def __getitem__(self, obj): 30 | """Find and return the name of the set containing the object.""" 31 | 32 | # check for previously unknown object 33 | if obj not in self.parents: 34 | self.parents[obj] = obj 35 | self.weights[obj] = 1 36 | return obj 37 | 38 | # find path of objects leading to the root 39 | path = [obj] 40 | root = self.parents[obj] 41 | while root != path[-1]: 42 | path.append(root) 43 | root = self.parents[root] 44 | 45 | # compress the path and return 46 | for ancestor in path: 47 | self.parents[ancestor] = root 48 | return root 49 | 50 | def __iter__(self): 51 | """Iterate through all items ever found or unioned by this structure.""" 52 | return iter(self.parents) 53 | 54 | def union(self, *objects): 55 | """Find the sets containing the objects and merge them all.""" 56 | roots = [self[x] for x in objects] 57 | heaviest = max([(self.weights[r], r) for r in roots])[1] 58 | for r in roots: 59 | if r != heaviest: 60 | self.weights[heaviest] += self.weights[r] 61 | self.parents[r] = heaviest 62 | 63 | def union_all(self, iterable): 64 | for i in iterable: 65 | self.union(*i) 66 | 67 | def find_all(self, vals): 68 | vals = [self.find_component(v) for v in vals] 69 | unique = set(vals) 70 | reindex = dict(zip(unique, range(len(unique)))) 71 | set_membership = np.array([reindex[v] for v in vals]) 72 | sets = np.array(list(reindex.values())) 73 | return set_membership, sets 74 | 75 | def find_component(self, iterable): 76 | """Return the set that obj belongs to 77 | 78 | If the iterable contains items that have been unioned, then any entry in the 79 | iterable will be sufficient to identify the set that obj belongs to. Use the 80 | first entry, and return the set associated with iterable. 
81 | 82 | If the iterable has not been entered into the structure, this method can yield 83 | incorrect results 84 | """ 85 | return self[next(iter(iterable))] 86 | 87 | 88 | def intersection(set_l): 89 | res = set_l[0] 90 | for s in set_l: 91 | res = set(set(res) & set(s)) 92 | return res 93 | 94 | 95 | # # Some constants 96 | # NO_DISAMBIGUATION = 0 97 | # RESOLVED_GENE = 1 98 | # NO_GENE_RESOLVED = 2 99 | # MULTIPLE_MODELS = 3 100 | 101 | 102 | # def reduce_coalignment_array(arr, threshold = 0.0001): 103 | # res = {} 104 | # for g in arr: 105 | # temp = {} 106 | # for k in arr[g]: 107 | # if arr[g][k] < threshold: 108 | # continue 109 | # temp[tuple(sorted(k))] = arr[g][k] 110 | # if len(temp)>0: 111 | # res[g] = temp 112 | # return res 113 | 114 | # #def strip(genes): 115 | # # return tuple(sorted([int(g[2:]) for g in genes])) 116 | # def strip(genes): 117 | # return tuple(sorted(genes)) 118 | # def strip_model(mod): 119 | # res = {} 120 | # for k in mod: 121 | # res[tuple(sorted(k))]=mod[k] 122 | # return res 123 | 124 | # def split_to_disjoint(obs): 125 | # res = [] 126 | # uf = UnionFind() 127 | # uf.union_all(obs.keys()) 128 | # set_membership, sets = uf.find_all(obs.keys()) 129 | 130 | # for s in sets: 131 | # d = {} 132 | # for k in np.array(list(obs.keys()))[set_membership == s]: 133 | # d[tuple(k)] = obs[tuple(k)] 134 | # res.append(d) 135 | # return res 136 | 137 | # def get_indices(inds, obs_subset): 138 | # res = [] 139 | # for genes in obs_subset: 140 | # res += inds[genes] 141 | # return res 142 | 143 | # def model_to_gene(model): 144 | # for g in model: 145 | # if model[g]==1: 146 | # return g 147 | 148 | 149 | # def get_combinations(l): 150 | # res = [] 151 | # for i in range(len(l)): 152 | # res += itertools.combinations(l,i+1) 153 | # return res 154 | 155 | # # rank the different possible models by their scores 156 | # def best_fit_model(obs_s, coalignment_mat): 157 | # #obs_s = strip_model(obs) 158 | # gene_l = single_gene_list(obs_s) # From the list of observation create a list of unique single genes from which different models can be inferred 159 | 160 | 161 | # if len(obs_s) == 1: 162 | # if len(list(obs_s.keys())[0]) == 1: 163 | # return [{gene_l[0]:1}], NO_DISAMBIGUATION 164 | 165 | # possible_genes = intersection(list(obs_s.keys())) 166 | 167 | # #There is one gene that resolve the disambiguation 168 | # if len(possible_genes) == 1: 169 | # model = {} 170 | # for g in gene_l: 171 | # model[g] = 0 172 | # model[list(possible_genes)[0]] = 1 173 | # return [model], RESOLVED_GENE 174 | 175 | # #There is more than one gene that can explain it, no model can be decided 176 | # if len(possible_genes) > 1: 177 | # return [], NO_GENE_RESOLVED 178 | 179 | # #There are multiple competing models. 
For now we don't decide bewteen them 180 | # return [], MULTIPLE_MODELS 181 | # # mod_score_list = [] 182 | # # for mod in get_combinations(gene_l): 183 | # # model = {} 184 | # # for k in gene_l: 185 | # # if k in mod: 186 | # # model[k] = 1 187 | # # else: 188 | # # model[k] = 0 189 | # # score = model_score(model, obs_s, coalignment_mat) 190 | # # mod_score_list.append((model,score)) 191 | 192 | # #Here to decide if there is one model that's obviously better 193 | # # return mod_score_list, MULTIPLE_MODELS 194 | 195 | # # get a model and returns its likelihood score comparing the expected number of reads and the observed 196 | # # model is basically just a bool dic of all the unique genes with flags of wether or not they're in model 197 | # # observed is a dictionary of all gene combinations and their expected proportion 198 | # # coalignment_mat is the coalignment matrix used to calculate the expected number of reads 199 | # # eg: 200 | # # model - {A:1, B:0} 201 | # # observed - {A: 100 B:50, AB: 30 } 202 | # # 203 | # def model_score(model, observed, coalignment_mat): 204 | # exp = {} 205 | # tot = {} 206 | # for gene in model: 207 | # # patch for SC000 208 | # if gene==0: 209 | # tot[gene] = model[gene]*observed[gene,] 210 | # # Theres a common edge case where a gene A will only be aligned with other genes as well, in this case we update our observation vector to include A:0 211 | # elif (gene, ) not in observed: 212 | # tot[gene] = 0 213 | # elif gene not in coalignment_mat: 214 | # raise KeyError('{} not found in coalignment matrix'.format(gene)) 215 | # elif (gene, ) not in coalignment_mat[gene]: 216 | # tot[gene] = 0 217 | # else: 218 | # tot[gene] = model[gene]*(observed[gene,]/coalignment_mat[gene][gene,]) 219 | 220 | # keys = get_combinations(model.keys()) #get a list of all possible molecule combinations 221 | 222 | # # key is a set of genes and the expected number of reads for it is the sum of expected reads from all genes shared by the key, 223 | # # these in turn are the total reads for a gene (extrapoletaed from the uniqely mapped) multiplied by the coalignment factor (present in the coalignment matrix) 224 | # # e.g. if A has 20% coalignment with B and there are 80 reads mapped uniquely to A, we expect 80/0.8 * 0.2 = 20 reads to be mapped to AB from A (and more from B) 225 | # for k in keys: 226 | # k = tuple(sorted(k)) 227 | # sum = 0 228 | # for gene in k: 229 | # #Patch for SC000 230 | # if gene==0: 231 | # if k==(0,): 232 | # sum=1 233 | # else: 234 | # sum = 0 235 | # ##### 236 | # elif k in coalignment_mat[gene]: 237 | # sum += tot[gene]*coalignment_mat[gene][k] 238 | # exp[k] = sum 239 | 240 | # score = calc_score(observed, exp) 241 | # return score 242 | 243 | # def calc_score(obs, exp): 244 | # sum = 0 245 | # for k in obs: 246 | # if k not in exp: 247 | # print(k) 248 | # k = tuple(sorted(k)) 249 | # print ('bad key') 250 | # diff = (obs[k]-exp[k])**2 251 | # if exp[k]!=0: 252 | # diff /= exp[k] 253 | # sum += diff 254 | # return sum 255 | 256 | # #Get a dictionary of observations per gene/s and return a list of single unique genes 257 | # def single_gene_list(obs): 258 | # l = [] 259 | # for genes in obs: 260 | # for g in genes: 261 | # l.append(g) 262 | # return list(set(l)) 263 | 264 | -------------------------------------------------------------------------------- /src/seqc/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | -------------------------------------------------------------------------------- /src/seqc/notebooks/notebooks.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, FileSystemLoader 2 | import os 3 | import pandas as pd 4 | import tempfile 5 | 6 | import nbformat 7 | from nbconvert.preprocessors import ExecutePreprocessor 8 | 9 | 10 | class Notebook: 11 | 12 | def __init__(self, output_stem: str, *data): 13 | 14 | # strip notebook affix if user provided it; this is a common error mode 15 | if output_stem.endswith('.ipynb'): 16 | output_stem = output_stem.replace('.ipynb', '') 17 | self._output_stem = output_stem 18 | 19 | self._data = data 20 | self._this_dir = os.path.dirname(os.path.abspath(__file__)) 21 | 22 | @property 23 | def notebook_path(self): 24 | return self._output_stem + '.ipynb' 25 | 26 | @property 27 | def merged_data(self): 28 | if isinstance(self._data, str): 29 | if os.path.isfile(self._data): 30 | return os.path.abspath(self._data) 31 | elif isinstance(self._data, (list, tuple)) and isinstance(self._data[0], str): 32 | if os.path.isfile(self._data[0]): 33 | return os.path.abspath(self._data[0]) 34 | raise TypeError('Data is not a 1-length iterable or string that contains a filepath') 35 | 36 | def merge_data(self, merged_sample_name=None, remove_unmerged=False): 37 | """ 38 | This function will merge any datasets provided as nested lists. 39 | Each top-level value is considered an input alias. 40 | Any second-level list is considered to be a group of files to be joined 41 | 42 | :param bool remove_unmerged: if True, this function will delete the unmerged files after 43 | completion 44 | :param str merged_sample_name: name of merged csv file 45 | :return None: The list of merged file names will replace the list passed to the class in 46 | self._datasets 47 | """ 48 | dfs = [pd.read_csv(csv, index_col=0) for csv in self._data] 49 | df = pd.concat( 50 | dfs, 51 | keys=list(range(len(self._data))), 52 | names=['sample_number', 'cell_id'] 53 | ) 54 | 55 | if not merged_sample_name: 56 | merged_sample_name = self._output_stem + '_merged_data.csv' 57 | df.to_csv(merged_sample_name) 58 | 59 | # delete original files, if requested 60 | if remove_unmerged: 61 | for csv in self._data: 62 | os.remove(csv) 63 | 64 | # update file urns 65 | self._data = merged_sample_name 66 | 67 | def write_template(self): 68 | """write a filled ipython notebook to disk 69 | 70 | :return: 71 | """ 72 | 73 | j2_env = Environment(loader=FileSystemLoader(self._this_dir), trim_blocks=True) 74 | rendered = j2_env.get_template('analysis_template.json').render( 75 | output_stem=self._output_stem, 76 | data=os.path.abspath(self.merged_data), 77 | ) 78 | with open(self._output_stem + '.ipynb', 'w') as fdw: 79 | fdw.write(rendered) 80 | 81 | def run_notebook(self, notebook_filename=None): 82 | 83 | if not notebook_filename: 84 | notebook_filename = self._output_stem + '.ipynb' 85 | 86 | dir_ = os.getcwd() 87 | with open(notebook_filename) as f: 88 | nb = nbformat.read(f, as_version=4) 89 | 90 | ep = ExecutePreprocessor(timeout=600, kernel_name='python3') 91 | ep.preprocess(nb, {'metadata': {'path': dir_}}) 92 | 93 | with open(notebook_filename, 'wt') as f: 94 | nbformat.write(nb, f) 95 | -------------------------------------------------------------------------------- /src/seqc/notebooks/test_notebooks.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | import tempfile 3 | import pytest 4 | import numpy as np 5 | import pandas as pd 6 | import uuid 7 | import os 8 | from seqc.core import main 9 | 10 | 11 | @pytest.fixture() 12 | def testing_data(): 13 | dir_ = tempfile.mkdtemp() 14 | test_data = [np.random.randint(10, 110, (100, 100)) for _ in range(4)] 15 | test_files = [] 16 | for f in test_data: 17 | filename = '{}/{}'.format(dir_, uuid.uuid4()) 18 | pd.DataFrame(f).to_csv(filename) 19 | test_files.append(filename) 20 | return test_files 21 | 22 | 23 | @pytest.fixture() 24 | def merged_data(testing_data): 25 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 26 | n = notebooks.Notebook(output_stem, *testing_data) 27 | n.merge_data() 28 | return n.merged_data 29 | 30 | 31 | def test_template_filling(testing_data): 32 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 33 | n = notebooks.Notebook(output_stem, *testing_data) 34 | n.merge_data() 35 | n.write_template() 36 | n.run_notebook() 37 | print(os.listdir(os.path.dirname(output_stem))) 38 | 39 | 40 | def test_merge_api(testing_data): 41 | output_filename = os.path.join(tempfile.mkdtemp(), 'test_notebooks.ipynb') 42 | args = ['notebook', 'merge', '-o', output_filename, '-i'] + testing_data 43 | main.main(args) 44 | 45 | 46 | def test_generate_api(merged_data): 47 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 48 | args = ['notebook', 'generate', '-o', output_stem, '-i', merged_data] 49 | main.main(args) 50 | 51 | -------------------------------------------------------------------------------- /src/seqc/reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import bz2 4 | 5 | 6 | class Reader: 7 | """ 8 | Basic reader object that seamlessly loops over multiple input files 9 | 10 | Can be subclassed to create readers for specific file types (fastq, gtf, etc.) 11 | """ 12 | 13 | def __init__(self, files_): 14 | 15 | if isinstance(files_, list): 16 | self._files = files_ 17 | elif isinstance(files_, str): 18 | self._files = [files_] 19 | else: 20 | raise TypeError('files_ must be a string filename or a list of such names.') 21 | 22 | @property 23 | def filenames(self): 24 | return self._files 25 | 26 | def __len__(self): 27 | """ 28 | return the length of the Reader object. This depends on the implementation of 29 | self.__iter__(); it does not necessarily represent the length of the file in 30 | lines. 
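# --- editor's illustrative sketch (not part of reader.py) --------------------
# Reader accepts a filename or a list of filenames and transparently opens
# .gz, .bz2, or plain files, yielding raw bytes records from each in turn.
# The file names below are hypothetical.
from seqc.reader import Reader

r = Reader(['lane1.fastq.gz', 'lane2.fastq'])
# r.size                     # combined size of both files in bytes
# for record in r:           # bytes lines from lane1, then lane2
#     do_something(record)
# ------------------------------------------------------------------------------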
31 | """ 32 | return sum(1 for _ in self) 33 | 34 | def __iter__(self): 35 | for f in self._files: 36 | if f.endswith('.gz'): 37 | file_input = gzip.open(f, 'rb') 38 | elif f.endswith('.bz2'): 39 | file_input = bz2.open(f, 'rb') 40 | else: 41 | file_input = open(f, 'rb') 42 | for record in file_input: 43 | yield record 44 | file_input.close() 45 | 46 | @property 47 | def size(self) -> int: 48 | """return the collective size of all files being read in bytes""" 49 | return sum(os.stat(f).st_size for f in self._files) 50 | -------------------------------------------------------------------------------- /src/seqc/run_mast.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(MAST)) 2 | suppressPackageStartupMessages({library(data.table)}) 3 | options(mc.cores = 1) # gives me error messages when I use > 1 4 | 5 | loadData <- function(input_data) { 6 | df <- (read.csv(input_data, row.names=NULL)) 7 | } 8 | 9 | extractConditions <- function(df) { 10 | # extract conditions (sg) from column names of the df 11 | sg <- factor(unlist(df[1])) 12 | return(sg) 13 | } 14 | 15 | annotateDF <- function(df, sg) { 16 | df[1] <- NULL 17 | df <- t(df) 18 | names(df) <- sg 19 | return(df) 20 | } 21 | 22 | runMAST <- function(df, sg) { 23 | # extract columns and row information 24 | # add a cell number column to avoid duplicate row names 25 | wellKey <- seq_len(dim(df)[2]) 26 | wellKey <- lapply(wellKey, toString) 27 | condition <- as.numeric(unlist(as.list(sg))) 28 | cdata <- data.frame(cbind(wellKey=wellKey, condition=condition)) 29 | fdata <- data.frame(primerid=row.names(df)) 30 | 31 | # create the sca object. Note that we do filtering before 32 | # we create the test matrix, so no additional filtering of cells is added here 33 | exprsArray <- as.matrix(df) 34 | dimnames(exprsArray)[[2]] <- cdata$wellKey 35 | sca <- FromMatrix(exprsArray, cdata, fdata) 36 | 37 | # calculate cellular detection rate 38 | cdr2 <-colSums(assay(sca)>0) 39 | colData(sca)$cngeneson <- scale(cdr2) 40 | colData(sca)$cond <- as.numeric(unlist(as.list(sg))) 41 | 42 | # carry out DE analysis 43 | zlmCond <- zlm.SingleCellAssay(~cond + cngeneson, sca) 44 | #res <- lrTest(zlmCond, CoefficientHypothesis("cond")) 45 | 46 | #only test the cluster coefficient. 47 | summaryCond <- summary(zlmCond, doLRT=TRUE) 48 | summaryDt <- summaryCond$datatable 49 | fcHurdle <- merge(summaryDt[contrast=='cond' & component=='H',.(primerid, `Pr(>Chisq)`)], summaryDt[contrast=='cond' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') 50 | 51 | fcHurdle <- fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')] 52 | fcHurdleSig <- fcHurdle[(fdr<=0.05) & (abs(coef)>=log2(1.25)) ] 53 | setorder(fcHurdleSig, fdr) 54 | 55 | return(fcHurdleSig) 56 | } 57 | 58 | saveResult <- function(result, filename) { 59 | resultDf <- as.data.frame(result) 60 | colnames(resultDf)[1] = 'gene' 61 | colnames(resultDf)[2] = 'p' 62 | colnames(resultDf)[3] = 'logFC' 63 | colnames(resultDf)[6] = 'p.fdr.adj' 64 | resultDf <- resultDf[,c('gene','p','p.fdr.adj','logFC')] 65 | write.table(resultDf, file = filename, row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE) 66 | } 67 | 68 | testMAST <- function(input_filename, save_filename) { 69 | df <- loadData(input_filename) 70 | sg <- extractConditions(df) 71 | df <- annotateDF(df, sg) 72 | result <- runMAST(df, sg) 73 | saveResult(result, save_filename) 74 | } 75 | 76 | # args should be: 77 | # 1. input_filename 78 | # 2. 
output_filename 79 | 80 | args <- commandArgs(trailingOnly = TRUE) 81 | stopifnot(length(args) == 2) 82 | 83 | testMAST(args[1], args[2]) 84 | -------------------------------------------------------------------------------- /src/seqc/sequence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/sequence/__init__.py -------------------------------------------------------------------------------- /src/seqc/sequence/barcodes.py: -------------------------------------------------------------------------------- 1 | from seqc.sequence.encodings import DNA3Bit 2 | from sys import maxsize 3 | 4 | # todo document me 5 | def generate_hamming_dist_1(seq): 6 | """ Return a list of all sequences that are up to 1 hamming distance from seq 7 | :param seq: 8 | """ 9 | res = [] 10 | l = DNA3Bit.seq_len(seq) 11 | #=barcode 12 | 13 | # generate all sequences that are dist 1 14 | for i in range(l): 15 | mask = 0b111 << (i * 3) 16 | cur_chr = (seq & mask) >> (i * 3) 17 | res += [seq & (~mask) | (new_chr << (i * 3)) 18 | for new_chr in DNA3Bit.bin2strdict.keys() if new_chr != cur_chr] 19 | 20 | return res 21 | 22 | 23 | def find_correct_barcode(code, barcodes_list, exact_match=False): 24 | """ 25 | For a given barcode find the closest correct barcode to it from the list (limited to 26 | one ED), a string representing the error and the edit distance 27 | NOTE: for now this function looks for a barcode with ED==1 and does not bother 28 | looking for the minimum 29 | 30 | :param exact_match: 31 | :param barcodes_list: 32 | :param code: 33 | :returns: 34 | """ 35 | 36 | # Return the barcode if it exists 37 | if code in barcodes_list: 38 | return code, 0 39 | 40 | # If perfect match is required, return an error since the barcode does not appear 41 | # in the correct barcode list 42 | if exact_match: 43 | return 0, maxsize 44 | 45 | min_ed = maxsize 46 | cor_code = 0 47 | for bc in barcodes_list: 48 | hamm_d = hamming_dist_bin(code, bc) 49 | if hamm_d == 1: 50 | min_ed = 1 51 | cor_code = bc 52 | break 53 | if hamm_d < min_ed: 54 | min_ed = hamm_d 55 | cor_code = bc 56 | 57 | return cor_code, min_ed 58 | 59 | 60 | def hamming_dist_bin(c1, c2): 61 | """Return the hamming distance between two numbers representing a sequence (3 bits 62 | per base) 63 | 64 | :param c1: 65 | :param c2: 66 | :return: 67 | """ 68 | if DNA3Bit.seq_len(c1) != DNA3Bit.seq_len(c2): 69 | return maxsize 70 | d = 0 71 | while c1 > 0: 72 | if c1 & 0b111 != c2 & 0b111: 73 | d += 1 74 | c1 >>= 3 75 | c2 >>= 3 76 | return d 77 | 78 | 79 | def list_errors(s1, s2): 80 | """ 81 | Return the list of nucleotide transformations that turn s1 to s2. 82 | An error is a six bit int representing a two chr string of type "AG","CT", etc. 83 | 84 | :param s2: 85 | :param s1: 86 | 87 | :returns: 88 | """ 89 | 90 | # return the actual error 91 | err_list = [] 92 | while s1 > 0: 93 | if s1 & 0b111 != s2 & 0b111: 94 | err_list.append((s1 & 0b111, s2 & 0b111)) 95 | s1 >>= 3 96 | s2 >>= 3 97 | return err_list 98 | -------------------------------------------------------------------------------- /src/seqc/sequence/encodings.py: -------------------------------------------------------------------------------- 1 | 2 | class DNA3Bit(object): 3 | """ 4 | Compact 3-bit encoding scheme for sequence data. 
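# --- editor's illustrative sketch (not part of encodings.py) -----------------
# A round trip through the 3-bit encoding: encode() accepts str or bytes and
# packs the sequence with the first base in the most significant bits;
# decode() returns bytes.
from seqc.sequence.encodings import DNA3Bit

code = DNA3Bit.encode('ACGTN')
assert DNA3Bit.decode(code) == b'ACGTN'
assert DNA3Bit.seq_len(code) == 5
# ------------------------------------------------------------------------------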
5 | """ 6 | 7 | @staticmethod 8 | def bits_per_base(): 9 | return 3 10 | 11 | # TODO: The sam reader needs to be fixed so text files are read as text not binary 12 | str2bindict = {65: 0b100, 67: 0b110, 71: 0b101, 84: 0b011, 78: 0b111, 13 | 97: 0b100, 99: 0b110, 103: 0b101, 116: 0b011, 110: 0b111, 14 | 'A': 0b100, 'C': 0b110, 'G': 0b101, 'T': 0b011, 'N': 0b111, 15 | 'a': 0b100, 'c': 0b110, 'g': 0b101, 't': 0b011, 'n': 0b111} 16 | bin2strdict = {0b100: b'A', 0b110: b'C', 0b101: b'G', 0b011: b'T', 0b111: b'N'} 17 | 18 | @staticmethod 19 | def encode(b) -> int: 20 | """ 21 | Convert string nucleotide sequence into binary, note: string is stored so 22 | that the first nucleotide is in the MSB position 23 | 24 | :param bytes|str b: sequence containing nucleotides to be encoded 25 | """ 26 | res = 0 27 | for c in b: 28 | res <<= 3 29 | res += DNA3Bit.str2bindict[c] 30 | return res 31 | 32 | @staticmethod 33 | def decode(i: int) -> bytes: 34 | """ 35 | Convert binary nucleotide sequence into string 36 | 37 | :param i: int, encoded sequence to be converted back to nucleotides 38 | """ 39 | if i < 0: 40 | message = 'i must be an unsigned (positive) integer, not {0!s}'.format(i) 41 | raise ValueError(message) 42 | r = b'' 43 | while i > 0: 44 | r = DNA3Bit.bin2strdict[i & 0b111] + r 45 | i >>= 3 46 | return r 47 | 48 | # TODO: another ooption is to use i.bit_length and take into account preceding 0's 49 | @staticmethod 50 | def seq_len(i: int) -> int: 51 | """ 52 | Return the length of an encoded sequence based on its binary representation 53 | 54 | :param i: int, encoded sequence 55 | """ 56 | l = 0 57 | while i > 0: 58 | l += 1 59 | i >>= 3 60 | return l 61 | 62 | @staticmethod 63 | def contains(s: int, char: int) -> bool: 64 | """ 65 | return true if the char (bin representation) is contained in seq (binary 66 | representation) 67 | 68 | :param char: int, encoded character (one must be only one nucleotide) 69 | :param s: int, sequence of encoded nucleotides 70 | """ 71 | while s > 0: 72 | if char == (s & 0b111): 73 | return True 74 | s >>= 3 75 | return False 76 | 77 | @staticmethod 78 | def ints2int(ints): 79 | """ 80 | convert an iterable of sequences [i1, i2, i3] into a concatenated single integer 81 | 0bi1i2i3. In cases where the sequence is longer than 64 bits, python will 82 | transition seamlessly to a long int representation, however the user must be 83 | aware that downsteam interaction with numpy or other fixed-size representations 84 | may not function 85 | 86 | :param ints: iterable of encoded sequences to concatenate 87 | """ 88 | 89 | res = 0 90 | for num in ints: 91 | tmp = num 92 | # Get length of next number to concatenate (with enough room for leading 0's) 93 | while tmp > 0: 94 | res <<= 3 95 | tmp >>= 3 96 | res += num 97 | return res 98 | 99 | @staticmethod 100 | def count(seq, char_bin): 101 | """ 102 | count how many times char is in seq. 103 | char needs to be an encoded value of one of the bases. 
104 | """ 105 | if char_bin not in DNA3Bit.bin2strdict.keys(): 106 | raise ValueError("DNA3Bit.count was called with an invalid char code - " 107 | "{}".format(char_bin)) 108 | res = 0 109 | while seq > 0: 110 | if seq & 0b111 == char_bin: 111 | res += 1 112 | seq >>= 3 113 | return res 114 | 115 | 116 | # TODO: this was written for tests, not sure it's being used anymore 117 | # @staticmethod 118 | # def gc_content(i: int) -> float: 119 | # """ 120 | # calculates percentage of nucleotides in i that is G or C# 121 | # 122 | # :param i: int, encoded sequence 123 | # """ 124 | # gc = 0 125 | # length = 0 126 | # while i > 0: 127 | # length += 1 128 | # masked = i & 111 129 | # if masked == 0b100 or masked == 0b100: 130 | # gc += 1 131 | # i >>= 3 132 | # return gc / length 133 | -------------------------------------------------------------------------------- /src/seqc/sequence/fastq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from seqc import reader 4 | 5 | 6 | class FastqRecord: 7 | """Fastq record object 8 | 9 | Defines several properties for accessing fastq record information: 10 | :property name: name field 11 | :property sequence: sequence field 12 | :property name2: second name field 13 | :property quality: quality field 14 | 15 | Also defines several methods for accessing SEQC annotation fields: 16 | :property annotations: list of annotations 17 | :property metadata: dictionary of read metadata (if any present) 18 | :property average_quality: return the mean quality of FastqRecord 19 | """ 20 | 21 | __slots__ = ["_data"] 22 | 23 | def __init__(self, record: [bytes, bytes, bytes, bytes]): 24 | self._data = list(record) 25 | 26 | @property 27 | def name(self) -> bytes: 28 | return self._data[0] 29 | 30 | @name.setter 31 | def name(self, value: bytes): 32 | self._data[0] = value 33 | 34 | @property 35 | def sequence(self) -> bytes: 36 | return self._data[1] 37 | 38 | @sequence.setter 39 | def sequence(self, value: bytes): 40 | self._data[1] = value 41 | 42 | @property 43 | def name2(self) -> bytes: 44 | return self._data[2] 45 | 46 | @name2.setter 47 | def name2(self, value: bytes): 48 | self._data[2] = value 49 | 50 | @property 51 | def quality(self) -> bytes: 52 | return self._data[3] 53 | 54 | @quality.setter 55 | def quality(self, value: bytes): 56 | self._data[3] = value 57 | 58 | def __bytes__(self) -> bytes: 59 | return b"".join(self._data) 60 | 61 | def __str__(self) -> str: 62 | return bytes(self).decode() 63 | 64 | def __len__(self) -> int: 65 | return len(self.sequence) 66 | 67 | @property 68 | def annotations(self) -> list: 69 | """ 70 | returns: 71 | -------- 72 | list of annotations present in the fastq header 73 | """ 74 | try: 75 | end = self.name.index(b";") 76 | return self.name[:end].split(b":") 77 | except ValueError: 78 | return [] 79 | 80 | @property 81 | def metadata(self) -> dict: 82 | """ 83 | returns: 84 | -------- 85 | dictionary of annotations and fields, if any are present""" 86 | try: 87 | start = self.name.rindex(b"|") 88 | except ValueError: 89 | return {} 90 | fields = {} 91 | for field in self.name[start + 1 :].split(b":"): 92 | k, v = field.split(b"=") 93 | fields[k] = v 94 | return fields 95 | 96 | def add_annotation(self, values) -> None: 97 | """prepends a list of annotations to the name field of self.name 98 | :param values: 99 | """ 100 | self._data[0] = b"@" + b":".join(values) + b";" + self.name[1:] 101 | 102 | def add_metadata(self, values) -> None: 103 | 
"""appends a list of metadata fields to the name field of self.name 104 | :param values: 105 | """ 106 | self.name += b"|" + b":".join(k + "=" + v for k, v in values.items()) 107 | 108 | def average_quality(self) -> int: 109 | """""" 110 | return ( 111 | np.mean(np.frombuffer(self.quality, dtype=np.int8, count=len(self))).astype( 112 | int 113 | ) 114 | - 33 115 | ) 116 | 117 | 118 | class Reader(reader.Reader): 119 | """ 120 | Fastq Reader, defines some special methods for reading and summarizing fastq data: 121 | 122 | :method __iter__: Iterator over fastq Record objects 123 | :method __len__: return number of records in file 124 | :method estimate_sequence_length: estimate the length of fastq sequences in file 125 | """ 126 | 127 | @staticmethod 128 | def record_grouper(iterable): 129 | args = [iter(iterable)] * 4 130 | return zip(*args) 131 | 132 | def __iter__(self): 133 | for record in self.record_grouper(super().__iter__()): 134 | yield FastqRecord(record) 135 | 136 | def __len__(self): 137 | """ 138 | return the length of the Reader object. This depends on the implementation of 139 | self.__iter__(); it does not necessarily represent the length of the file in 140 | lines. 141 | """ 142 | return sum(1 for _ in self) / 4 143 | 144 | def estimate_sequence_length(self): 145 | """ 146 | estimate the sequence length of a fastq file from the first 10000 records of 147 | the file. 148 | 149 | :return: int mean, float standard deviation, (np.ndarray: observed lengths, 150 | np.ndarray: counts per length) 151 | """ 152 | i = 0 153 | records = iter(self) 154 | data = np.empty(10000, dtype=int) 155 | while i < 10000: 156 | try: 157 | seq = next(records).sequence 158 | except StopIteration: # for fastq files shorter than 10000 records 159 | data = data[:i] 160 | break 161 | data[i] = len(seq) - 1 # last character is a newline 162 | i += 1 163 | return np.mean(data), np.std(data), np.unique(data, return_counts=True) 164 | 165 | 166 | def merge_paired(merge_function, fout, genomic, barcode=None) -> (str, int): 167 | """ 168 | General function to annotate genomic fastq with barcode information from reverse read. 169 | Takes a merge_function which indicates which kind of platform was used to generate 170 | the data, and specifies how the merging should be done. 
171 | 172 | :param merge_function: function from merge_functions.py 173 | :param fout: merged output file name 174 | :param genomic: fastq containing genomic data 175 | :param barcode: fastq containing barcode data 176 | :return str fout, filename of merged fastq file 177 | 178 | """ 179 | directory, filename = os.path.split(fout) 180 | if directory and not os.path.isdir(directory): 181 | os.makedirs(directory, exist_ok=True) 182 | genomic = Reader(genomic) 183 | if barcode: 184 | barcode = Reader(barcode) 185 | with open(fout, "wb") as f: 186 | for g, b in zip(genomic, barcode): 187 | r = merge_function(g, b) 188 | f.write(bytes(r)) 189 | else: 190 | with open(fout, "wb") as f: 191 | for g in genomic: 192 | r = merge_function(g) 193 | f.write(bytes(r)) 194 | 195 | return fout 196 | 197 | 198 | def truncate(fastq_file, lengths): 199 | """ 200 | 201 | :param str fastq_file: the input fastq file 202 | :param [int] lengths: a list of integer lengths to truncate the input fastq file 203 | :return: 204 | """ 205 | # get sequence length of input file 206 | r = Reader(fastq_file) 207 | length = None 208 | for record in r: 209 | length = len(record.sequence) 210 | break 211 | 212 | print("sequence length in file is %d" % length) 213 | 214 | # remove any lengths longer than sequence length of file 215 | lengths = sorted([l for l in lengths if l < length])[::-1] # largest to smallest 216 | 217 | # open a bunch of files 218 | files = [] 219 | for l in lengths: 220 | name = ( 221 | fastq_file.replace(".gz", "").replace(".fastq", "") + "_%d_" % l + ".fastq" 222 | ) 223 | files.append(open(name, "wb")) 224 | 225 | i = 0 226 | indices = list(range(len(lengths))) 227 | for record in r: 228 | if i > 10e6: 229 | break 230 | for j in indices: 231 | record.sequence = record.sequence[:-1][: lengths[j]] + b"\n" 232 | record.quality = record.quality[:-1][: lengths[j]] + b"\n" 233 | files[j].write(bytes(record)) 234 | i += 1 235 | 236 | for f in files: 237 | f.close() 238 | -------------------------------------------------------------------------------- /src/seqc/sparse_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from collections import OrderedDict 5 | from seqc.sequence.gtf import create_gene_id_to_official_gene_symbol_map 6 | from seqc.sequence.gtf import ensembl_gene_id_to_official_gene_symbol 7 | 8 | 9 | class SparseFrame: 10 | def __init__(self, data, index, columns): 11 | """ 12 | lightweight wrapper of scipy.stats.coo_matrix to provide pd.DataFrame-like access 13 | to index, column, and shape properties. 
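# --- editor's illustrative sketch (not part of sparse_frame.py) --------------
# Constructing a SparseFrame directly from a coo_matrix plus row/column labels;
# the tiny count matrix below is made up for illustration.
import numpy as np
from scipy.sparse import coo_matrix
from seqc.sparse_frame import SparseFrame

counts = coo_matrix(np.array([[1, 0], [0, 3]]))
sf = SparseFrame(counts, index=np.array([10, 11]), columns=np.array([100, 101]))
sf.shape          # (2, 2)
sf.sum(axis=0)    # per-column (gene) sums
# ------------------------------------------------------------------------------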
14 | 15 | :param data: scipy.stats.coo_matrix 16 | :param index: np.ndarray: row index 17 | :param columns: np.ndarray: column index 18 | 19 | :property data: scipy.stats.coo_matrix 20 | :property index: np.ndarray row index 21 | :property columns: np.ndarray column index 22 | :property shape: (int, int), number of rows and columns 23 | :method sum: wrapper of np.sum() 24 | """ 25 | 26 | if not isinstance(data, coo_matrix): 27 | raise TypeError("data must be type coo_matrix") 28 | if not isinstance(index, np.ndarray): 29 | raise TypeError("index must be type np.ndarray") 30 | if not isinstance(columns, np.ndarray): 31 | raise TypeError("columns must be type np.ndarray") 32 | 33 | self._data = data 34 | self._index = index 35 | self._columns = columns 36 | 37 | @property 38 | def data(self): 39 | return self._data 40 | 41 | @data.setter 42 | def data(self, item): 43 | if not isinstance(item, coo_matrix): 44 | raise TypeError("data must be type coo_matrix") 45 | self._data = item 46 | 47 | @property 48 | def index(self): 49 | return self._index 50 | 51 | @index.setter 52 | def index(self, item): 53 | try: 54 | self._index = np.array(item) 55 | except: 56 | raise TypeError("self.index must be convertible into a np.array object") 57 | 58 | @property 59 | def columns(self): 60 | return self._columns 61 | 62 | @columns.setter 63 | def columns(self, item): 64 | try: 65 | self._columns = np.array(item) 66 | except: 67 | raise TypeError("self.columns must be convertible into a np.array object") 68 | 69 | @property 70 | def shape(self): 71 | return len(self.index), len(self.columns) 72 | 73 | def sum(self, axis=0): 74 | """ 75 | sum over provided axis 76 | 77 | :param axis: options: 0 (rows) or 1 (columns) 78 | :return: np.ndarray vector of column or row sums 79 | """ 80 | return self.data.sum(axis=axis) 81 | 82 | @classmethod 83 | def from_dict(cls, dictionary, genes_to_symbols=False): 84 | """create a SparseFrame from a dictionary 85 | 86 | :param dict dictionary: dictionary in form (cell, gene) -> count 87 | :param str|bool genes_to_symbols: convert genes into symbols. If not False, user 88 | must provide the location of a .gtf file to carry out conversion. 
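# --- editor's illustrative sketch (not part of sparse_frame.py) --------------
# Building a SparseFrame from a (cell, gene) -> count dictionary, keeping the
# integer gene ids (genes_to_symbols left at its default of False). The keys
# and counts are made up for illustration.
from seqc.sparse_frame import SparseFrame

counts = {(0, 7): 2, (0, 9): 1, (3, 7): 5}
sf = SparseFrame.from_dict(counts)
sf.index      # unique cell ids: array([0, 3])
sf.columns    # unique gene ids: array([7, 9])
# ------------------------------------------------------------------------------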
Otherwise the 89 | column index will retain the original integer ids 90 | :return SparseFrame: SparseFrame containing dictionary data 91 | """ 92 | 93 | # todo this throws an uninformative error in the case that there are no active 94 | # reads in the ReadArray 95 | i, j = (np.array(v, dtype=int) for v in zip(*dictionary.keys())) 96 | data = np.fromiter(dictionary.values(), dtype=int) 97 | 98 | # map cells to small values 99 | uniq_i = np.unique(i) 100 | imap = OrderedDict(zip(uniq_i, np.arange(uniq_i.shape[0]))) 101 | 102 | uniq_j = np.unique(j) 103 | jmap = OrderedDict(zip(uniq_j, np.arange(uniq_j.shape[0]))) 104 | 105 | i_inds = np.fromiter((imap[v] for v in i), dtype=int) 106 | j_inds = np.fromiter((jmap[v] for v in j), dtype=int) 107 | 108 | coo = coo_matrix( 109 | (data, (i_inds, j_inds)), shape=(len(imap), len(jmap)), dtype=np.int32 110 | ) 111 | 112 | index = np.fromiter(imap.keys(), dtype=int) 113 | columns = np.fromiter(jmap.keys(), dtype=int) 114 | 115 | if genes_to_symbols: 116 | if not os.path.isfile(genes_to_symbols): 117 | raise ValueError( 118 | "genes_to_symbols argument %s is not a valid annotation " 119 | "file" % repr(genes_to_symbols) 120 | ) 121 | gmap = create_gene_id_to_official_gene_symbol_map(genes_to_symbols) 122 | columns = np.array( 123 | ensembl_gene_id_to_official_gene_symbol(columns, gene_id_map=gmap) 124 | ) 125 | 126 | return cls(coo, index, columns) 127 | -------------------------------------------------------------------------------- /src/seqc/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from .ttest import bootstrap_t as ttest 2 | from .gsea import GSEA as gsea 3 | from .correlation import correlation 4 | from .anova import ANOVA as anova 5 | from .graph_diffusion import GraphDiffusion as graph_diffusion 6 | from .smoothing import smoothing 7 | from .tree import Tree as tree 8 | from .pca import PCA as pca 9 | from .tsne import TSNE as tsne 10 | from .g_test import g_test 11 | from .mast import run_mast 12 | from .resampled_nonparametric import mannwhitneyu, kruskalwallis -------------------------------------------------------------------------------- /src/seqc/stats/anova.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import namedtuple 3 | import numpy as np 4 | import pandas as pd 5 | from functools import partial 6 | from scipy.stats.mstats import kruskalwallis, rankdata 7 | from scipy.stats import t 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | class ANOVA: 11 | 12 | def __init__(self, data, group_assignments, alpha=0.05): 13 | """ 14 | Carry out ANOVA between the groups of data 15 | 16 | :param data: n cells x k genes 2d array 17 | :param group_assignments: n cells 1d vector 18 | :param alpha: float (0, 1], acceptable type I error 19 | """ 20 | # make sure group_assignments and data have the same length 21 | warnings.warn('DeprecationWarning: This function is deprecated.') 22 | if not data.shape[0] == group_assignments.shape[0]: 23 | raise ValueError( 24 | 'Group assignments shape ({!s}) must equal the number of rows in data ' 25 | '({!s}).'.format(group_assignments.shape[0], data.shape[0])) 26 | 27 | # todo 28 | # may want to verify that each group has at least two observations 29 | # (else variance won't work) 30 | 31 | # store index if both data and group_assignments are pandas objects 32 | if isinstance(data, pd.DataFrame) and isinstance(group_assignments, pd.Series): 33 | # ensure 
assignments and data indices are aligned 34 | try: 35 | ordered_assignments = group_assignments[data.index] 36 | if not len(ordered_assignments) == data.shape[0]: 37 | raise ValueError( 38 | 'Index mismatch between data and group_assignments detected when ' 39 | 'aligning indices. check for duplicates.') 40 | except: 41 | raise ValueError('Index mismatch between data and group_assignments.') 42 | 43 | # sort data by cluster assignment 44 | idx = np.argsort(ordered_assignments.values) 45 | self.data = data.iloc[idx, :].values 46 | ordered_assignments = ordered_assignments.iloc[idx] 47 | self.group_assignments = ordered_assignments.values 48 | self.index = data.columns 49 | 50 | else: # get arrays from input values 51 | self.index = None # inputs were not all indexed pandas objects 52 | 53 | try: 54 | data = np.array(data) 55 | except: 56 | raise ValueError('data must be convertible to a np.ndarray') 57 | 58 | try: 59 | group_assignments = np.array(group_assignments) 60 | except: 61 | raise ValueError('group_assignments must be convertible to a np.ndarray') 62 | 63 | idx = np.argsort(group_assignments) 64 | self.data = data[idx, :] 65 | self.group_assignments = group_assignments[idx] 66 | 67 | self.post_hoc = None 68 | self.groups = np.unique(group_assignments) 69 | 70 | # get points to split the array, create slicers for each group 71 | self.split_indices = np.where(np.diff(self.group_assignments))[0] + 1 72 | # todo is this a faster way of calculating the below anova? 73 | # self.array_views = np.array_split(self.data, self.split_indices, axis=0) 74 | 75 | if not 0 < alpha <= 1: 76 | raise ValueError('Parameter alpha must fall within the interval (0, 1].') 77 | self.alpha = alpha 78 | 79 | self._anova = None 80 | 81 | def anova(self, min_mean_expr=None): 82 | """ 83 | carry out non-parametric ANOVA across the groups of self. 84 | 85 | :param min_mean_expr: minimum average gene expression value that must be reached 86 | in at least one cluster for the gene to be considered 87 | :return: 88 | """ 89 | if self._anova is not None: 90 | return self._anova 91 | 92 | # run anova 93 | f = lambda v: kruskalwallis(*np.split(v, self.split_indices))[1] 94 | pvals = np.apply_along_axis(f, 0, self.data) # todo could shunt to a multiprocessing pool 95 | 96 | # correct the pvals 97 | _, pval_corrected, _, _ = multipletests(pvals, self.alpha, method='fdr_tsbh') 98 | 99 | # store data & return 100 | if self.index is not None: 101 | self._anova = pd.Series(pval_corrected, index=self.index) 102 | else: 103 | self._anova = pval_corrected 104 | return self._anova 105 | 106 | def post_hoc_tests(self): 107 | """ 108 | carries out post-hoc tests between genes with significant ANOVA results using 109 | Welch's U-test on ranked data. 110 | """ 111 | if self._anova is None: 112 | self.anova() 113 | 114 | anova_significant = np.array(self._anova) < 1 # call array in case it is a Series 115 | 116 | # limit to significant data, convert to column-wise ranks. 
117 | data = self.data[:, anova_significant] 118 | rank_data = np.apply_along_axis(rankdata, 0, data) 119 | # assignments = self.group_assignments[anova_significant] 120 | 121 | split_indices = np.where(np.diff(self.group_assignments))[0] + 1 122 | array_views = np.array_split(rank_data, split_indices, axis=0) 123 | 124 | # get mean and standard deviations of each 125 | fmean = partial(np.mean, axis=0) 126 | fvar = partial(np.var, axis=0) 127 | mu = np.vstack(list(map(fmean, array_views))).T # transpose to get gene rows 128 | n = np.array(list(map(lambda x: x.shape[0], array_views))) 129 | s = np.vstack(list(map(fvar, array_views))).T 130 | s_norm = s / n # transpose to get gene rows 131 | 132 | # calculate T 133 | numerator = mu[:, np.newaxis, :] - mu[:, :, np.newaxis] 134 | denominator = np.sqrt(s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) 135 | statistic = numerator / denominator 136 | 137 | # calculate df 138 | s_norm2 = s**2 / (n**2 * n-1) 139 | numerator = (s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) ** 2 140 | denominator = (s_norm2[:, np.newaxis, :] + s_norm2[:, :, np.newaxis]) 141 | df = np.floor(numerator / denominator) 142 | 143 | # get significance 144 | p = t.cdf(np.abs(statistic), df) # note, two tailed test 145 | 146 | # calculate fdr correction; because above uses 2-tails, alpha here is halved 147 | # because each test is evaluated twice due to the symmetry of vectorization. 148 | p_adj = multipletests(np.ravel(p), alpha=self.alpha, method='fdr_tsbh')[1] 149 | p_adj = p_adj.reshape(*p.shape) 150 | 151 | phr = namedtuple('PostHocResults', ['p_adj', 'statistic', 'mu']) 152 | self.post_hoc = phr(p_adj, statistic, mu) 153 | 154 | if self.index is not None: 155 | p_adj = pd.Panel( 156 | p_adj, items=self.index[anova_significant], major_axis=self.groups, 157 | minor_axis=self.groups) 158 | statistic = pd.Panel( 159 | statistic, items=self.index[anova_significant], major_axis=self.groups, 160 | minor_axis=self.groups) 161 | mu = pd.DataFrame(mu, self.index[anova_significant], columns=self.groups) 162 | 163 | return p_adj, statistic, mu 164 | 165 | def population_markers(self, p_crit=0.0): 166 | """ 167 | Return markers that are significantly differentially expressed in one 168 | population vs all others 169 | 170 | :param p_crit: float, fraction populations that may be indistinguishable from the 171 | highest expressing population for each gene. If zero, each marker gene is 172 | significantly higher expressed in one population relative to all others. 173 | If 0.1, 10% of populations may share high expression of a gene, and those 174 | populations will be marked as expressing that gene. 175 | 176 | """ 177 | if self.post_hoc is None: 178 | self.post_hoc_tests() 179 | 180 | # get highest mean for each gene 181 | top_gene_idx = np.argmax(self.post_hoc.mu, axis=1) 182 | 183 | # index p_adj first dimension with each sample, will reduce to 2d genes x samples 184 | top_gene_sig = self.post_hoc.p_adj[:, top_gene_idx, :] 185 | 186 | # for each gene, count the number of non-significant DE results. 187 | sig = np.array(top_gene_sig < self.alpha) 188 | num_sig = np.sum(sig, axis=2) 189 | 190 | # if this is greater than N - 1 * p_crit, discard the gene. 
191 | n = self.post_hoc.p_adj.shape[2] - 1 # number of genes, sub 1 for self 192 | idx_marker_genes = np.where(num_sig < n * (1 - p_crit)) 193 | marker_genes = sig[idx_marker_genes, :] 194 | 195 | # correctly index these genes 196 | if self.index: 197 | pass # todo fix this 198 | 199 | return marker_genes 200 | -------------------------------------------------------------------------------- /src/seqc/stats/correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class correlation: 6 | """Fast vectorized correlation methods 7 | 8 | :method vector(x, y): correlate each column in y with a vector in x 9 | :method map(x, y): correlate each column of x with each column in y 10 | :method eigv(evec, data): get pairwise correlations of eigenvectors 11 | with columns of data 12 | """ 13 | 14 | @staticmethod 15 | def vector(x: np.array, y: np.array): 16 | """ 17 | Correlate each column in y with a vector x 18 | 19 | :param x: np.ndarray vector of length n 20 | :param y: np.ndarray matrix of shape (n, k) 21 | :returns: vector of length n 22 | """ 23 | # x = x[:, np.newaxis] # for working with matrices 24 | mu_x = x.mean() # cells 25 | mu_y = y.mean(axis=0) # cells by gene --> cells by genes 26 | sigma_x = x.std() 27 | sigma_y = y.std(axis=0) 28 | 29 | return ((y * x).mean(axis=0) - mu_y * mu_x) / (sigma_y * sigma_x) 30 | 31 | @staticmethod 32 | def map(x: np.ndarray, y: np.ndarray): 33 | """Correlate each row of x with each row of y 34 | 35 | :param x: np.array; shape N x T. 36 | :param y: np.array; shape M x T. 37 | :returns: np.array; shape N x M in which each element is a correlation 38 | coefficient. 39 | """ 40 | assert(x.shape[1] == y.shape[1]) 41 | n = x.shape[1] 42 | x_diff = x - x.mean(axis=-1)[:, None] 43 | y_diff = y - y.mean(axis=-1)[:, None] 44 | x_std = x.std(axis=-1) 45 | y_std = y.std(axis=-1) 46 | return np.dot(x_diff, y_diff.T) / (n * x_std[:, np.newaxis] * y_std) 47 | 48 | @staticmethod 49 | def eigv(evec, data, components=tuple(), knn=10): 50 | """ 51 | get pairwise correlations of eigenvectors with columns in data 52 | 53 | :param evec: eigenvectors 54 | :param data: np.ndarray genes x cells data matrix 55 | :param components: which eigenvectors to select 56 | :param knn: number of neighbors to smooth gene expression values over 57 | :return: 58 | """ 59 | if isinstance(data, pd.DataFrame): 60 | D = data.values 61 | elif isinstance(data, np.ndarray): 62 | D = data 63 | else: 64 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 65 | 66 | # set components, remove zero if it was specified 67 | if not components: 68 | components = np.arange(evec.shape[1]) 69 | else: 70 | components = np.array(components) 71 | components = components[components != 0] 72 | 73 | eigv_corr = np.empty((D.shape[1], evec.shape[1]), dtype=np.float) 74 | 75 | for component_index in components: 76 | component_data = evec[:, component_index] 77 | 78 | order = np.argsort(component_data) 79 | x = pd.DataFrame(component_data[order]).rolling( 80 | window=knn, center=False).mean()[knn:].values 81 | # this fancy indexing will copy self.molecules 82 | vals = pd.DataFrame(D[order, :]).rolling( 83 | window=knn, center=False, axis=0).mean()[knn:].values 84 | eigv_corr[:, component_index] = correlation.vector(x, vals) 85 | 86 | # this is sorted by order, need it in original order (reverse the sort) 87 | eigv_corr = eigv_corr[:, components] 88 | if isinstance(data, pd.DataFrame): 89 | eigv_corr = 
pd.DataFrame(eigv_corr, index=data.columns, columns=components) 90 | return eigv_corr 91 | -------------------------------------------------------------------------------- /src/seqc/stats/experimental_yield.py: -------------------------------------------------------------------------------- 1 | class ExperimentalYield: 2 | 3 | output = ( 4 | '{divide}\nINPUT\n{divide}\n' 5 | 'Total input reads:\t{n_fastq}\n' 6 | '{divide}\nALIGNMENT (% FROM INPUT)\n{divide}\n' 7 | 'Total reads aligned:\t{n_sam} ({prop_al}%)\n' 8 | ' - Genomic alignments:\t{genomic} ({prop_gen}%)\n' 9 | ' - PhiX alignments:\t{phi_x} ({prop_phix}%)\n' 10 | ' - Transcriptome alignments:\t{trans} ({prop_trans}%)\n' 11 | '{divide}\nFILTERING (% FROM ALIGNMENT)\n{divide}\n' 12 | 'Genomic alignments:\t{genomic} ({bad_gen}%)\n' 13 | 'PhiX alignments:\t{phi_x} ({bad_phi}%)\n' 14 | 'Incorrect barcodes:\t{wrong_cb} ({bad_cb}%)\n' 15 | 'Missing cell barcodes/RMT:\t{no_cell} ({bad_cell}%)\n' 16 | 'N present in RMT:\t{rmt_N} ({bad_rmtN}%)\n' 17 | 'N present in CB:\t{cell_N} ({bad_cellN}%)\n' 18 | 'Insufficient poly(T):\t{poly_t} ({bad_polyt}%)\n' 19 | 'High dust score:\t{dust} ({bad_dust}%)\n' 20 | '{divide}\nCELL/MOLECULE COUNT DISTRIBUTION\n{divide}\n' 21 | 'Total molecules:\t\t{tot_mc}\n' 22 | 'Molecules lost:\t{mols_lost}\n' 23 | 'Cells lost:\t{cells_lost}\n' 24 | 'Cell description:\n{cell_desc}\n' 25 | '{divide}\nSUMMARY\n{divide}\n' 26 | 'Total retained reads:\t{n_good} ({prop_good}%)\n' 27 | 'Total reads unaligned:\t{lost_al} ({prop_un}%)\n' 28 | 'Total reads filtered:\t{n_bad} ({prop_bad}%)\n' 29 | '{divide}\n') 30 | 31 | @classmethod 32 | def construct_run_summary(cls, summary: dict): 33 | """ 34 | calculates basic loss statistics and constructs a summary 35 | that will be sent to the user after the SEQC run has completed. 
36 | 37 | :param summary: dictionary constructed during error correction 38 | :return: output of basic summary statistics 39 | """ 40 | if not summary: 41 | return 42 | 43 | # obtain values from summary 44 | n_fastq = summary['n_fastq'] 45 | n_sam = summary['n_sam'] 46 | genomic = summary['gene_0'] 47 | phix = summary['phi_x'] 48 | no_cell = summary['cell_0'] 49 | # no_rmt = summary['rmt_0'] 50 | rmt_N = summary['rmt_N'] 51 | cell_N = summary['cell_N'] 52 | dust = summary['dust'] 53 | poly_t = summary['poly_t'] 54 | tot_mc = summary['total_mc'] 55 | mols_lost = list(summary['mols_lost'].items()) 56 | cells_lost = list(summary['cells_lost'].items()) 57 | cell_desc = summary['cell_desc'].to_string() 58 | divide = '-' * 40 59 | 60 | # run summary will not be calculated if user started SEQC midway 61 | if n_fastq == 'NA' or n_sam == 'NA': 62 | return 63 | 64 | # calculate summary statistics 65 | trans = n_sam - genomic - phix 66 | prop_al = round((n_sam/n_fastq) * 100, 1) 67 | prop_gen = round((genomic/n_sam) * 100, 1) 68 | prop_phix = round((phix/n_sam) * 100, 1) 69 | prop_trans = round((trans/n_sam) * 100, 1) 70 | lost_al = n_fastq - n_sam 71 | prop_un = round(100 - prop_al, 1) 72 | n_bad = genomic + phix + no_cell + rmt_N + cell_N + poly_t + dust 73 | # n_bad = genomic + phix + no_cell + no_rmt + rmt_N + poly_t 74 | # wrong_cb does not apply to drop-seq 75 | try: 76 | wrong_cb = summary['cb_wrong'] 77 | n_bad += wrong_cb 78 | bad_cb = round((wrong_cb/n_bad) * 100, 1) 79 | except KeyError: 80 | wrong_cb = 0 81 | bad_cb = 0 82 | # continue with calculations 83 | n_good = n_sam - n_bad 84 | bad_gen = round((genomic/n_bad) * 100, 1) 85 | bad_phi = round((phix/n_bad) * 100, 1) 86 | bad_cell = round((no_cell/n_bad) * 100, 1) 87 | # bad_rmt = round((no_rmt/n_bad) * 100, 1) 88 | bad_rmtN = round((rmt_N/n_bad) * 100, 1) 89 | bad_cellN = round((cell_N/n_bad) * 100, 1) 90 | bad_polyt = round((poly_t/n_bad) * 100, 1) 91 | bad_dust = round((dust/n_bad) * 100, 1) 92 | prop_bad = round((n_bad/n_fastq) * 100, 1) 93 | prop_good = round((n_good/n_fastq) * 100, 1) 94 | 95 | # format output 96 | output = cls.output.format( 97 | n_fastq=n_fastq, n_sam=n_sam, genomic=genomic, phi_x=phix, no_cell=no_cell, 98 | wrong_cb=wrong_cb, rmt_N=rmt_N, poly_t=poly_t, divide=divide, 99 | prop_al=prop_al, prop_gen=prop_gen, prop_phix=prop_phix, lost_al=lost_al, 100 | n_bad=n_bad, n_good=n_good, prop_good=prop_good, prop_bad=prop_bad, 101 | prop_un=prop_un, bad_gen=bad_gen, bad_phi=bad_phi, bad_cb=bad_cb, 102 | bad_cell=bad_cell, bad_rmtN=bad_rmtN, bad_polyt=bad_polyt, trans=trans, 103 | cell_N=cell_N, bad_cellN=bad_cellN, dust=dust, bad_dust=bad_dust, 104 | prop_trans=prop_trans, tot_mc=tot_mc, mols_lost=mols_lost, 105 | cells_lost=cells_lost, cell_desc=cell_desc) 106 | return output 107 | -------------------------------------------------------------------------------- /src/seqc/stats/g_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from contextlib import closing 4 | from multiprocessing import Pool 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def _assign(d): 9 | """ 10 | 11 | :param np.ndarray d: 1d vector of scaled differences 12 | :return np.ndarray: 1d boolean gene-enrichment assignment vector 13 | """ 14 | km = KMeans(n_clusters=2) 15 | km.fit(d[:, np.newaxis]) 16 | assignments = km.labels_.astype(bool) 17 | if np.argmax(km.cluster_centers_) == 0: 18 | return assignments 19 | else: 20 | return ~assignments 21 | 22 | 23 | 
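# --- editor's illustrative sketch (not part of g_test.py) --------------------
# _assign above runs a two-cluster k-means on a 1-d vector of scaled
# differences and returns a boolean vector separating the two modes; the
# values below are hypothetical.
import numpy as np
from seqc.stats.g_test import _assign

d = np.array([0.1, 0.2, 0.15, 5.0, 5.2, 4.9])
groups = _assign(d)   # boolean array; the low and high modes receive opposite labels
# ------------------------------------------------------------------------------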
def g_test(data, labels, log=False): 24 | """ 25 | 26 | :param pd.DataFrame data: 27 | :param labels: 28 | :param log: 29 | :return: 30 | """ 31 | 32 | if log: 33 | data = np.log(data + 1) 34 | 35 | data = pd.DataFrame(data.values / data.values.sum(axis=1)[:, np.newaxis], 36 | index=labels, columns=data.columns) 37 | 38 | # calculate data that are useful for determining observed and expected values 39 | gene_sums = data.sum(axis=0) 40 | grouped = data.groupby(axis=0, level=0) # group only once 41 | category_sizes = grouped.size() 42 | category_fractions = category_sizes / category_sizes.sum() # normalize 43 | 44 | # get observed, expected 45 | expected = pd.DataFrame( 46 | data=np.dot(category_fractions.values[:, np.newaxis], 47 | gene_sums.values[np.newaxis, :]), 48 | index=category_sizes.index, 49 | columns=gene_sums.index) 50 | observed = grouped.sum() 51 | 52 | # scaled ratios are used in both g-test, and partitioning of expressed vs. not 53 | logratio = np.log(observed / expected) 54 | logratio.values[~np.isfinite(logratio.values)] = 0 55 | scaled_diff = observed * logratio 56 | 57 | g = 2 * np.sum(scaled_diff, axis=0) # g-test 58 | 59 | # todo only assign significant values 60 | # todo calculate significance 61 | with closing(Pool()) as pool: 62 | assignments = pool.map(_assign, scaled_diff.values.T) 63 | 64 | assignments = pd.DataFrame( 65 | data=np.vstack(assignments).T, 66 | index=category_sizes.index, 67 | columns=data.columns 68 | ) 69 | 70 | return g, assignments 71 | -------------------------------------------------------------------------------- /src/seqc/stats/graph_diffusion.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from scipy.sparse.linalg import eigs 4 | from numpy.linalg import norm 5 | from scipy.sparse import csr_matrix, find 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | class GraphDiffusion: 10 | def __init__(self, knn=10, normalization='smarkov', epsilon=1, 11 | n_diffusion_components=10): 12 | """ 13 | Run diffusion maps on the data. This implementation is based on the 14 | diffusion geometry library in Matlab: 15 | https://services.math.duke.edu/~mauro/code.html#DiffusionGeom and was implemented 16 | by Pooja Kathail 17 | 18 | :param knn: Number of neighbors for graph construction to determine distances 19 | between cells 20 | :param normalization: method for normalizing the matrix of weights 21 | 'bimarkov' force row and column sums to be 1 22 | 'markov' force row sums to be 1 23 | 'smarkov' symmetric conjugate to markov 24 | 'beltrami' Laplace-Beltrami normalization ala Coifman-Lafon 25 | 'sbeltrami' symmetric conjugate to beltrami 26 | 'FokkerPlanck' Fokker-Planck normalization 27 | 'sFokkerPlanck' symmetric conjugate to Fokker-Planck normalization 28 | :param epsilon: Gaussian standard deviation for converting distances to affinities 29 | :param n_diffusion_components: Number of diffusion components to generate 30 | """ 31 | if normalization not in ['bimarkov', 'smarkov', 'markov', 'sbeltrami', 'beltrami', 32 | 'FokkerPlanck', 'sFokkerPlanck']: 33 | raise ValueError( 34 | 'Unsupported normalization. 
Please refer to the docstring for the ' 35 | 'supported methods') 36 | 37 | self.knn = knn 38 | self.normalization = normalization 39 | self.epsilon = epsilon 40 | self.n_diffusion_components = n_diffusion_components 41 | self.eigenvectors = None 42 | self.eigenvalues = None 43 | self.diffusion_operator = None 44 | self.weights = None 45 | 46 | @staticmethod 47 | def keigs(T, k, P, take_diagonal=0): 48 | """ return k largest magnitude eigenvalues for the matrix T. 49 | :param T: Matrix to find eigen values/vectors of 50 | :param k: number of eigen values/vectors to return 51 | :param P: in the case of symmetric normalizations, 52 | this is the NxN diagonal matrix which relates the nonsymmetric 53 | version to the symmetric form via conjugation 54 | :param take_diagonal: if 1, returns the eigenvalues as a vector rather than as a 55 | diagonal matrix. 56 | """ 57 | D, V = eigs(T, k, tol=1e-4, maxiter=1000) 58 | D = np.real(D) 59 | V = np.real(V) 60 | inds = np.argsort(D)[::-1] 61 | D = D[inds] 62 | V = V[:, inds] 63 | if P is not None: 64 | V = P.dot(V) 65 | 66 | # Normalize 67 | for i in range(V.shape[1]): 68 | V[:, i] = V[:, i] / norm(V[:, i]) 69 | V = np.round(V, 10) 70 | 71 | if take_diagonal == 0: 72 | D = np.diag(D) 73 | 74 | return V, D 75 | 76 | @staticmethod # todo fix; what is S? 77 | def bimarkov(W, max_iters=100, abs_error=0.00001, **kwargs): 78 | """normalization method for GraphDiffusion""" 79 | 80 | if W.size == 0: 81 | return 82 | 83 | # process input 84 | if W.shape[0] != W.shape[1]: 85 | raise ValueError('Bimarkov.py: kernel must be NxN\n') 86 | 87 | N = W.shape[0] 88 | 89 | # initialize 90 | p = np.ones(N) 91 | 92 | # iterative 93 | for i in range(max_iters): 94 | 95 | S = np.ravel(W.sum(axis=1)) 96 | err = np.max(np.absolute(1.0 - np.max(S)), np.absolute(1.0 - np.min(S))) 97 | 98 | if err < abs_error: 99 | break 100 | 101 | D = csr_matrix((np.divide(1, np.sqrt(S)), (range(N), range(N))), shape=[N, N]) 102 | p *= S 103 | W = D.dot(W).dot(D) 104 | 105 | # iron out numerical errors 106 | T = (W + W.T) / 2 107 | return T, p 108 | 109 | @staticmethod 110 | def smarkov(D, N, W): 111 | """normalization method for GraphDiffusion""" 112 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 113 | P = D 114 | T = D.dot(W).dot(D) 115 | T = (T + T.T) / 2 116 | return T, P 117 | 118 | @staticmethod 119 | def markov(D, N, W): 120 | """normalization method for GraphDiffusion""" 121 | T = csr_matrix((D, (range(N), range(N))), shape=[N, N]).dot(W) 122 | return T, None 123 | 124 | @staticmethod 125 | def sbeltrami(D, N, W): 126 | """normalization method for GraphDiffusion""" 127 | P = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 128 | K = P.dot(W).dot(P) 129 | 130 | D = np.ravel(K.sum(axis=1)) 131 | D[D != 0] = 1 / D[D != 0] 132 | 133 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 134 | P = D 135 | T = D.dot(K).dot(D) 136 | 137 | T = (T + T.T) / 2 138 | return T, P 139 | 140 | @staticmethod 141 | def beltrami(D, N, W): 142 | """normalization method for GraphDiffusion""" 143 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 144 | K = D.dot(W).dot(D) 145 | 146 | D = np.ravel(K.sum(axis=1)) 147 | D[D != 0] = 1 / D[D != 0] 148 | 149 | V = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 150 | T = V.dot(K) 151 | return T, None 152 | 153 | @staticmethod 154 | def FokkerPlanck(D, N, W): 155 | """normalization method for GraphDiffusion""" 156 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 157 | K = D.dot(W).dot(D) 158 | 159 | D = 
np.ravel(K.sum(axis=1)) 160 | D[D != 0] = 1 / D[D != 0] 161 | 162 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 163 | T = D.dot(K) 164 | return T, None 165 | 166 | @staticmethod 167 | def sFokkerPlanck(D, N, W): 168 | """normalization method for GraphDiffusion""" 169 | print('(sFokkerPlanck) ... ') 170 | 171 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 172 | K = D.dot(W).dot(D) 173 | 174 | D = np.ravel(K.sum(axis=1)) 175 | D[D != 0] = 1 / D[D != 0] 176 | 177 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 178 | P = D 179 | T = D.dot(K).dot(D) 180 | 181 | T = (T + T.T) / 2 182 | return T, P 183 | 184 | def fit(self, data, verbose=True): 185 | """ 186 | :param data: Data matrix of samples X features 187 | :param verbose: print progress report 188 | 189 | :return: Dictionary containing diffusion operator, weight matrix, 190 | diffusion eigen vectors, and diffusion eigen values 191 | """ 192 | if verbose: 193 | print('Running Diffusion maps with the following parameters:') 194 | print('Normalization: %s' % self.normalization) 195 | print('Number of nearest neighbors k: %d' % self.knn) 196 | print('Epsilon: %.4f' % self.epsilon) 197 | 198 | # Nearest neighbors 199 | start = time.process_time() 200 | N = data.shape[0] 201 | nbrs = NearestNeighbors(n_neighbors=self.knn).fit(data) 202 | distances, indices = nbrs.kneighbors(data) 203 | 204 | # Adjacency matrix 205 | rows = np.zeros(N * self.knn, dtype=np.int32) 206 | cols = np.zeros(N * self.knn, dtype=np.int32) 207 | dists = np.zeros(N * self.knn) 208 | location = 0 209 | for i in range(N): 210 | inds = range(location, location + self.knn) 211 | rows[inds] = indices[i, :] 212 | cols[inds] = i 213 | dists[inds] = distances[i, :] 214 | location += self.knn 215 | W = csr_matrix((dists, (rows, cols)), shape=[N, N]) 216 | 217 | # Symmetrize W 218 | W = W + W.T 219 | 220 | # Convert to affinity (with selfloops) 221 | rows, cols, dists = find(W) 222 | rows = np.append(rows, range(N)) 223 | cols = np.append(cols, range(N)) 224 | dists = np.append(dists / (self.epsilon ** 2), np.zeros(N)) 225 | W = csr_matrix((np.exp(-dists), (rows, cols)), shape=[N, N]) 226 | 227 | # Create D 228 | D = np.ravel(W.sum(axis=1)) 229 | D[D != 0] = 1 / D[D != 0] 230 | 231 | # Go through the various normalizations 232 | fnorm = getattr(self, self.normalization) 233 | T, P = fnorm(D=D, N=N, W=W) 234 | 235 | if self.normalization != 'bimarkov' and verbose: 236 | print('%.2f seconds' % (time.process_time() - start)) 237 | 238 | # Eigen value decomposition 239 | V, D = GraphDiffusion.keigs(T, self.n_diffusion_components, P, take_diagonal=1) 240 | self.eigenvectors = V 241 | self.eigenvalues = D 242 | self.diffusion_operator = T 243 | self.weights = W 244 | return {'operator': T, 'eigval': D, 'eigvec': V, 'weights': W} 245 | -------------------------------------------------------------------------------- /src/seqc/stats/gsea.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import shlex 4 | import glob 5 | import re 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.special import expit 9 | 10 | 11 | class GSEA: 12 | 13 | def __init__(self, correlations, output_stem=None): 14 | """initialize a gsea object 15 | :param pd.Series correlations: correlations in the range of [-1, 1] whose index 16 | contains gene names 17 | :param str output_stem: the filestem for the output data 18 | 19 | :method linear_scale: method to linearly scale a vector to lie 
on the interval 20 | [-1, 1] 21 | :method logisitc_scale: method to scale a vector by the logistic function to lie 22 | on the interval [-1, 1] 23 | :method run: run GSEA on these correlations 24 | """ 25 | if not isinstance(correlations, pd.Series): 26 | raise TypeError('correlations must be a pandas series') 27 | if not ((np.min(correlations) >= -1) & (np.max(correlations) <= 1)): 28 | raise RuntimeError( 29 | 'input correlations were not contained within the interval [-1, 1]. ' 30 | 'Please use JavaGSEA.linear_scale() or JavaGSEA.logistic_scale() to ' 31 | 'scale values to this interval before running.') 32 | self._correlations = correlations.sort_values() 33 | self._rnk = None 34 | if output_stem is None: 35 | self._output_stem = os.environ['TMPDIR'] + 'gsea_corr_{!s}'.format( 36 | np.random.randint(0, 1000000)) 37 | elif not isinstance(output_stem, str): 38 | raise TypeError('output stem must be a str reference to a file prefix') 39 | elif output_stem.find('-') > -1: 40 | raise ValueError('output_stem cannot contain the dash (-) character.') 41 | else: 42 | self._output_stem = output_stem 43 | self._results = {} 44 | 45 | @property 46 | def correlations(self): 47 | return self._correlations 48 | 49 | @correlations.setter 50 | def correlations(self): 51 | raise RuntimeError('Please create a new object to compare different correlations') 52 | 53 | @property 54 | def results(self): 55 | return self._results 56 | 57 | @staticmethod 58 | def linear_scale(data: pd.Series) -> pd.Series: 59 | """scale input vector to interval [-1, 1] using a linear scaling 60 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 61 | """ 62 | data = data.copy() 63 | data -= np.min(data, axis=0) 64 | data /= np.max(data, axis=0) / 2 65 | data -= 1 66 | return data 67 | 68 | @staticmethod 69 | def logistic_scale(data: pd.Series) -> pd.Series: 70 | """scale input vector to interval [-1, 1] using a sigmoid scaling 71 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 72 | """ 73 | return pd.Series((expit(data.values) * 2) - 1, index=data.index) 74 | 75 | def _save_rank_file(self) -> None: 76 | """save the correlations to a .rnk file""" 77 | self._rnk = self._output_stem + '.rnk' 78 | df = pd.DataFrame(self._correlations).fillna(0) 79 | df.to_csv(self._rnk, sep='\t', header=False) 80 | 81 | @staticmethod 82 | def _gmt_options(): 83 | """ 84 | Private method. identifies GMT files available for mouse or human genomes 85 | :return: str, file options 86 | """ 87 | 88 | mouse_options = os.listdir(os.path.expanduser('~/.seqc/tools/mouse')) 89 | human_options = os.listdir(os.path.expanduser('~/.seqc/tools/human')) 90 | print('Available GSEA .gmt files:\n\nmouse:\n{m}\n\nhuman:\n{h}\n'.format( 91 | m='\n'.join(mouse_options), 92 | h='\n'.join(human_options))) 93 | print('Please specify the gmt_file parameter as gmt_file=(organism, filename)') 94 | 95 | def run(self, gmt_file): 96 | """ 97 | Helper function. Run GSEA on an already-ranked list of corrleations. 
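As a quick illustration of the two scaling helpers defined above, both of which map a correlation vector onto [-1, 1] as the constructor requires (toy values; the output_stem path is made up):

    import pandas as pd
    from seqc.stats.gsea import GSEA

    corr = pd.Series([3.0, -1.0, 0.5], index=['GeneA', 'GeneB', 'GeneC'])
    linear = GSEA.linear_scale(corr)        # min maps to -1, max to +1
    logistic = GSEA.logistic_scale(corr)    # sigmoid-squashed, then rescaled
    gsea = GSEA(linear, output_stem='/tmp/example_gsea')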
To see 98 | available files, leave gmt_file parameter empty 99 | 100 | :param (str, str) gmt_file: organism and filename of gmt file to use 101 | :return (pd.DataFrame, pd.DataFrame): positive and negative GSEA enrichments 102 | """ 103 | out_dir, out_prefix = os.path.split(self._output_stem) 104 | os.makedirs(out_dir, exist_ok=True) 105 | 106 | if self._rnk is None: 107 | self._save_rank_file() 108 | 109 | if not gmt_file: 110 | self._gmt_options() 111 | return 112 | else: 113 | if not len(gmt_file) == 2: 114 | raise ValueError('gmt_file should be a tuple of (organism, filename).') 115 | else: 116 | gmt_file = os.path.expanduser('~/.seqc/tools/{}/{}').format(*gmt_file) 117 | 118 | # Construct the GSEA call 119 | cmd = shlex.split( 120 | 'java -cp {user}/.seqc/tools/gsea2-2.2.1.jar -Xmx1g ' 121 | 'xtools.gsea.GseaPreranked -collapse false -mode Max_probe -norm meandiv ' 122 | '-nperm 1000 -include_only_symbols true -make_sets true -plot_top_x 0 ' 123 | '-set_max 500 -set_min 50 -zip_report false -gui false -rnk {rnk} ' 124 | '-rpt_label {out_prefix} -out {out_dir}/ -gmx {gmt_file}' 125 | ''.format(user=os.path.expanduser('~'), rnk=self._rnk, out_prefix=out_prefix, 126 | out_dir=out_dir, gmt_file=gmt_file)) 127 | 128 | # Call GSEA 129 | p = subprocess.Popen(cmd, stderr=subprocess.PIPE) 130 | _, err = p.communicate() 131 | 132 | # find the file that GSEA created 133 | if err: 134 | print(err.decode()) 135 | return 136 | else: 137 | pattern = '{p}.GseaPreranked.[0-9]*'.format(p=out_prefix) 138 | files = os.listdir(out_dir) 139 | folder = None 140 | for f in files: 141 | mo = re.match(pattern, f) 142 | if mo: 143 | folder = out_dir + '/' + mo.group(0) 144 | if folder is None: 145 | raise RuntimeError( 146 | 'seqc.JavaGSEA was not able to recover the output of the Java ' 147 | 'executable. This likely represents a bug.') 148 | 149 | # recover information from run 150 | names = ['size', 'es', 'nes', 'p', 'fdr_q', 'fwer_p', 'rank_at_max', 151 | 'leading_edge'] 152 | pos = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*pos*xls')[0], 153 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 154 | pos.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 155 | neg = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*neg*xls')[0], 156 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 157 | neg.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 158 | pos.columns, neg.columns = names, names 159 | self._results[gmt_file] = {'positive': pos, 'negative': neg} 160 | return list(self._results[gmt_file].values()) 161 | -------------------------------------------------------------------------------- /src/seqc/stats/mast.py: -------------------------------------------------------------------------------- 1 | import math 2 | import subprocess 3 | import imp 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def run_mast(counts_filtered, clustering_communities, output_prefix): 10 | # Differentially Expression Analysis using MAST 11 | log_counts = (counts_filtered + 1.0).applymap(math.log2) 12 | de_results = [] # array containing the differentially expression analysis for each cluster 13 | for c in range(np.max(clustering_communities) + 1): 14 | tmp_input_file = output_prefix + "_cluster_" + str(c) + "_mast_input.csv" 15 | tmp_output_file = output_prefix + "_cluster_" + str(c) + "_mast_results.csv" 16 | reduced_tdf1 = log_counts.iloc[np.where(clustering_communities == c)[0]] 17 | reduced_tdf2 = log_counts.iloc[np.where(clustering_communities != c)[0]] 18 | reduced_df = pd.concat([reduced_tdf1, reduced_tdf2]) 19 | reduced_df.index = pd.Index([1 if i < len(reduced_tdf1.index) else 0 for i in range(len(reduced_tdf1.index) + len(reduced_tdf2.index))]) 20 | reduced_df.to_csv(tmp_input_file) 21 | 22 | path_to_run_mast = imp.find_module('seqc')[1] 23 | args = 'Rscript {p} {i} {o}'.format(p=os.path.join(path_to_run_mast, 'run_mast.R'), i=tmp_input_file, o=tmp_output_file) 24 | with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as p: 25 | out, err = p.communicate() 26 | if os.path.isfile(tmp_output_file): 27 | de_gene_df = pd.read_csv(tmp_output_file) 28 | if len(de_gene_df.index) > 0: 29 | de_results.append(de_gene_df) 30 | else: # if no differentially expressed genes 31 | de_results.append(None) 32 | else: 33 | de_results.append(None) 34 | 35 | de_gene_list_file = output_prefix + "_de_gene_list.txt" 36 | with open(de_gene_list_file, "w") as f: 37 | f.write("Differential Expression Analysis Using MAST\n\n") 38 | c = 1 39 | for de_result in de_results: 40 | if de_result is not None: 41 | f.write("Differentially expressed genes for cluster %d:\n" % (c)) 42 | f.write("%-10s %-10s %-10s %-10s\n" % ("Gene", "p", "p.fdr", "logFC")) 43 | 44 | for i in range(len(de_result)): 45 | p_v = "%.2e" % de_result.loc[i][1] 46 | p_fdr = "%.2e" % de_result.loc[i][2] 47 | logFC = "%.2f" % de_result.loc[i][3] 48 | f.write("%-10s %-10s %-10s %-10s\n" % (de_result.loc[i][0], p_v, p_fdr, logFC)) 49 | else: 50 | f.write("No differentially expressed genes has been found for cluster %d.\n" % (c)) 51 | c += 1 52 | f.write("\n") 53 | f.close() 54 | return de_gene_list_file -------------------------------------------------------------------------------- /src/seqc/stats/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class PCA: 6 | 7 | def __init__(self, n_components=30): 8 | """ 9 | construct a model for Principle Component Analysis 10 | 11 | :param n_components: number of principle components to retain 12 | 13 | :property eigenvalues: stores the eigenvalues computed by fit() 14 | :property loadings: stores the eigenvectors of the pca decomposition computed by 15 | fit() 16 | :method fit: fit the model to the data 17 | :method transform: project the data onto a 
subset of the principle components 18 | (default: all components other than the first) 19 | :method fit_transform: fit and transform the data, returning the projected result 20 | """ 21 | self.n_components = n_components 22 | self.loadings = None 23 | self.eigenvalues = None 24 | 25 | def fit(self, data: np.ndarray or pd.DataFrame, fillna=0): 26 | """ 27 | Fit the model to data 28 | 29 | :param data: n observation x k feature data array 30 | :param fillna: fill np.NaN values with this value. If None, will not fill. 31 | :return: None 32 | """ 33 | 34 | if isinstance(data, pd.DataFrame): 35 | X = data.values 36 | elif isinstance(data, np.ndarray): 37 | X = data 38 | else: 39 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 40 | 41 | if fillna is not None: 42 | X[np.where(np.isnan(X))] = fillna 43 | X[np.where(np.isinf(X))] = fillna 44 | 45 | # Compute covariance matrix 46 | if X.shape[1] < X.shape[0]: 47 | C = np.cov(X, rowvar=False) 48 | # if N > D, we better use this matrix for the eigendecomposition 49 | else: 50 | C = np.multiply((1 / X.shape[0]), np.dot(X, X.T)) 51 | 52 | # Perform eigendecomposition of C 53 | C[np.where(np.isnan(C))] = 0 54 | C[np.where(np.isinf(C))] = 0 55 | l, M = np.linalg.eig(C) 56 | 57 | # Sort eigenvectors in descending order 58 | ind = np.argsort(l)[::-1] 59 | l = l[ind] 60 | if self.n_components < 1: 61 | self.n_components = ( 62 | np.where(np.cumsum(np.divide(l, np.sum(l)), axis=0) >= 63 | self.n_components)[0][0] + 1) 64 | print('Embedding into ' + str(self.n_components) + ' dimensions.') 65 | elif self.n_components > M.shape[1]: 66 | self.n_components = M.shape[1] 67 | print('Target dimensionality reduced to ' + str(self.n_components) + '.') 68 | 69 | M = M[:, ind[:self.n_components]] 70 | l = l[:self.n_components] 71 | 72 | # Apply mapping on the data 73 | if X.shape[1] >= X.shape[0]: 74 | M = np.multiply(np.dot(X.T, M), (1 / np.sqrt(X.shape[0] * l)).T) 75 | 76 | self.loadings = M 77 | self.eigenvalues = l 78 | 79 | def transform(self, data, components=None) -> np.ndarray or pd.DataFrame: 80 | """ 81 | Transform data using the fit PCA model. 
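A minimal usage sketch on random data, assuming the class is importable as seqc.stats.pca.PCA (as tsne.py does below); note that transform() drops the first component unless components are given explicitly:

    import numpy as np
    import pandas as pd
    from seqc.stats.pca import PCA

    np.random.seed(0)
    data = pd.DataFrame(np.random.rand(100, 20))      # 100 cells x 20 features
    pca = PCA(n_components=5)
    projected = pca.fit_transform(data)               # components 1..4
    all_components = pca.transform(data, components=np.arange(5))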
82 | 83 | :param data: n observation x k feature data array 84 | :param components: components to retain when transforming 85 | data, if None, uses all components except for the first 86 | :return: np.ndarray containing transformed data 87 | """ 88 | 89 | if components is None: 90 | components = np.arange(1, self.n_components) 91 | 92 | projected = np.dot(data, self.loadings[:, components]) 93 | if isinstance(data, pd.DataFrame): 94 | return pd.DataFrame(projected, index=data.index, columns=components) 95 | else: 96 | return projected 97 | 98 | def fit_transform(self, data: np.ndarray or pd.DataFrame, n_components=None) -> \ 99 | np.ndarray or pd.DataFrame: 100 | """ 101 | Fit the model to data and transform the data using the fit model 102 | 103 | :param data: n observation x k feature data array 104 | :param n_components: number of components to retain when transforming 105 | data 106 | :return np.ndarray or pd.DataFrame: transformed data 107 | """ 108 | 109 | self.fit(data) 110 | return self.transform(data, components=n_components) 111 | -------------------------------------------------------------------------------- /src/seqc/stats/resampled_nonparametric.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | from multiprocessing import Pool 4 | from contextlib import closing 5 | from itertools import repeat 6 | import numpy as np 7 | import numpy.ma as ma 8 | import pandas as pd 9 | from scipy.stats.mstats import count_tied_groups, rankdata 10 | from scipy.stats.mstats import kruskalwallis as _kruskalwallis 11 | from scipy.special import erfc 12 | from statsmodels.sandbox.stats.multicomp import multipletests 13 | 14 | 15 | def get_memory(): 16 | """ 17 | """ 18 | return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 ** 3) 19 | 20 | 21 | def _mannwhitneyu(x, y, use_continuity=True): 22 | """ 23 | Computes the Mann-Whitney statistic 24 | Missing values in `x` and/or `y` are discarded. 25 | Parameters 26 | ---------- 27 | x : ndarray, 28 | Input, vector or observations x features matrix 29 | y : ndarray, 30 | Input, vector or observations x features matrix. If matrix, must have 31 | same number of features as x 32 | use_continuity : {True, False}, optional 33 | Whether a continuity correction (1/2.) should be taken into account. 34 | Returns 35 | ------- 36 | statistic : float 37 | The Mann-Whitney statistic 38 | approx z : float 39 | The normal-approximated z-score for U. 40 | pvalue : float 41 | Approximate p-value assuming a normal distribution. 42 | """ 43 | if x.ndim == 1 and y.ndim == 1: 44 | x, y = x[:, np.newaxis], y[:, np.newaxis] 45 | ranks = rankdata(np.concatenate([x, y]), axis=0) 46 | nx, ny = x.shape[0], y.shape[0] 47 | nt = nx + ny 48 | U = ranks[:nx].sum(0) - nx * (nx + 1) / 2. 49 | 50 | mu = (nx * ny) / 2. 51 | u = np.amin([U, nx*ny - U], axis=0) # get smaller U by convention 52 | 53 | sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12. 54 | 55 | for i in np.arange(len(sigsq)): 56 | ties = count_tied_groups(ranks[:, i]) 57 | sigsq[i] -= np.sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12. 58 | sigsq *= nx * ny / float(nt * (nt - 1)) 59 | 60 | if use_continuity: 61 | z = (U - 1 / 2. 
- mu) / np.sqrt(sigsq) 62 | else: 63 | z = (U - mu) / np.sqrt(sigsq) 64 | 65 | prob = erfc(abs(z) / np.sqrt(2)) 66 | return np.vstack([u, z, prob]).T 67 | 68 | 69 | def find_sampling_value(group_data, percentile): 70 | """ 71 | 72 | :param group_data: 73 | :param int percentile: 74 | :return: 75 | """ 76 | return min(np.percentile(g.sum(axis=1), percentile) for g in group_data) 77 | 78 | 79 | def normalize(data, downsample_value, upsample=False, labels=None): 80 | """ 81 | :param data: 82 | :param downsample_value: value to normalize cell counts to. In current implementation, 83 | a small number of cells (10%) are upsampled to this value. 84 | :param upsample: if False, all observations with size < downsample_value are excluded. 85 | if True, those cells are upsampled to downsample_value. 86 | :return: 87 | """ 88 | obs_size = data.sum(axis=1) 89 | if not upsample: 90 | keep = obs_size >= downsample_value 91 | data = data[keep, :] 92 | if labels is not None: 93 | labels = labels[keep] 94 | norm = (data * downsample_value) / data.sum(axis=1)[:, np.newaxis] 95 | if labels is not None: 96 | return norm, labels 97 | else: 98 | return norm 99 | 100 | 101 | def _draw_sample(normalized_data, n): 102 | """ 103 | :param normalized_data: 104 | :param n: 105 | """ 106 | np.random.seed() 107 | idx = np.random.randint(0, normalized_data.shape[0], n) 108 | sample = normalized_data[idx, :] 109 | p = np.random.sample(sample.shape) # round samples probabilistically 110 | 111 | return np.floor(sample) + (sample % 1 > p).astype(int) 112 | 113 | 114 | def _mw_sampling_function(norm_data, n_cell): 115 | """ 116 | :param norm_data: 117 | :param n_cell: 118 | :return: 119 | """ 120 | a, b = (_draw_sample(d, n_cell) for d in norm_data) 121 | return _mannwhitneyu(a, b) # dim = (n_genes, 3) 122 | 123 | 124 | def confidence_interval(z): 125 | """ 126 | 127 | :param z: 128 | :return: 129 | """ 130 | return np.percentile(z, [2.5, 97.5], axis=0).T 131 | 132 | 133 | def mannwhitneyu( 134 | x, y, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 135 | upsample=False): 136 | """ 137 | :param x: observations by features array or DataFrame (ndim must be 2, although there 138 | needn't be more than one feature) 139 | :param y: observations by features array or DataFrama. Features must be the same as x 140 | :param n_iter: number of times to sample x and y 141 | :param sampling_percentile: percentile to downsample to. observations with row sums 142 | lower than this value will be excluded 143 | :param alpha: significance threshold for FDR correction 144 | :param verbose: if True, report number of cells sampled in each iteration and the 145 | integer value to which cells are downsampled 146 | :param upsample: if False, cells with size lower than sampling_percentile are 147 | discarded. If True, those cells are upsampled. 148 | :return pd.DataFrame: DataFrame with columns: 149 | U: median u-statistic over the n_iter iterations of the test 150 | z_approx: median approximate tie-corrected z-score for the mann-whitney U-test 151 | z_lo: lower bound, 95% confidence interval over z 152 | z_hi: upper bound, 95% confidence interval over z 153 | p: p-value for z_approx 154 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 
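A small sketch of calling this resampled test on synthetic count matrices (a low n_iter keeps it fast; kruskalwallis below follows the same pattern but takes one matrix plus a label vector). The function uses the older pandas .ix indexer, so it assumes a correspondingly old pandas version:

    import numpy as np
    from seqc.stats.resampled_nonparametric import mannwhitneyu

    np.random.seed(0)
    a = np.random.poisson(5, size=(200, 50))    # 200 cells x 50 genes, condition A
    b = np.random.poisson(6, size=(300, 50))    # 300 cells x 50 genes, condition B
    results = mannwhitneyu(a, b, n_iter=10, verbose=True)
    print(results.head())                       # columns: U, z_approx, z_lo, z_hi, p, q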
155 | """ 156 | 157 | # do some sanity checks on input data 158 | if isinstance(x, pd.DataFrame) and isinstance(y, pd.DataFrame): 159 | assert np.array_equal(x.columns, y.columns) 160 | labels = x.columns 161 | x = x.values 162 | y = y.values 163 | elif x.ndim > 1: 164 | assert x.shape[1] == y.shape[1] 165 | labels = None 166 | else: 167 | labels = None 168 | 169 | # calculate sampling values 170 | v = find_sampling_value([x, y], sampling_percentile) 171 | norm_data = [normalize(d, v, upsample) for d in [x, y]] 172 | n_cell = min(d.shape[0] for d in norm_data) 173 | sampling_function = partial(_mw_sampling_function, n_cell=n_cell) 174 | 175 | if verbose: # report sampling values 176 | print('sampling %d cells (with replacement) per iteration' % n_cell) 177 | print('sampling %d molecules per cell' % v) 178 | 179 | with closing(Pool()) as pool: 180 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 181 | 182 | results = np.stack(results) # u, z, p 183 | 184 | ci = confidence_interval(results[:, :, 1]) 185 | results = pd.DataFrame( 186 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 187 | index=labels, 188 | columns=['U', 'z_approx', 'p', 'z_lo', 'z_hi']) 189 | 190 | # add multiple-testing correction 191 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 192 | 193 | # remove low-value genes whose median sampling value is -inf 194 | neginf = np.isneginf(results['z_approx']) 195 | results.ix[neginf, 'z_lo'] = np.nan 196 | results.ix[neginf, 'z_approx'] = 0 197 | results.ix[neginf, ['p', 'q']] = 1. 198 | 199 | results = results[['U', 'z_approx', 'z_lo', 'z_hi', 'p', 'q']].sort_values('q') 200 | results.iloc[:, 1:4] = np.round(results.iloc[:, 1:4], 2) 201 | 202 | return results 203 | 204 | 205 | def _kw_sampling_function(data, splits, n_cell): 206 | data = [_draw_sample(d, n_cell) for d in np.split(data, splits)] 207 | return _kruskal(data) 208 | 209 | 210 | def _kruskal(data): 211 | """ 212 | Compute the Kruskal-Wallis H-test for independent samples 213 | Parameters 214 | ---------- 215 | sample1, sample2, ... : array_like 216 | Two or more arrays with the sample measurements can be given as 217 | arguments. 218 | Returns 219 | ------- 220 | statistic : float 221 | The Kruskal-Wallis H statistic, corrected for ties 222 | pvalue : float 223 | The p-value for the test using the assumption that H has a chi 224 | square distribution 225 | Notes 226 | ----- 227 | For more details on `kruskal`, see `stats.kruskal`. 228 | """ 229 | results = [] 230 | for i in np.arange(data[0].shape[1]): 231 | args = [d[:, i] for d in data] 232 | try: 233 | results.append(_kruskalwallis(*args)) 234 | except ValueError: 235 | results.append([0, 1.]) 236 | return results 237 | 238 | 239 | def category_to_numeric(labels): 240 | """transform categorical labels to a numeric array""" 241 | labels = np.array(labels) 242 | if np.issubdtype(labels.dtype, np.integer): 243 | return labels 244 | else: 245 | cats = np.unique(labels) 246 | map_ = dict(zip(cats, np.arange(cats.shape[0]))) 247 | return np.array([map_[i] for i in labels]) 248 | 249 | 250 | def kruskalwallis( 251 | data, labels, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 252 | upsample=False): 253 | """ 254 | :param data: np.ndarray or pd.DataFrame of observations x features 255 | :param labels: observation labels for categories to be compared 256 | :param n_iter: number of times to sample x and y 257 | :param sampling_percentile: percentile to downsample to. 
observations with row sums 258 | lower than this value will be excluded 259 | :param alpha: significance threshold for FDR correction 260 | :param verbose: if True, report number of cells sampled in each iteration and the 261 | integer value to which cells are downsampled 262 | :param upsample: if False, cells with size lower than sampling_percentile are 263 | discarded. If True, those cells are upsampled. 264 | :return pd.DataFrame: DataFrame with columns: 265 | H: median u-statistic over the n_iter iterations of the test 266 | z_approx: median approximate tie-corrected z-score for the mann-whitney U-test 267 | z_lo: lower bound, 95% confidence interval over z 268 | z_hi: upper bound, 95% confidence interval over z 269 | p: p-value for z_approx 270 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 271 | """ 272 | 273 | if isinstance(data, pd.DataFrame): 274 | features = data.columns 275 | data = data.values 276 | elif isinstance(data, np.ndarray): 277 | features = None 278 | else: 279 | raise ValueError('data must be a np.ndarray or pd.DataFrame, not %s' % 280 | repr(type(data))) 281 | 282 | # if labels are not numeric, transform to numeric categories 283 | labels = category_to_numeric(labels) 284 | if not labels.shape[0] == data.shape[0]: 285 | raise ValueError('labels (shape=%s) must match dimension 0 of data (shape=%s)' % 286 | (repr(labels.shape), repr(labels.data))) 287 | 288 | idx = np.argsort(labels) 289 | data = data[idx, :] # will copy 290 | labels = labels[idx] 291 | 292 | splits = np.where(np.diff(labels))[0] + 1 293 | 294 | # calculate sampling values and downsample data 295 | v = find_sampling_value(np.split(data, splits), sampling_percentile) 296 | norm_data, labels = normalize(data, v, upsample, labels) 297 | 298 | splits = np.where(np.diff(labels))[0] + 1 # rediff, norm_data causes loss 299 | 300 | n_cell = min(d.shape[0] for d in np.split(norm_data, splits)) 301 | sampling_function = partial(_kw_sampling_function, n_cell=n_cell, splits=splits) 302 | 303 | if verbose: # report sampling values 304 | print('sampling %d cells (with replacement) per iteration' % n_cell) 305 | print('sampling %d molecules per cell' % v) 306 | 307 | with closing(Pool()) as pool: 308 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 309 | 310 | results = np.stack(results) # H, p 311 | 312 | ci = confidence_interval(results[:, :, 0]) # around H 313 | results = pd.DataFrame( 314 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 315 | index=features, 316 | columns=['H', 'p', 'H_lo', 'H_hi']) 317 | 318 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 319 | results = results[['H', 'H_lo', 'H_hi', 'p', 'q']] 320 | return results 321 | 322 | -------------------------------------------------------------------------------- /src/seqc/stats/smoothing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import multiprocessing 4 | from sklearn.neighbors import NearestNeighbors 5 | 6 | 7 | class smoothing: 8 | """Data smoothing kernels 9 | 10 | :method kneighbors: transforms each observation (row) of data by setting it 11 | equal to the average of its k-nearest neighbors 12 | """ 13 | 14 | @staticmethod 15 | def kneighbors(data: np.array or pd.DataFrame, n_neighbors=50, pca=None, **kwargs): 16 | """ 17 | Smooth gene expression values by setting the expression of each gene in each 18 | cell equal to the mean value of itself and its 
n_neighbors 19 | 20 | :param data: np.ndarray | pd.DataFrame; genes x cells array 21 | :param n_neighbors: int; number of neighbors to smooth over 22 | :param pca: dimensionality reduced matrix, knn will be run on this and applied 23 | to data (runs much faster) 24 | :param kwargs: keyword arguments to pass sklearn.NearestNeighbors 25 | :return: np.ndarray | pd.DataFrame; same as input 26 | """ 27 | 28 | if isinstance(data, pd.DataFrame): 29 | data_ = data.values 30 | elif isinstance(data, np.ndarray): 31 | data_ = data 32 | else: 33 | raise TypeError("data must be a pd.DataFrame or np.ndarray") 34 | 35 | knn = NearestNeighbors( 36 | n_neighbors=n_neighbors, 37 | n_jobs=multiprocessing.cpu_count() - 1, 38 | **kwargs) 39 | 40 | if pca is not None: 41 | knn.fit(pca) 42 | inds = knn.kneighbors(pca, return_distance=False) 43 | else: 44 | knn.fit(data_) 45 | inds = knn.kneighbors(data_, return_distance=False) 46 | 47 | # smoothing creates large intermediates; break up to avoid memory errors 48 | pieces = [] 49 | num_partitions = np.round(data_.shape[0] / 2000) + 1 50 | if num_partitions > 2: # 2 partitions produces start + end, need a third to split 51 | sep = np.linspace(0, data_.shape[0] + 1, num_partitions, dtype=int) 52 | for start, end in zip(sep, sep[1:]): 53 | pieces.append(data_[inds[start:end, :], :].mean(axis=1)) 54 | res = np.vstack(pieces) 55 | else: 56 | res = data_[inds, :].mean(axis=1) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | res = pd.DataFrame(res, index=data.index, columns=data.columns) 60 | 61 | return res 62 | -------------------------------------------------------------------------------- /src/seqc/stats/tree.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Tree: 4 | 5 | def __init__(self, id, left=None, right=None, dist=None): 6 | self.id = id 7 | self.left = left 8 | self.right = right 9 | self.dist = dist 10 | 11 | def __repr__(self): 12 | return '' % ( 13 | self.id, 14 | self.left.id if self.left is not None else None, 15 | self.right.id if self.left is not None else None, 16 | self.dist if self.dist is not None else None) 17 | 18 | @classmethod 19 | def from_linkage(cls, Z): 20 | current_id = Z.shape[0] * 2 21 | tree = {} 22 | for (left, right, dist, n_children) in Z[::-1]: 23 | tree[left] = Tree(id=left) 24 | tree[right] = Tree(id=right) 25 | if current_id not in tree: 26 | tree[current_id] = Tree(id=current_id, left=tree[left], right=tree[right], dist=dist) 27 | else: 28 | tree[current_id].left = tree[left] 29 | tree[current_id].right = tree[right] 30 | tree[current_id].dist = dist 31 | current_id -= 1 32 | return tree[max(tree.keys())] 33 | 34 | def is_leaf(self): 35 | return True if self.left is None and self.right is None else False 36 | 37 | @staticmethod 38 | def nodes2labels(nodes): 39 | return [n.id for n in nodes] 40 | 41 | def get_daughter(self, id_): 42 | for daughter in self.dfs(): 43 | if daughter.id == id_: 44 | return daughter 45 | return None 46 | 47 | def has_daughter(self, id_): 48 | for daughter in self.dfs(): 49 | if daughter.id == id_: 50 | return True 51 | return False 52 | 53 | def dfs(self): 54 | visited, stack = [], [self] 55 | while stack: 56 | vertex = stack.pop() 57 | yield vertex 58 | if vertex not in visited: 59 | visited.append(vertex) 60 | if vertex.left is not None: 61 | stack.append(vertex.left) 62 | if vertex.right is not None: 63 | stack.append(vertex.right) 64 | 65 | def bfs(self): 66 | visited, queue = [], [self] 67 | while queue: 68 | vertex = queue.pop(0) 69 | yield 
vertex 70 | if vertex not in visited: 71 | visited.append(vertex) 72 | if vertex.left is not None: 73 | queue.append(vertex.left) 74 | if vertex.right is not None: 75 | queue.append(vertex.right) 76 | -------------------------------------------------------------------------------- /src/seqc/stats/tsne.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import bhtsne 4 | from seqc.stats.pca import PCA 5 | 6 | class TSNE: 7 | 8 | def __init__(self, n_components: int=2, run_pca: bool=False, 9 | n_pca_components: int=20, fillna: float=None, **kwargs): 10 | """ 11 | t-stochastic neighbor embedding 12 | 13 | 14 | :param normalize: if True, scales features to unit size 15 | :param run_pca: if True, runs PCA on the input data and runs tSNE on the 16 | components retained by PCA. 17 | :param n_components: number of tSNE components to return 18 | :param n_pca_components: number of components to which data should be projected, 19 | if run_pca is True 20 | :param fillna: fills np.nan values with this float value 21 | :param kwargs: additional keyword arguments to pass tsne 22 | 23 | :method fit_transform: fits the tSNE model to data and returns the transformed 24 | result 25 | 26 | """ 27 | 28 | self.run_pca = run_pca 29 | self.n_components = n_components 30 | self.n_pca_components = n_pca_components 31 | self.kwargs = kwargs 32 | self.tsne = None 33 | self.pca = None 34 | self.fillna = fillna 35 | 36 | def fit_transform(self, data: np.ndarray or pd.DataFrame) -> None: 37 | """ 38 | fit the tSNE model to data given the parameters provided during 39 | initialization and transform the output 40 | 41 | :param data: n observation x k feature data array 42 | :return np.ndarray or pd.DataFrame: tsne results 43 | """ 44 | if isinstance(data, pd.DataFrame): 45 | data_ = data.values 46 | else: 47 | data_ = data 48 | 49 | if self.fillna is not None: 50 | data_[np.where(np.isnan(data_))] = self.fillna 51 | data_[np.where(np.isinf(data_))] = self.fillna 52 | if self.run_pca: 53 | self.pca = PCA(n_components=self.n_pca_components) 54 | data_ = self.pca.fit_transform(data_) 55 | 56 | res = bhtsne.tsne(data_.astype(float), dimensions=self.n_components, **self.kwargs) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | self.tsne = pd.DataFrame(res, index=data.index) 60 | else: 61 | self.tsne = res 62 | return self.tsne 63 | -------------------------------------------------------------------------------- /src/seqc/stats/ttest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections.abc import Callable 3 | from multiprocessing import Pool, cpu_count 4 | from functools import partial 5 | from contextlib import closing 6 | from scipy.stats import t 7 | import pandas as pd 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | 11 | def estimate_multinomial(x): 12 | """estimate empirical multinomial expectation for a set of cells with each cell 13 | normalized to contribute equally to the expectation. 
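For example, with two toy cells of very different sizes the small cell still contributes equally:

    import numpy as np

    x = np.array([[10.,  0., 10.],
                  [ 1.,  1.,  2.]])
    p = (x / x.sum(axis=1)[:, np.newaxis]).mean(axis=0)
    # p == array([0.375, 0.125, 0.5])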
14 | 15 | :param np.ndarray x: cell x gene array containing expression data 16 | :return np.ndarray: multinomial expectation over genes of x 17 | """ 18 | return (x / x.sum(axis=1)[:, np.newaxis]).mean(axis=0) 19 | 20 | 21 | def assert_input_non_negative(*args): 22 | """ 23 | :param [np.ndarray] args: input numpy arrays 24 | :return None: 25 | """ 26 | if any(np.any(np.less(a, 0)) for a in args): 27 | raise ValueError('input data must be non-negative') 28 | 29 | 30 | def _sampling_function(n_iter, n_molecules, theta, n_cells): 31 | """ 32 | 33 | :param n_iter: 34 | :param n_molecules: 35 | :param theta: 36 | :param n_cells: 37 | :return: 38 | """ 39 | 40 | def online_mean_var(nb, mu_b, var_b, na, mu_a, var_a): 41 | nx = na + nb 42 | delta = mu_b - mu_a 43 | mu_x_ = mu_a + delta * nb / nx 44 | var_x_ = (na * (var_a + mu_a ** 2) + nb * (var_b + mu_b ** 2)) / nx - mu_x_ ** 2 45 | return nx, mu_x_, var_x_ 46 | 47 | res_mu = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 48 | res_var = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 49 | n_cells //= 10 50 | for i in np.arange(n_iter): 51 | # break sampling (n_cells) into 10 pieces 52 | obs = np.random.multinomial(n_molecules, theta, n_cells) 53 | mu_x = np.mean(obs, axis=0) 54 | var_x = np.mean(obs, axis=0) 55 | n_x = obs.shape[0] 56 | for _ in np.arange(9): 57 | obs = np.random.multinomial(n_molecules, theta, n_cells) 58 | mu = np.mean(obs, axis=0) 59 | var = np.mean(obs, axis=0) 60 | n = obs.shape[0] 61 | n_x, mu_x, var_x = online_mean_var(n, mu, var, n_x, mu_x, var_x) 62 | res_mu[i, :] = mu_x 63 | res_var[i, :] = var_x / n_x 64 | return res_mu, res_var 65 | 66 | 67 | def sample_moments(mult_probability, n_samples, n_cells, n_molecules): 68 | """sample mean and variance of n_cells, each containing n_molecules. n_samples mean/ 69 | variance pairs are sampled on each call. 70 | 71 | :param mult_probability: 72 | :param n_samples: 73 | :param n_cells: 74 | :param n_molecules: 75 | :return: 76 | """ 77 | 78 | # parition iterations among available compute cores 79 | ncpu = cpu_count() 80 | if n_samples > ncpu: 81 | samples_per_process = np.array([n_samples // ncpu] * ncpu) 82 | samples_per_process[:n_samples % ncpu] += 1 83 | else: 84 | samples_per_process = np.ones((n_samples,)) 85 | 86 | # map iterations across compute cores 87 | sampler = partial( 88 | _sampling_function, n_molecules=n_molecules, theta=mult_probability, 89 | n_cells=n_cells) 90 | with closing(Pool(ncpu)) as pool: 91 | results = pool.map(sampler, samples_per_process) 92 | mu, var = (np.vstack(mats) for mats in zip(*results)) 93 | 94 | # all means should be finite 95 | assert np.sum(np.isnan(mu)) == 0 96 | 97 | # in cases where variance is np.nan, we can safely set the variance to zero since the 98 | # mean for that tissue will also be zero; this will eliminate singularities caused by 99 | # one tissue never expressing a protein. 
100 | var[np.isnan(var)] = 0 101 | 102 | return mu, var 103 | 104 | 105 | def whelch_satterthwaite_df(a_var, b_var, a_n, b_n): 106 | t1 = a_var.mean(axis=0) 107 | t2 = b_var.mean(axis=0) 108 | numerator = (t1 / a_n + t2 / b_n) ** 2 109 | denominator = t1 ** 2 / (a_n ** 2 * (a_n - 1)) + t2 ** 2 / (b_n ** 2 * (b_n - 1)) 110 | df = numerator / denominator 111 | return df 112 | 113 | 114 | def whelchs_t(a_mu, a_var, b_mu, b_var, a_n, b_n): 115 | """ 116 | 117 | :param np.ndarray a_mu: 118 | :param np.ndarray a_var: 119 | :param np.ndarray b_mu: 120 | :param np.ndarray b_var: 121 | :param int a_n: 122 | :param int b_n: 123 | :return float, float: statistic and p-value 124 | """ 125 | df = whelch_satterthwaite_df(a_var, b_var, a_n, b_n) 126 | numerator = a_mu - b_mu # (samples, genes) 127 | denominator = np.sqrt(a_var + b_var) # (samples, genes) 128 | statistic = numerator / denominator # (samples, genes) 129 | 130 | # statistic has NaNs where there are no observations of a or b (DivideByZeroError) 131 | statistic[np.isnan(statistic)] = 0 132 | median_statistic = np.median(np.abs(statistic), axis=0) 133 | p = (1 - t.cdf(median_statistic, df)) * 2 # p-value 134 | ci_95 = np.percentile(np.abs(statistic), [2.5, 97.5], axis=0).T 135 | 136 | return median_statistic, p, ci_95 137 | 138 | 139 | def bootstrap_t(a, b, n_samples=100, n_cells=None, alpha=0.05, 140 | downsample_value_function=np.median, labels=None): 141 | """ 142 | 143 | :param np.ndarray a: 144 | :param np.ndarray b: 145 | :param int n_samples: 146 | :param int n_cells: 147 | :param float alpha: acceptable type-I error (default = 0.05) 148 | :param Callable downsample_value_function: function that identifies the number of 149 | molecules n to sample from a and b. the sampling number will be the minimum of the 150 | result across a and b. default = np.median. Other values include np.mean and np.max. 151 | :param labels: feature labels for columns of a & b 152 | :return (int, int) statistic, q_val: 153 | """ 154 | assert_input_non_negative(a, b) 155 | mult_a = estimate_multinomial(a) 156 | mult_b = estimate_multinomial(b) 157 | 158 | # get number of molecules to sample 159 | a_sizes = a.sum(axis=1) 160 | b_sizes = b.sum(axis=1) 161 | n_molecules = min( 162 | map(lambda x: downsample_value_function(x).astype(int), [a_sizes, b_sizes])) 163 | 164 | # set n_cells to the smaller of the two passed samples (e.g. if comparing two sets, 165 | # one with 130 cells, and one with 1902 cells, n_cells = 130). 
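A minimal end-to-end sketch of bootstrap_t on synthetic counts (default n_samples; scipy and statsmodels are assumed to be installed, and the optional gene labels are omitted):

    import numpy as np
    from seqc.stats.ttest import bootstrap_t

    np.random.seed(0)
    a = np.random.poisson(5, size=(130, 40))    # 130 cells x 40 genes
    b = np.random.poisson(6, size=(190, 40))    # 190 cells x 40 genes
    res = bootstrap_t(a, b)
    print(res.sort_values('q').head())          # columns: t, t_ci95_low, t_ci95_high, p, q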
166 | if n_cells is None: 167 | n_cells = min(a.shape[0], b.shape[0]) 168 | 169 | a_mu, a_var = sample_moments(mult_a, n_samples, n_cells, n_molecules) 170 | b_mu, b_var = sample_moments(mult_b, n_samples, n_cells, n_molecules) 171 | 172 | statistic, p, ci_95 = whelchs_t(a_mu, a_var, b_mu, b_var, a.shape[0], b.shape[0]) 173 | 174 | q = multipletests(p, alpha=alpha, method='fdr_tsbh')[1] 175 | 176 | results = pd.DataFrame( 177 | data=np.vstack([statistic, ci_95.T, p, q]).T, 178 | index=labels, 179 | columns=['t', 't_ci95_low', 't_ci95_high', 'p', 'q']) 180 | 181 | return results 182 | -------------------------------------------------------------------------------- /src/seqc/summary/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/css/simple-sidebar.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Start Bootstrap - Simple Sidebar (http://startbootstrap.com/) 3 | * Copyright 2013-2016 Start Bootstrap 4 | * Licensed under MIT (https://github.com/BlackrockDigital/startbootstrap/blob/gh-pages/LICENSE) 5 | */ 6 | 7 | body { 8 | overflow-x: hidden; 9 | } 10 | 11 | /* Toggle Styles */ 12 | 13 | #wrapper { 14 | padding-left: 0; 15 | -webkit-transition: all 0.5s ease; 16 | -moz-transition: all 0.5s ease; 17 | -o-transition: all 0.5s ease; 18 | transition: all 0.5s ease; 19 | } 20 | 21 | #wrapper.toggled { 22 | padding-left: 250px; 23 | } 24 | 25 | #sidebar-wrapper { 26 | z-index: 1000; 27 | position: fixed; 28 | left: 250px; 29 | width: 0; 30 | height: 100%; 31 | margin-left: -250px; 32 | overflow-y: auto; 33 | background: #000; 34 | -webkit-transition: all 0.5s ease; 35 | -moz-transition: all 0.5s ease; 36 | -o-transition: all 0.5s ease; 37 | transition: all 0.5s ease; 38 | } 39 | 40 | #wrapper.toggled #sidebar-wrapper { 41 | width: 250px; 42 | } 43 | 44 | #page-content-wrapper { 45 | width: 100%; 46 | position: absolute; 47 | padding: 15px; 48 | } 49 | 50 | #wrapper.toggled #page-content-wrapper { 51 | position: absolute; 52 | margin-right: -250px; 53 | } 54 | 55 | /* Sidebar Styles */ 56 | 57 | .sidebar-nav { 58 | position: absolute; 59 | top: 0; 60 | width: 250px; 61 | margin: 0; 62 | padding: 0; 63 | list-style: none; 64 | } 65 | 66 | .sidebar-nav li { 67 | text-indent: 20px; 68 | line-height: 40px; 69 | } 70 | 71 | .sidebar-nav li a { 72 | display: block; 73 | text-decoration: none; 74 | color: #999999; 75 | } 76 | 77 | .sidebar-nav li a:hover { 78 | text-decoration: none; 79 | color: #fff; 80 | background: rgba(255,255,255,0.2); 81 | } 82 | 83 | .sidebar-nav li a:active, 84 | .sidebar-nav li a:focus { 85 | text-decoration: none; 86 | } 87 | 88 | .sidebar-nav > .sidebar-brand { 89 | height: 65px; 90 | font-size: 18px; 91 | line-height: 60px; 92 | } 93 | 94 | .sidebar-nav > .sidebar-brand a { 95 | color: #999999; 96 | } 97 | 98 | .sidebar-nav > .sidebar-brand a:hover { 99 | color: #fff; 100 | background: none; 101 | } 102 | 103 | @media(min-width:768px) { 104 | #wrapper { 105 | padding-left: 250px; 106 | } 107 | 108 | #wrapper.toggled { 109 | padding-left: 0; 110 | } 111 | 112 | #sidebar-wrapper { 113 | width: 250px; 114 | } 115 | 116 | #wrapper.toggled #sidebar-wrapper { 117 | width: 0; 118 | } 119 | 120 | #page-content-wrapper { 121 | padding: 20px; 122 | position: relative; 123 | } 124 
| 125 | #wrapper.toggled #page-content-wrapper { 126 | position: relative; 127 | margin-right: 0; 128 | } 129 | } -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/seqc/summary/html_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/html_/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/img/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/img/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/static/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/static/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/templates/mini_summary_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{output_prefix}} Mini Summary 4 | 5 | 6 | 7 |

{{output_prefix}} Mini Summary

8 |

Overall Statistics

9 | 10 | 11 | {% if mini_summary_d['uniqmapped_pct'] == 'N/A' %} 12 | 13 | 14 | 15 | 16 | {% else %} 17 | 18 | 19 | 20 | 21 | {% endif %} 22 | 23 | 24 | 25 | 26 | 27 | 28 | {% if 'mt_rna_fraction' in mini_summary_d %} 29 | 30 | {% endif %} 31 |
# Reads:{{mini_summary_d['n_reads']}}
% of uniquely mapped reads:N/A
% of multi-mapped reads:N/A
% of unmapped reads:N/A
% of filtered reads mapping to genome:N/A
% of uniquely mapped reads:{{'%.2f%%' % mini_summary_d['uniqmapped_pct']}}
% of multi-mapped reads:{{'%.2f%%' % mini_summary_d['multimapped_pct']}}
% of unmapped reads:{{'%.2f%%' % mini_summary_d['unmapped_pct']}}
% of filtered reads mapping to genome:{{'%.2f%%' % mini_summary_d['genomic_read_pct']}}
Sequencing saturation rate:{{'%.2f%%' % mini_summary_d['seq_sat_rate']}}
 
# Cells:{{'%d' % mini_summary_d['n_cells']}}
Median molecules per cell:{{'%d' % mini_summary_d['med_molcs_per_cell']}}
Average reads per cell:{{'%d' % mini_summary_d['avg_reads_per_cell']}}
Average reads per molecule:{{'%.2f' % mini_summary_d['avg_reads_per_molc']}}
% of cells filtered by high mt-RNA content:{{'%.2f%%' % mini_summary_d['mt_rna_fraction']}}
32 | 33 |

Cell Size Distribution

34 |
35 | 36 |
37 |

Filtering

38 | Indian red indicates cells that have been filtered
39 |
40 | 41 | 42 |
43 |

PCA Components

44 |
45 | 46 |
47 |

Phenograph Clustering

48 | Library size has been regressed out of all PCA components. We ran the Phenograph clustering algorithm on the dataset with the revised PCA components and 80 nearest neighbors.

49 |
50 | 51 |

Warnings

52 | 53 | {% for w,m in warning_d.items() %} 54 | 55 | {% endfor %} 56 |
{{w}}:{{m}}
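These are Jinja-style templates; a sketch of rendering the mini summary with the jinja2 package, filling only the variables visible above (the stripped markup may reference additional variables such as embedded figures, and every value below is illustrative):

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('src/seqc/summary/templates'))
    template = env.get_template('mini_summary_base.html')
    html = template.render(
        output_prefix='my_run',
        mini_summary_d={
            'n_reads': 1_000_000, 'uniqmapped_pct': 'N/A', 'seq_sat_rate': 45.2,
            'n_cells': 1200, 'med_molcs_per_cell': 2500,
            'avg_reads_per_cell': 800, 'avg_reads_per_molc': 1.6,
        },
        warning_d={'High mt-RNA content': 'check cell viability'},
    )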
57 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | SEQC report 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | 46 | 47 | 48 | 49 |
50 | {% block content %}{% endblock %} 51 |
52 | 53 |
54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_content.html: -------------------------------------------------------------------------------- 1 | {% extends "section_base.html" %} 2 | {% block content %} 3 |

{{section.name}}

4 | 5 |
6 | {% for name, c in section.content.items() %} 7 | 8 |

{{name}}

9 | 10 | {% if c.keys is defined %} 11 |
12 | {% for k in c.keys %} 13 | {{k}}
14 | {% endfor %} 15 |
16 |
17 | {% for v in c.values %} 18 | {{v}}
19 | {% endfor %} 20 |
21 | {% elif c.text is defined %} 22 |
23 | {{c.text}} 24 |
25 | {% elif c.image is defined %} 26 |
27 | {{c.caption}} 28 |
29 |
30 | {{c.legend}} 31 |
32 | {% endif %} 33 | 34 | {% endfor %} 35 |
36 | {% endblock %} -------------------------------------------------------------------------------- /src/seqc/summary/test.py: -------------------------------------------------------------------------------- 1 | import nose2 2 | import unittest 3 | from seqc.summary import summary 4 | from collections import OrderedDict 5 | 6 | 7 | class TestSummary(unittest.TestCase): 8 | 9 | def test_render_section(self): 10 | s1 = summary.Section.from_alignment_summary( 11 | '/var/folders/y3/ysxvl2w921d881nfpvx5ypvh0000gn/T/seqc/test_no_aws_in_drop_v2' 12 | '/alignment_summary.txt') 13 | s1.render('./src/seqc/summary/test_summary.html') 14 | 15 | if __name__ == "__main__": 16 | nose2.main() 17 | -------------------------------------------------------------------------------- /src/seqc/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/tests/__init__.py -------------------------------------------------------------------------------- /src/seqc/tests/test_args.py: -------------------------------------------------------------------------------- 1 | import nose2 2 | import unittest 3 | 4 | import seqc 5 | from seqc.core import main 6 | 7 | 8 | # class TestSEQC(unittest.TestCase): 9 | # def setUp(self): 10 | # pass 11 | 12 | # def tearDown(self): 13 | # pass 14 | 15 | # def test_args(self): 16 | 17 | # argv = ["start", "-k", "/Users/dchun/dpeerlab-chunj.pem", "-t", "t2.micro"] 18 | 19 | # self.assertRaises(ValueError, lambda: main.main(argv)) 20 | 21 | # class MyUnitTest(unittest.TestCase): 22 | # def setUp(self): 23 | # pass 24 | 25 | # def tearDown(self): 26 | # pass 27 | 28 | # def test_args(self): 29 | 30 | # # argv = [ 31 | # # "run", "ten_x_v2", "--local", 32 | # # "--index", "s3://seqc-public/genomes/hg38_chr19/", 33 | # # "--barcode-files", "s3://seqc-public/barcodes/ten_x_v2/flat/", 34 | # # "--genomic-fastq", "./test-data/genomic/", 35 | # # "--barcode-fastq", "./test-data/barcode/", 36 | # # "--output-prefix", "./test-data/seqc-results/", 37 | # # "--email", "jaeyoung.chun@gmail.com", 38 | # # "--star-args", "\"runRNGseed=0\"" 39 | # # ] 40 | 41 | # argv = [ 42 | # "run" 43 | # ] 44 | 45 | # try: 46 | # main.main(argv) 47 | # # self.assertRaises(BaseException, lambda: main.main(argv)) 48 | # except: 49 | # pass 50 | # # self.assertRaises(ValueError, lambda: main.main(argv)) 51 | 52 | 53 | # class TestSEQC(unittest.TestCase): 54 | # def setUp(self): 55 | # pass 56 | 57 | # def tearDown(self): 58 | # pass 59 | 60 | # def test_args(self): 61 | 62 | # from seqc.sequence import gtf 63 | 64 | # # remove any invalid ids from the annotation file 65 | # gr = gtf.Reader("./test-data/homo_sapiens.gtf.gz") 66 | 67 | # for line_fields in gr: 68 | # record = gtf.Record(line_fields) 69 | # print(record) 70 | # biotype = record.attribute("gene_biotype") 71 | # print(biotype) 72 | 73 | # # self.assertRaises(ValueError, lambda: main.main(argv)) 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /src/seqc/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | TestDataset = namedtuple( 4 | "datasets", 5 | ["barcode_fastq", "genomic_fastq", "merged_fastq", "bam", "index", "barcodes",], 6 | ) 7 | 8 | dataset_s3 = TestDataset( 9 | barcode_fastq="s3://seqc-public/test/%s/barcode/", 
# platform 10 | genomic_fastq="s3://seqc-public/test/%s/genomic/", # platform 11 | merged_fastq="s3://seqc-public/test/%s/%s_merged.fastq.gz", # platform, platform 12 | bam="s3://seqc-public/test/%s/Aligned.out.bam", # platform 13 | index="s3://seqc-public/genomes/hg38_chr19/", 14 | barcodes="s3://seqc-public/barcodes/%s/flat/", # platform 15 | ) 16 | 17 | dataset_local = TestDataset( 18 | barcode_fastq="test-data/datasets/%s/barcode/", # platform 19 | genomic_fastq="test-data/datasets/%s/genomic/", # platform 20 | merged_fastq=None, 21 | bam="test-data/datasets/%s/Aligned.out.bam", # platform 22 | index="test-data/datasets/genomes/hg38_chr19/", 23 | barcodes="test-data/datasets/barcodes/%s/flat/", # platform 24 | ) 25 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_e2e_local.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import uuid 4 | import shutil 5 | import subprocess 6 | import re 7 | from nose2.tools import params 8 | from seqc.core import main 9 | from test_dataset import dataset_local, dataset_s3 10 | 11 | 12 | def get_output_file_list(test_id, test_folder): 13 | 14 | proc = subprocess.Popen( 15 | ["find", test_folder, "-type", "f"], 16 | stdout=subprocess.PIPE, 17 | stderr=subprocess.PIPE, 18 | ) 19 | stdout, _ = proc.communicate() 20 | files = stdout.decode().splitlines() 21 | 22 | # extract only filenames (i.e. remove directory hierarchy) 23 | # convert to a set for easy comparison 24 | files = set(map(lambda filename: filename.replace(test_folder + "/", ""), files)) 25 | 26 | return files 27 | 28 | 29 | def expected_output_files(file_prefix): 30 | 31 | files = set( 32 | [ 33 | f"{file_prefix}.h5", 34 | f"{file_prefix}_alignment_summary.txt", 35 | f"{file_prefix}_cell_filters.png", 36 | f"{file_prefix}_de_gene_list.txt", 37 | f"{file_prefix}_dense.csv", 38 | f"{file_prefix}_merged.fastq.gz", 39 | f"{file_prefix}_mini_summary.json", 40 | f"{file_prefix}_mini_summary.pdf", 41 | f"{file_prefix}_seqc_log.txt", 42 | f"{file_prefix}_sparse_counts_barcodes.csv", 43 | f"{file_prefix}_sparse_counts_genes.csv", 44 | f"{file_prefix}_sparse_molecule_counts.mtx", 45 | f"{file_prefix}_sparse_read_counts.mtx", 46 | f"{file_prefix}_summary.tar.gz", 47 | f"{file_prefix}_Aligned.out.bam", 48 | ] 49 | ) 50 | 51 | return files 52 | 53 | 54 | class TestRunLocal(unittest.TestCase): 55 | @classmethod 56 | def setUp(cls): 57 | cls.test_id = str(uuid.uuid4()) 58 | cls.path_temp = os.path.join( 59 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 60 | ) 61 | os.makedirs(cls.path_temp, exist_ok=True) 62 | with open("seqc_log.txt", "wt") as f: 63 | f.write("Dummy log.\n") 64 | f.write("nose2 captures input, so no log is produced.\n") 65 | f.write("This causes pipeline errors.\n") 66 | 67 | @classmethod 68 | def tearDown(self): 69 | if os.path.isdir(self.path_temp): 70 | shutil.rmtree(self.path_temp, ignore_errors=True) 71 | 72 | def test_using_dataset_in_s3(self, platform="ten_x_v2"): 73 | # must NOT end with a slash 74 | file_prefix = "test" 75 | output_prefix = os.path.join(self.path_temp, file_prefix) 76 | 77 | params = [ 78 | ("run", platform), 79 | ("--local",), 80 | ("--output-prefix", output_prefix), 81 | ("--index", dataset_s3.index), 82 | ("--barcode-files", dataset_s3.barcodes % platform), 83 | ("--barcode-fastq", dataset_s3.barcode_fastq % platform), 84 | ("--genomic-fastq", dataset_s3.genomic_fastq % platform), 85 | ("--star-args", "runRNGseed=0"), 86 | ] 
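        # Illustrative note, not part of the original test: each params entry is a
        # (flag, value) tuple, and the "%s" templates from test_dataset.py are filled
        # with the platform name first, e.g.
        #     dataset_s3.barcodes % "ten_x_v2"  ->  "s3://seqc-public/barcodes/ten_x_v2/flat/"
        # The list of tuples is then flattened into a single CLI-style argv below, e.g.
        #     [("run", "ten_x_v2"), ("--local",)]  ->  ["run", "ten_x_v2", "--local"]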
87 | 88 | argv = [element for tupl in params for element in tupl] 89 | 90 | if platform != "drop_seq": 91 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 92 | 93 | main.main(argv) 94 | 95 | # get output file list 96 | files = get_output_file_list(self.test_id, self.path_temp) 97 | 98 | # check if each expected file is found in the list of files generated 99 | for file in expected_output_files(file_prefix): 100 | self.assertIn(file, files) 101 | 102 | def test_using_local_dataset(self, platform="ten_x_v2"): 103 | # must NOT end with a slash 104 | file_prefix = "test" 105 | output_prefix = os.path.join(self.path_temp, file_prefix) 106 | 107 | params = [ 108 | ("run", platform), 109 | ("--local",), 110 | ("--output-prefix", output_prefix), 111 | ("--index", dataset_local.index), 112 | ("--barcode-files", dataset_local.barcodes % platform), 113 | ("--barcode-fastq", dataset_local.barcode_fastq % platform), 114 | ("--genomic-fastq", dataset_local.genomic_fastq % platform), 115 | ("--star-args", "runRNGseed=0"), 116 | ] 117 | 118 | argv = [element for tupl in params for element in tupl] 119 | 120 | if platform != "drop_seq": 121 | argv += ["--barcode-files", dataset_local.barcodes % platform] 122 | 123 | main.main(argv) 124 | 125 | # get output file list 126 | files = get_output_file_list(self.test_id, self.path_temp) 127 | 128 | # check if each expected file is found in the list of files generated 129 | for file in expected_output_files(file_prefix): 130 | self.assertIn(file, files) 131 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_e2e_remote.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import uuid 4 | import shutil 5 | import re 6 | from seqc.core import main 7 | from seqc import io 8 | import boto3 9 | from nose2.tools import params 10 | from test_dataset import dataset_s3 11 | 12 | 13 | def get_instance_by_test_id(test_id): 14 | 15 | ec2 = boto3.resource("ec2") 16 | instances = ec2.instances.filter( 17 | Filters=[{"Name": "tag:TestID", "Values": [test_id]}] 18 | ) 19 | instances = list(instances) 20 | 21 | if len(instances) != 1: 22 | raise Exception("Test ID is not found or not unique!") 23 | 24 | return instances[0] 25 | 26 | 27 | def expected_output_files(output_prefix): 28 | 29 | files = set( 30 | [ 31 | f"{output_prefix}.h5", 32 | f"{output_prefix}_Aligned.out.bam", 33 | f"{output_prefix}_alignment_summary.txt", 34 | f"{output_prefix}_cell_filters.png", 35 | f"{output_prefix}_de_gene_list.txt", 36 | f"{output_prefix}_dense.csv", 37 | f"{output_prefix}_merged.fastq.gz", 38 | f"{output_prefix}_mini_summary.json", 39 | f"{output_prefix}_mini_summary.pdf", 40 | f"{output_prefix}_seqc_log.txt", 41 | f"{output_prefix}_sparse_counts_barcodes.csv", 42 | f"{output_prefix}_sparse_counts_genes.csv", 43 | f"{output_prefix}_sparse_molecule_counts.mtx", 44 | f"{output_prefix}_sparse_read_counts.mtx", 45 | f"{output_prefix}_summary.tar.gz", 46 | f"seqc_log.txt", 47 | ] 48 | ) 49 | 50 | return files 51 | 52 | 53 | def expected_output_files_run_from_merged(output_prefix): 54 | 55 | files = expected_output_files(output_prefix) 56 | 57 | excludes = set([f"{output_prefix}_merged.fastq.gz"]) 58 | 59 | return files - excludes 60 | 61 | 62 | def expected_output_files_run_from_bam(output_prefix): 63 | 64 | files = expected_output_files(output_prefix) 65 | 66 | excludes = set( 67 | [ 68 | f"{output_prefix}_Aligned.out.bam", 69 | 
f"{output_prefix}_alignment_summary.txt", 70 | f"{output_prefix}_merged.fastq.gz", 71 | ] 72 | ) 73 | 74 | return files - excludes 75 | 76 | 77 | def get_output_file_list(test_id, s3_bucket, test_folder): 78 | 79 | # get instance and wait until terminated 80 | instance = get_instance_by_test_id(test_id) 81 | instance.wait_until_terminated() 82 | 83 | # check files generated in S3 84 | files = io.S3.listdir(s3_bucket, test_folder) 85 | 86 | # extract only filenames (i.e. remove directory hierarchy) 87 | # convert to a set for easy comparison 88 | files = set(map(lambda filename: filename.replace(test_folder, ""), files)) 89 | 90 | return files 91 | 92 | 93 | def check_for_success_msg(s3_seqc_log_uri, path_temp): 94 | 95 | # download seqc_log.txt 96 | io.S3.download( 97 | link=s3_seqc_log_uri, prefix=path_temp, overwrite=True, recursive=False 98 | ) 99 | 100 | # check if seqc_log.txt has a successful message 101 | with open(os.path.join(path_temp, "seqc_log.txt"), "rt") as fin: 102 | logs = fin.read() 103 | match = re.search(r"Execution completed successfully", logs, re.MULTILINE) 104 | 105 | return True if match else False 106 | 107 | 108 | class TestRunRemote(unittest.TestCase): 109 | 110 | email = os.environ["SEQC_TEST_EMAIL"] 111 | rsa_key = os.environ["SEQC_TEST_RSA_KEY"] 112 | ami_id = os.environ["SEQC_TEST_AMI_ID"] 113 | 114 | s3_bucket = "dp-lab-cicd" 115 | 116 | @classmethod 117 | def setUp(cls): 118 | cls.test_id = str(uuid.uuid4()) 119 | cls.path_temp = os.path.join( 120 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 121 | ) 122 | os.makedirs(cls.path_temp, exist_ok=True) 123 | 124 | @classmethod 125 | def tearDown(self): 126 | if os.path.isdir(self.path_temp): 127 | shutil.rmtree(self.path_temp, ignore_errors=True) 128 | 129 | @params("in_drop_v2", "ten_x_v2") 130 | def test_remote_from_raw_fastq(self, platform="ten_x_v2"): 131 | output_prefix = "from-raw-fastq" 132 | # must end with a slash 133 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 134 | 135 | params = [ 136 | ("run", platform), 137 | ("--output-prefix", "from-raw-fastq"), 138 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 139 | ("--index", dataset_s3.index), 140 | ("--email", self.email), 141 | ("--barcode-fastq", dataset_s3.barcode_fastq % platform), 142 | ("--genomic-fastq", dataset_s3.genomic_fastq % platform), 143 | ("--instance-type", "r5.2xlarge"), 144 | ("--spot-bid", "1.0"), 145 | ("--rsa-key", self.rsa_key), 146 | ("--debug",), 147 | ("--remote-update",), 148 | ("--ami-id", self.ami_id), 149 | ("--user-tags", f"TestID:{self.test_id}"), 150 | ] 151 | 152 | argv = [element for tupl in params for element in tupl] 153 | 154 | if platform != "drop_seq": 155 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 156 | 157 | main.main(argv) 158 | 159 | # wait until terminated 160 | # get output file list 161 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 162 | 163 | # check for the exact same filenames 164 | self.assertSetEqual(files, expected_output_files(output_prefix)) 165 | 166 | # check for success message in seqc_log.txt 167 | has_success_msg = check_for_success_msg( 168 | s3_seqc_log_uri="s3://{}/{}".format( 169 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 170 | ), 171 | path_temp=self.path_temp, 172 | ) 173 | 174 | self.assertTrue( 175 | has_success_msg, msg="Unable to find the success message in the log" 176 | ) 177 | 178 | def test_remote_from_merged(self, platform="in_drop_v2"): 179 | output_prefix = "from-merged" 180 | # must 
end with a slash 181 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 182 | 183 | params = [ 184 | ("run", platform), 185 | ("--output-prefix", output_prefix), 186 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 187 | ("--index", dataset_s3.index), 188 | ("--email", self.email), 189 | ("--merged-fastq", dataset_s3.merged_fastq % (platform, platform)), 190 | ("--rsa-key", self.rsa_key), 191 | ("--instance-type", "r5.2xlarge"), 192 | ("--ami-id", self.ami_id), 193 | ("--remote-update",), 194 | ("--user-tags", f"TestID:{self.test_id}") 195 | # ('--spot-bid', '1.0') 196 | ] 197 | 198 | argv = [element for tupl in params for element in tupl] 199 | 200 | if platform != "drop_seq": 201 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 202 | 203 | main.main(argv) 204 | 205 | # wait until terminated 206 | # get output file list 207 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 208 | 209 | # check for the exact same filenames 210 | self.assertSetEqual(files, expected_output_files_run_from_merged(output_prefix)) 211 | 212 | # check for success message in seqc_log.txt 213 | has_success_msg = check_for_success_msg( 214 | s3_seqc_log_uri="s3://{}/{}".format( 215 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 216 | ), 217 | path_temp=self.path_temp, 218 | ) 219 | 220 | self.assertTrue( 221 | has_success_msg, msg="Unable to find the success message in the log" 222 | ) 223 | 224 | def test_remote_from_bamfile(self, platform="in_drop_v2"): 225 | output_prefix = "from-bamfile" 226 | # must end with a slash 227 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 228 | 229 | params = [ 230 | ("run", platform), 231 | ("--output-prefix", output_prefix), 232 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 233 | ("--index", dataset_s3.index), 234 | ("--email", self.email), 235 | ("--alignment-file", dataset_s3.bam % platform), 236 | ("--rsa-key", self.rsa_key), 237 | ("--instance-type", "r5.2xlarge"), 238 | ("--debug",), 239 | ("--ami-id", self.ami_id), 240 | ("--remote-update",), 241 | ("--user-tags", f"TestID:{self.test_id}") 242 | # ('--spot-bid', '1.0') 243 | ] 244 | 245 | argv = [element for tupl in params for element in tupl] 246 | 247 | if platform != "drop_seq": 248 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 249 | 250 | main.main(argv) 251 | 252 | # wait until terminated 253 | # get output file list 254 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 255 | 256 | # check for the exact same filenames 257 | self.assertSetEqual(files, expected_output_files_run_from_bam(output_prefix)) 258 | 259 | # check for success message in seqc_log.txt 260 | has_success_msg = check_for_success_msg( 261 | s3_seqc_log_uri="s3://{}/{}".format( 262 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 263 | ), 264 | path_temp=self.path_temp, 265 | ) 266 | 267 | self.assertTrue( 268 | has_success_msg, msg="Unable to find the success message in the log" 269 | ) 270 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_gtf.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import os 3 | import uuid 4 | import shutil 5 | import nose2 6 | from seqc.sequence import gtf 7 | from test_dataset import dataset_local 8 | 9 | 10 | class TestGtf(TestCase): 11 | @classmethod 12 | def setUp(cls): 13 | cls.test_id = str(uuid.uuid4()) 14 | cls.path_temp = os.path.join( 15 | 
os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 16 | ) 17 | cls.annotation = os.path.join(dataset_local.index, "annotations.gtf") 18 | 19 | @classmethod 20 | def tearDown(self): 21 | if os.path.isdir(self.path_temp): 22 | shutil.rmtree(self.path_temp, ignore_errors=True) 23 | 24 | def test_construct_translator(self): 25 | translator = gtf.GeneIntervals(self.annotation) 26 | self.assertIsNotNone(translator) 27 | 28 | def test_num_of_transcripts(self): 29 | rd = gtf.Reader(self.annotation) 30 | num_transcripts = sum(1 for _ in rd.iter_transcripts()) 31 | # awk -F'\t' '$3=="transcript" { print $0 }' annotations.gtf | wc -l 32 | self.assertEqual(num_transcripts, 12747) 33 | 34 | def test_iter_transcripts(self): 35 | rd = gtf.Reader(self.annotation) 36 | (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next( 37 | rd.iter_transcripts() 38 | ) 39 | 40 | # this should give us 3 exons of the first transcript of the first gene found in inverse order: 41 | # 42 | # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; 43 | # chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 44 | # chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 45 | # chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 46 | # chr19 HAVANA exon 60951 61894 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 47 | 48 | self.assertEqual(transcript_chromosome, "chr19") 49 | self.assertEqual(transcript_strand, "-") 50 | self.assertEqual(transcript_gene_id, 282458) 51 | self.assertEqual(len(exons), 3) 52 | 53 | # 8th column has exon ID 54 | self.assertIn("ENSE00003783010.1", exons[0][8]) # exon number 3 55 | self.assertIn("ENSE00003783498.1", exons[1][8]) # exon number 2 56 | self.assertIn("ENSE00003781173.1", exons[2][8]) # exon number 1 57 | 58 | def test_translate(self): 59 | translator = gtf.GeneIntervals(self.annotation) 60 | # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; 61 | gene_id = translator.translate("chr19", "-", 60951) 62 | self.assertEqual(gene_id, 282458) 63 | 64 | 65 | if __name__ == "__main__": 66 | nose2.main() 67 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_readarray.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import os 3 | import uuid 4 | import shutil 5 | import nose2 6 | from test_dataset import dataset_local 7 | from seqc.sequence.encodings import DNA3Bit 8 | from seqc.read_array import ReadArray 9 | from seqc.sequence import gtf 10 | 11 | 12 | class TestReadArray(TestCase): 13 | @classmethod 14 | def setUp(cls): 15 | cls.test_id = str(uuid.uuid4()) 16 | cls.path_temp = os.path.join( 17 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 18 | ) 19 | cls.annotation = os.path.join(dataset_local.index, "annotations.gtf") 20 | cls.translator = gtf.GeneIntervals(cls.annotation, 10000) 21 | 22 | @classmethod 23 | def tearDown(self): 24 | if os.path.isdir(self.path_temp): 25 | shutil.rmtree(self.path_temp, ignore_errors=True) 26 | 27 | def test_read_array_creation(self, platform="ten_x_v2"): 28 | ra, _ = ReadArray.from_alignment_file( 29 | dataset_local.bam % platform, self.translator, required_poly_t=0 30 | ) 31 | self.assertIsNotNone(ra) 32 | 33 | def test_read_array_rmt_decode_10x_v2(self): 34 | platform = "ten_x_v2" 35 | 36 | # create a readarray 37 | ra, _ = ReadArray.from_alignment_file( 38 | dataset_local.bam % platform, self.translator, required_poly_t=0 39 | ) 40 | 41 | # see if we can decode numeric UMI back to nucleotide sequence 42 | dna3bit = DNA3Bit() 43 | for rmt in ra.data["rmt"]: 44 | decoded = dna3bit.decode(rmt).decode() 45 | # ten_x_v2 UMI length = 10 nt 46 | self.assertEqual(len(decoded), 10) 47 | 48 | def test_read_array_rmt_decode_10x_v3(self): 49 | platform = "ten_x_v3" 50 | 51 | # create a readarray 52 | ra, _ = ReadArray.from_alignment_file( 53 | dataset_local.bam % platform, self.translator, required_poly_t=0 54 | ) 55 | 56 | # see if we can decode numeric UMI back to nucleotide sequence 57 | dna3bit = DNA3Bit() 58 | for rmt in ra.data["rmt"]: 59 | decoded = dna3bit.decode(rmt).decode() 60 | # ten_x_v3 UMI length = 12 nt 61 | self.assertEqual(len(decoded), 12) 62 | 63 | 64 | if __name__ == "__main__": 65 | nose2.main() 66 | 
-------------------------------------------------------------------------------- /src/seqc/tests/test_run_rmt_correction.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import nose2 3 | import os 4 | import numpy as np 5 | from seqc.read_array import ReadArray 6 | from seqc import rmt_correction 7 | 8 | 9 | class TestRmtCorrection(TestCase): 10 | @classmethod 11 | def setUp(self): 12 | # pre-allocate arrays 13 | n_barcodes = 183416337 14 | data = np.recarray((n_barcodes,), ReadArray._dtype) 15 | genes = np.zeros(n_barcodes, dtype=np.int32) 16 | positions = np.zeros(n_barcodes, dtype=np.int32) 17 | self.ra = ReadArray(data, genes, positions) 18 | 19 | @classmethod 20 | def tearDown(self): 21 | pass 22 | 23 | def test_should_return_correct_ra_size(self): 24 | 25 | ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes 26 | 27 | self.assertEqual(4768824762, ra_size) 28 | 29 | # mock ~50 GiB of available memory 30 | @mock.patch( 31 | "seqc.rmt_correction._get_available_memory", return_value=50 * 1024 ** 3 32 | ) 33 | def test_should_return_correct_max_workers(self, mock_mem): 34 | 35 | n_workers = rmt_correction._calc_max_workers(self.ra) 36 | 37 | self.assertEqual(n_workers, 5) 38 | 39 | # mock ~1 TB of available memory 40 | @mock.patch("seqc.rmt_correction._get_available_memory", return_value=1079354630144) 41 | def test_should_return_correct_max_workers2(self, mock_mem): 42 | 43 | n_workers = rmt_correction._calc_max_workers(self.ra) 44 | 45 | self.assertEqual(n_workers, 119) 46 | 47 | # less available memory than the ReadArray itself 48 | @mock.patch("seqc.rmt_correction._get_available_memory") 49 | def test_should_return_one_if_ra_larger_than_mem(self, mock_mem): 50 | 51 | ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes 52 | 53 | # assume the available memory is half the ReadArray size 54 | mock_mem.return_value = int(ra_size) / 2 55 | 56 | n_workers = rmt_correction._calc_max_workers(self.ra) 57 | 58 | self.assertEqual(n_workers, 1) 59 | 60 | 61 | class TestRmtCorrection2(TestCase): 62 | @classmethod 63 | def setUp(self): 64 | # pre-allocate arrays 65 | n_barcodes = 183416337 66 | data = np.recarray((n_barcodes,), ReadArray._dtype) 67 | genes = np.zeros(n_barcodes, dtype=np.int32) 68 | positions = np.zeros(n_barcodes, dtype=np.int32) 69 | self.ra = ReadArray(data, genes, positions) 70 | 71 | import pickle 72 | 73 | with open("pre-correction-ra.pickle", "wb") as fout: 74 | pickle.dump(self.ra, fout) 75 | 76 | @classmethod 77 | def tearDown(self): 78 | import os 79 | 80 | try: 81 | os.remove("pre-correction-ra.pickle") 82 | except FileNotFoundError: 83 | pass 84 | 85 | @mock.patch("seqc.rmt_correction._correct_errors_by_cell_group", return_value=0) 86 | def test_correct_errors_by_chunks(self, mock_correct): 87 | cell_group = [1, 2, 3] 88 | x = rmt_correction._correct_errors_by_cell_group_chunks( 89 | self.ra, cell_group, 0.02, 0.05 90 | ) 91 | mock_correct.assert_called() 92 | self.assertEqual(len(cell_group), mock_correct.call_count) 93 | self.assertEqual([0, 0, 0], x) 94 | 95 | 96 | if __name__ == "__main__": 97 | nose2.main() 98 | -------------------------------------------------------------------------------- /src/seqc/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.11" 2 | --------------------------------------------------------------------------------
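A quick sanity check on the 4768824762-byte figure asserted in TestRmtCorrection.test_should_return_correct_ra_size: the two int32 side arrays (genes, positions) contribute 8 bytes per record, so the assertion implies an 18-byte structured record for ReadArray._dtype (an inference from the numbers in the test, not a documented value). A minimal sketch under that assumption:

    # sketch only: reproduce the expected ReadArray memory footprint from the test
    n_barcodes = 183416337
    record_bytes = 18            # implied by the assertion: 26 bytes total minus 2 * 4
    int32_bytes = 4              # genes and positions arrays
    total = n_barcodes * (record_bytes + 2 * int32_bytes)
    assert total == 4768824762   # roughly 4.4 GiB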