├── .circleci └── config.yml ├── .github ├── ISSUE_TEMPLATE │ └── need-some-support.md └── workflows │ └── python-app.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── README.md ├── install-SUSE.md ├── install-dev.md └── run-test.md ├── nose2.cfg ├── repackage.py ├── requirements.txt ├── setup.py └── src ├── scripts └── SEQC └── seqc ├── __init__.py ├── alignment ├── __init__.py ├── sam.py └── star.py ├── barcode_correction.py ├── core ├── __init__.py ├── download.py ├── index.py ├── instances.py ├── main.py ├── notebook.py ├── parser.py ├── progress.py ├── run.py ├── start.py ├── terminate.py └── verify.py ├── distance.py ├── ec2.py ├── email_.py ├── exceptions.py ├── filter.py ├── gene_info.py ├── h5.py ├── io.py ├── log.py ├── multialignment.py ├── notebooks ├── __init__.py ├── analysis_template.json ├── notebooks.py └── test_notebooks.py ├── platforms.py ├── plot.py ├── read_array.py ├── reader.py ├── rmt_correction.py ├── run_mast.R ├── sequence ├── __init__.py ├── barcodes.py ├── encodings.py ├── fastq.py ├── gtf.py └── index.py ├── sparse_frame.py ├── stats ├── __init__.py ├── anova.py ├── correlation.py ├── experimental_yield.py ├── g_test.py ├── graph_diffusion.py ├── gsea.py ├── mast.py ├── pca.py ├── resampled_nonparametric.py ├── smoothing.py ├── tree.py ├── tsne.py └── ttest.py ├── summary ├── __init__.py ├── css │ ├── bootstrap.css │ ├── bootstrap.min.css │ └── simple-sidebar.css ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 ├── html_ │ └── __init__.py ├── img │ └── __init__.py ├── js │ ├── bootstrap.js │ ├── bootstrap.min.js │ └── jquery.js ├── static │ └── __init__.py ├── summary.py ├── templates │ ├── mini_summary_base.html │ ├── section_base.html │ └── section_content.html └── test.py ├── tests ├── __init__.py ├── test_args.py ├── test_dataset.py ├── test_index.py ├── test_run_e2e_local.py ├── test_run_e2e_remote.py ├── test_run_gtf.py ├── test_run_readarray.py └── test_run_rmt_correction.py └── version.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | - python/load-cache 12 | - run: 13 | name: Install cython/numpy/bhtsne 14 | command: | 15 | pip install Cython 16 | pip install numpy 17 | pip install bhtsne 18 | - python/install-deps 19 | - python/save-cache 20 | - run: 21 | name: Install seqc 22 | command: pip install . 23 | - run: 24 | name: Test 25 | command: | 26 | export TMPDIR="/tmp" 27 | python -m nose2 -s src/seqc/tests test_run_rmt_correction 28 | 29 | 30 | workflows: 31 | main: 32 | jobs: 33 | - build-and-test 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/need-some-support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Need some support 3 | about: Ask for help or create a bug report 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. ... 16 | 2. ... 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 
20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Assay** 25 | - 10x v2/v3, Drop-seq, In-drop, ... 26 | 27 | **Runtime Environment** 28 | - SEQC Version [e.g. v0.2.2] 29 | - HPC/AWS/GCP/Desktop 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install flake8 pytest 23 | pip install Cython 24 | pip install numpy 25 | pip install bhtsne 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | - name: Lint with flake8 28 | run: | 29 | # stop the build if there are Python syntax errors or undefined names 30 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 31 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 32 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 33 | - name: Install SEQC 34 | run: pip install . 35 | - name: Test with nose2 36 | run: | 37 | export TMPDIR="/tmp" 38 | nose2 -s src/seqc/tests test_run_rmt_correction 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg* 2 | *.idea* 3 | *__pycache__* 4 | .idea* 5 | testfiles* 6 | *.DS_Store* 7 | *seqc.log 8 | build/* 9 | dist/* 10 | .project 11 | .pydevproject 12 | .c9/ 13 | test-data/ 14 | dask-worker-space/ 15 | 16 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/seqc/summary/*/*.css 2 | include src/seqc/summary/fonts/* 3 | include src/seqc/summary/*/*.py 4 | include src/seqc/summary/*/*.js 5 | include src/seqc/summary/*/*.html 6 | include src/seqc/notebooks/*.json 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SEquence Quality Control (SEQC -- /sek-si:/) 2 | 3 | ## Overview: 4 | 5 | SEQC is a Python package that processes single-cell sequencing data in the cloud and analyzes it interactively on your local machine. 6 | 7 | To facilitate easy installation and use, we have made available Amazon Machine Images (AMIs) that come with all of SEQC's dependencies pre-installed. In addition, we have uploaded common genome indices (`-i/--index` parameter) and barcode data (`--barcode-files`) to public Amazon S3 repositories. These links can be provided to SEQC and it will automatically fetch them prior to initiating an analysis run. Finally, it can fetch input data directly from BaseSpace or Amazon S3 for analysis.
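For example, input can be pulled directly from BaseSpace by passing a sample identifier together with an OAuth token via the `--basespace` and `--basespace-token` options (a sketch only: the sample id, token, and results bucket below are placeholders, and the remaining options mirror the worked examples later in this README):

```bash
# sketch: <basespace-sample-id>, <oauth-token>, and s3://your-bucket/ are placeholders
SEQC run ten_x_v2 \
    --basespace <basespace-sample-id> \
    --basespace-token <oauth-token> \
    --index s3://seqc-public/genomes/hg38_long_polya/ \
    --barcode-files s3://seqc-public/barcodes/ten_x_v2/flat/ \
    --upload-prefix s3://your-bucket/seqc-results/ \
    --output-prefix PBMC
```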
8 | 9 | For users with access to in-house compute clusters, SEQC can be installed on your systems and run using the `--local` parameter. 10 | 11 | ## Dependencies: 12 | 13 | ### Python 3 14 | 15 | Python3 must be installed on your local machine to run SEQC. We recommend installing Python3 through Miniconda (https://docs.conda.io/en/latest/miniconda.html). 16 | 17 | ### Python 3 Libraries 18 | 19 | We recommend creating a virtual environment before installing anything: 20 | 21 | ```bash 22 | conda create -n seqc python=3.7.7 pip 23 | conda activate seqc 24 | ``` 25 | 26 | ```bash 27 | pip install Cython 28 | pip install numpy 29 | pip install bhtsne 30 | ``` 31 | 32 | ### STAR, Samtools, and HDF5 33 | 34 | To process data locally using SEQC, you must install the STAR Aligner, Samtools, and hdf5. If you only intend to use SEQC to trigger remote processing on AWS, these dependencies are optional. We recommend installing samtools and hdf5 through your package manager, if possible. 35 | 36 | ## SEQC Installation 37 | 38 | Once all dependencies have been installed, SEQC can be installed by running: 39 | 40 | ```bash 41 | export SEQC_VERSION="0.2.11" 42 | wget https://github.com/hisplan/seqc/archive/v${SEQC_VERSION}.tar.gz 43 | tar xvzf v${SEQC_VERSION}.tar.gz 44 | cd seqc-${SEQC_VERSION} 45 | pip install . 46 | ``` 47 | 48 | ## Hardware Requirements: 49 | 50 | For processing a single lane (~200M reads) against human- and mouse-scale genomes, SEQC requires 30GB RAM, approximately 200GB free hard drive space, and scales linearly with additional compute cores. If running on AWS (see below), jobs are automatically scaled up or down according to the size of the input. There are no hardware requirements for the computer used to launch remote instances. 51 | 52 | ## Running SEQC on Local Machine: 53 | 54 | Download an example dataset (1k PBMCs from a healthy donor; freely available at 10x Genomics https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_1k_v3): 55 | 56 | ```bash 57 | wget https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_fastqs.tar 58 | tar xvf pbmc_1k_v3_fastqs.tar 59 | ``` 60 | 61 | Move R1 FASTQ files to the `barcode` folder and R2 FASTQ files to the `genomic` folder: 62 | 63 | ```bash 64 | mkdir barcode 65 | mkdir genomic 66 | mv ./pbmc_1k_v3_fastqs/*R1*.fastq.gz barcode/ 67 | mv ./pbmc_1k_v3_fastqs/*R2*.fastq.gz genomic/ 68 | ``` 69 | 70 | Download the 10x barcode whitelist file: 71 | 72 | ```bash 73 | mkdir whitelist 74 | wget https://seqc-public.s3.amazonaws.com/barcodes/ten_x_v3/flat/3M-february-2018.txt 75 | mv 3M-february-2018.txt ./whitelist/ 76 | ``` 77 | 78 | The resulting directory structure should look something like this: 79 | 80 | ``` 81 | . 
82 | ├── barcode 83 | │   ├── pbmc_1k_v3_S1_L001_R1_001.fastq.gz 84 | │   └── pbmc_1k_v3_S1_L002_R1_001.fastq.gz 85 | ├── genomic 86 | │   ├── pbmc_1k_v3_S1_L001_R2_001.fastq.gz 87 | │   └── pbmc_1k_v3_S1_L002_R2_001.fastq.gz 88 | ├── pbmc_1k_v3_fastqs 89 | │   ├── pbmc_1k_v3_S1_L001_I1_001.fastq.gz 90 | │   └── pbmc_1k_v3_S1_L002_I1_001.fastq.gz 91 | ├── pbmc_1k_v3_fastqs.tar 92 | └── whitelist 93 | └── 3M-february-2018.txt 94 | ``` 95 | 96 | Create a reference package (STAR index + gene annotation): 97 | 98 | ```bash 99 | SEQC index \ 100 | --organism homo_sapiens \ 101 | --ensemble-release 93 \ 102 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 103 | --read-length 101 \ 104 | --folder index \ 105 | --local 106 | ``` 107 | 108 | Run SEQC: 109 | 110 | ```bash 111 | export AWS_DEFAULT_REGION=us-east-1 112 | export SEQC_MAX_WORKERS=7 113 | 114 | SEQC run ten_x_v3 \ 115 | --index ./index/ \ 116 | --barcode-files ./whitelist/ \ 117 | --barcode-fastq ./barcode/ \ 118 | --genomic-fastq ./genomic/ \ 119 | --output-prefix PBMC \ 120 | --no-filter-low-coverage \ 121 | --min-poly-t 0 \ 122 | --star-args runRNGseed=0 \ 123 | --local 124 | ``` 125 | 126 | ## Running SEQC on Amazon Web Services: 127 | 128 | SEQC can be run on any unix-based operating system, however it also features the ability to automatically spawn Amazon Web Services instances to process your data. 129 | 130 | 1. Set up an AWS account 131 | 2. Install and configure AWS CLI 132 | 3. Create and upload an rsa-key for AWS 133 | 134 | Run SEQC: 135 | 136 | ```bash 137 | SEQC run ten_x_v2 \ 138 | --ami-id ami-08652ee2477761403 \ 139 | --user-tags Job:Test,Project:PBMC-Test,Sample:pbmc_1k_v3 \ 140 | --index s3://seqc-public/genomes/hg38_long_polya/ \ 141 | --barcode-files s3://seqc-public/barcodes/ten_x_v2/flat/ \ 142 | --genomic-fastq s3://.../genomic/ \ 143 | --barcode-fastq s3://.../barcode/ \ 144 | --upload-prefix s3://.../seqc-results/ \ 145 | --output-prefix PBMC \ 146 | --no-filter-low-coverage \ 147 | --min-poly-t 0 \ 148 | --star-args runRNGseed=0 149 | ``` 150 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # docs 2 | 3 | ## Developers 4 | 5 | - [Environment setup for development](./install-dev.md) 6 | - [Running test](./run-test.md) 7 | 8 | 9 | ## Generating Reference Packages 10 | 11 | This generates a reference package (STAR index and GTF) using SEQC. 
12 | 13 | - Ensembl 85 14 | - Gene annotation file that contains only the reference chromosomes (no scaffolds, no patches) 15 | - Only these biotypes: 'protein_coding', 'lincRNA', 'IG_V_gene', 'IG_C_gene', 'IG_J_gene', 'TR_C_gene', 'TR_J_gene', 'TR_V_gene', 'TR_D_gene', 'IG_D_gene' 16 | - Not passing anything to `--additional-id-types` 17 | - Setting the read length to 101 (internally, this becomes 100) 18 | 19 | ### Local 20 | 21 | ```bash 22 | SEQC index \ 23 | -o homo_sapiens \ 24 | -f homo_sapiens \ 25 | --ensemble-release 85 \ 26 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 27 | --read-length 101 \ 28 | --folder ./test-data/index/ \ 29 | --local 30 | ``` 31 | 32 | ### AWS 33 | 34 | ```bash 35 | SEQC index \ 36 | -o homo_sapiens \ 37 | -f homo_sapiens \ 38 | --ensemble-release 85 \ 39 | --valid-biotypes protein_coding lincRNA antisense IG_V_gene IG_D_gene IG_J_gene IG_C_gene TR_V_gene TR_D_gene TR_J_gene TR_C_gene \ 40 | --read-length 101 \ 41 | --upload-prefix s3://dp-lab-test/seqc/index/86/ \ 42 | --rsa-key ~/dpeerlab-chunj.pem \ 43 | --ami-id ami-037cc8c1417e197c1 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/install-SUSE.md: -------------------------------------------------------------------------------- 1 | # Installation for SUSE 2 | 3 | This was tested with AWS SUSE Linux Enterprise Server 15 SP1 (HVM). 4 | 5 | ## Install gcc & c++ 6 | 7 | ```bash 8 | sudo zypper in gcc-c++ 9 | ``` 10 | 11 | ## Install Miniconda 12 | 13 | ```bash 14 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 15 | bash Miniconda3-latest-Linux-x86_64.sh 16 | ``` 17 | 18 | For more information: 19 | - https://docs.conda.io/en/latest/miniconda.html 20 | - https://conda.io/projects/conda/en/latest/user-guide/install/linux.html#install-linux-silent 21 | 22 | Log out log back in. 23 | 24 | ## Create a Virtual Environment 25 | 26 | ```bash 27 | conda create -n seqc python=3.7.7 pip 28 | conda activate seqc 29 | ``` 30 | 31 | ## Install dependencies 32 | 33 | ``` 34 | pip install Cython 35 | pip install numpy 36 | pip install bhtsne 37 | 38 | conda install -c anaconda hdf5 39 | conda install -c bioconda samtools 40 | conda install -c bioconda star 41 | ``` 42 | 43 | ## Install SEQC 44 | 45 | ``` 46 | wget https://github.com/dpeerlab/seqc/archive/v0.2.11.tar.gz 47 | tar xvzf v0.2.11.tar.gz 48 | cd seqc-0.2.11/ 49 | pip install . 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/install-dev.md: -------------------------------------------------------------------------------- 1 | # Setup for Development 2 | 3 | Last verified: Jun 4, 2020 4 | 5 | ## Create Conda Environment 6 | 7 | ```bash 8 | conda create -n seqc-dev python=3.7.7 pip 9 | conda activate seqc-dev 10 | ``` 11 | 12 | ## Install Dependencies 13 | 14 | ```bash 15 | pip install Cython 16 | pip install numpy 17 | pip install bhtsne 18 | ``` 19 | 20 | For Mac (Mojave 10.14.6), install the following additional components. You must have `brew` to install. 21 | 22 | ``` 23 | brew install cairo 24 | brew install pango 25 | ``` 26 | 27 | ## Install SEQC (editable mode) 28 | 29 | ```bash 30 | pip install --editable . 
31 | ``` 32 | 33 | ## Install STAR 34 | 35 | ```bash 36 | curl -OL https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz 37 | tar -xf 2.5.3a.tar.gz 38 | cp STAR-2.5.3a/bin/MacOSX_x86_64/STAR /usr/local/bin/ 39 | ``` 40 | 41 | ## Install samtools 42 | 43 | ```bash 44 | conda install -c bioconda samtools=1.3.1 45 | ``` 46 | 47 | ## Install Packages for Testing 48 | 49 | ```bash 50 | pip install nose 51 | ``` 52 | 53 | ## Install Packages for Linting and Formating 54 | 55 | ```bash 56 | pip install pylint 57 | pip install autopep8 58 | pip install black 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/run-test.md: -------------------------------------------------------------------------------- 1 | # Running Test 2 | 3 | ## Setup 4 | 5 | Set the following environment variables: 6 | 7 | ```bash 8 | export SEQC_TEST_RSA_KEY=/Users/chunj/dpeerlab-chunj.pem 9 | export SEQC_TEST_EMAIL=jaeyoung.chun@gmail.com 10 | export SEQC_TEST_AMI_ID=ami-037cc8c1417e197c1 11 | ``` 12 | 13 | For local test, download test data in S3 to your test machine: 14 | 15 | ``` 16 | aws s3 sync s3://seqc-public/test/ten_x_v2/ ./test-data/datasets/ten_x_v2/ 17 | aws s3 sync s3://seqc-public/barcodes/ten_x_v2/ ./test-data/datasets/barcodes/ten_x_v2/ 18 | aws s3 sync s3://seqc-public/genomes/hg38_chr19/ ./test-data/datasets/genomes/hg38_chr19/ 19 | ``` 20 | 21 | ## Test Everything 22 | 23 | Runs tests based on `nose2.cfg`: 24 | 25 | ```bash 26 | nose2 27 | ``` 28 | 29 | ## SEQC index 30 | 31 | ```bash 32 | nose2 -s src/seqc/tests test_index 33 | ``` 34 | 35 | Besides the nose2 test results, actual SEQC output files can be found here, for example: 36 | 37 | ``` 38 | s3://dp-lab-cicd/seqc/index-ciona_intestinalis-0d19e818-7623-4a1d-bac3-a8c9e3be1e3e/ 39 | ``` 40 | 41 | ## SEQC run 42 | 43 | ### Local 44 | 45 | SEQC will run with `--local`. 46 | 47 | ```bash 48 | nose2 -s src/seqc/tests test_run_e2e_local 49 | ``` 50 | 51 | ### Remote 52 | 53 | SEQC will run on AWS. 54 | 55 | The following will generate a package that can be uploaded to AWS EC2 for testing: 56 | 57 | ```bash 58 | python repackage.py 59 | ``` 60 | 61 | ```bash 62 | nose2 -s src/seqc/tests test_run_e2e_remote 63 | ``` 64 | 65 | Besides the nose2 test results, actual SEQC output files can be found here, for example: 66 | 67 | ``` 68 | s3://dp-lab-cicd/seqc/run-in_drop_v2-a997b408-f883-4ba2-9941-8b541e319850/ 69 | ``` 70 | 71 | ### Clean Up 72 | 73 | ```bash 74 | aws s3 rm s3://dp-lab-cicd/seqc/ --recursive 75 | ``` 76 | -------------------------------------------------------------------------------- /nose2.cfg: -------------------------------------------------------------------------------- 1 | [unittest] 2 | start-dir = src/seqc/tests 3 | test-file-pattern = test_*.py 4 | test-method-prefix = test 5 | -------------------------------------------------------------------------------- /repackage.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | def ignore_test_and_tools(dir_, files): 8 | """Filter files to be moved by shutil.copytree. Ignore any hidden file and the 9 | test and tools directories, which are not needed by the remote instance. 10 | :param dir_: dummy variable, must be present to be passed to shutil.copytree() 11 | :param files: output of os.listdir(), files to be subjected to filtering 12 | :return list: list of files that should be filtered, and not copied. 
13 | """ 14 | return [ 15 | f 16 | for f in files 17 | if (f == "test" or f == "test-data" or f == "__pycache__" or f.startswith(".")) 18 | ] 19 | 20 | 21 | setup_dir = os.path.dirname(os.path.realpath(__file__)) 22 | seqc_dir = os.path.expanduser("~/.seqc/seqc") 23 | 24 | print("setup_dir: {}".format(setup_dir)) 25 | print("seqc_dir: {}".format(seqc_dir)) 26 | 27 | # delete the existing one 28 | if os.path.isdir(seqc_dir): 29 | shutil.rmtree(seqc_dir) 30 | 31 | # copy the SEQC files in the working directory to ~/.seqc/seqc 32 | shutil.copytree(setup_dir, seqc_dir, ignore=ignore_test_and_tools) 33 | 34 | # create .tar.gz of ~/.seqc/seqc/* 35 | shutil.make_archive(base_name=seqc_dir, format="gztar", root_dir=seqc_dir) 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>0.14 2 | numpy>=1.10.0 3 | bhtsne 4 | wikipedia 5 | awscli 6 | numexpr>=2.4 7 | pandas>=1.0.4 8 | paramiko>=2.0.2 9 | regex 10 | requests 11 | nose2 12 | scipy>=1.5.1 13 | boto3 14 | intervaltree 15 | matplotlib 16 | tinydb 17 | tables 18 | fastcluster 19 | statsmodels==0.11.1 20 | ecdsa 21 | jupyter 22 | jinja2 23 | pycrypto 24 | cairocffi==0.8.0 25 | weasyprint==0.42.2 26 | scikit_learn>=0.17 27 | tqdm 28 | pendulum 29 | dask>=2.25.0 30 | distributed>=2.25.0 31 | dill>=0.3.2 32 | bokeh>=2.1.1 33 | numba~=0.51.2 34 | PhenoGraph>=1.5.7 35 | magic@https://github.com/dpeerlab/magic/archive/v0.1.1.tar.gz 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from subprocess import call 5 | from setuptools import setup 6 | from warnings import warn 7 | import py_compile 8 | from pathlib import Path 9 | 10 | 11 | # Replace py_compile.compile with a function that calls it with doraise=True 12 | # so stop when there is a syntax error 13 | orig_py_compile = py_compile.compile 14 | 15 | 16 | def doraise_py_compile(file, cfile=None, dfile=None, doraise=False): 17 | orig_py_compile(file, cfile=cfile, dfile=dfile, doraise=True) 18 | 19 | 20 | py_compile.compile = doraise_py_compile 21 | 22 | if sys.version_info.major != 3: 23 | raise RuntimeError("SEQC requires Python 3") 24 | if sys.version_info.minor < 5: 25 | warn("Multiprocessing analysis methods may not function on Python versions < 3.5") 26 | 27 | main_ns = {} 28 | 29 | # get version 30 | with open("src/seqc/version.py") as f: 31 | exec(f.read(), main_ns) 32 | 33 | setup( 34 | name="seqc", 35 | version=main_ns["__version__"], 36 | description="Single Cell Sequencing Processing and QC Suite", 37 | author="Ambrose J. Carr", 38 | author_email="mail@ambrosejcarr.com", 39 | package_dir={"": "src"}, 40 | package_data={"": ["*.r", "*.R"]}, 41 | packages=[ 42 | "seqc", 43 | "seqc.sequence", 44 | "seqc.alignment", 45 | "seqc.core", 46 | "seqc.stats", 47 | "seqc.summary", 48 | "seqc.notebooks", 49 | ], 50 | install_requires=[ 51 | dep.strip() for dep in Path("requirements.txt").read_text("utf-8").splitlines() 52 | ], 53 | scripts=["src/scripts/SEQC"], 54 | extras_require={"GSEA_XML": ["html5lib", "lxml", "BeautifulSoup4"]}, 55 | include_package_data=True, 56 | ) 57 | 58 | # look for star 59 | if not shutil.which("STAR"): 60 | warn("SEQC: STAR is not installed. 
SEQC will not be able to align files.") 61 | 62 | # get location of setup.py 63 | setup_dir = os.path.dirname(os.path.realpath(__file__)) 64 | seqc_dir = os.path.expanduser("~/.seqc/seqc") 65 | 66 | print("setup_dir: {}".format(setup_dir)) 67 | print("seqc_dir: {}".format(seqc_dir)) 68 | 69 | if os.path.isdir(seqc_dir): 70 | shutil.rmtree(seqc_dir) 71 | 72 | 73 | def ignore_test_and_tools(dir_, files): 74 | """Filter files to be moved by shutil.copytree. Ignore any hidden file and the 75 | test and tools directories, which are not needed by the remote instance. 76 | :param dir_: dummy variable, must be present to be passed to shutil.copytree() 77 | :param files: output of os.listdir(), files to be subjected to filtering 78 | :return list: list of files that should be filtered, and not copied. 79 | """ 80 | return [f for f in files if (f == "test" or f.startswith("."))] 81 | 82 | 83 | # install tools and a local copy of seqc. 84 | # copy seqc repository 85 | shutil.copytree(setup_dir, seqc_dir, ignore=ignore_test_and_tools) 86 | shutil.make_archive(base_name=seqc_dir, format="gztar", root_dir=seqc_dir) 87 | -------------------------------------------------------------------------------- /src/scripts/SEQC: -------------------------------------------------------------------------------- 1 | #!/usr/local/python3 2 | 3 | import sys 4 | from seqc.core.main import main 5 | 6 | if __name__ == "__main__": 7 | main(sys.argv[1:]) 8 | -------------------------------------------------------------------------------- /src/seqc/__init__.py: -------------------------------------------------------------------------------- 1 | from .h5 import H5 2 | from .version import __version__ 3 | from . import stats 4 | # from . import plot 5 | -------------------------------------------------------------------------------- /src/seqc/alignment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/alignment/__init__.py -------------------------------------------------------------------------------- /src/seqc/alignment/sam.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from subprocess import Popen, PIPE 3 | import shutil 4 | import gzip 5 | 6 | 7 | def get_version(): 8 | 9 | proc = Popen(["samtools", "--version"], stderr=PIPE, stdout=PIPE) 10 | out, err = proc.communicate() 11 | if err: 12 | raise ChildProcessError(err) 13 | 14 | # e.g. 15 | # samtools 1.9 16 | # Using htslib 1.9 17 | # Copyright (C) 2018 Genome Research Ltd. 
18 | # --> 'samtools 1.9' 19 | version = out.decode().strip().split("\n")[0] 20 | 21 | # --> '1.9' 22 | version = version.split(" ")[1] 23 | 24 | return version 25 | 26 | 27 | class SamRecord: 28 | """Simple record object allowing access to Sam record properties""" 29 | 30 | __slots__ = ["_record", "_parsed_name_field"] 31 | 32 | NameField = namedtuple("NameField", ["pool", "cell", "rmt", "poly_t", "name"]) 33 | 34 | def __init__(self, record): 35 | self._record = record 36 | self._parsed_name_field = None 37 | 38 | def __repr__(self): 39 | return "".format("\t".join(self._record)) 40 | 41 | def __bytes__(self): 42 | return "\t".join(self._record) + "\n" 43 | 44 | @property 45 | def qname(self) -> str: 46 | return self._record[0] 47 | 48 | @property 49 | def flag(self) -> int: 50 | return int(self._record[1]) 51 | 52 | @property 53 | def rname(self) -> str: 54 | return self._record[2] 55 | 56 | @property 57 | def pos(self) -> int: 58 | return int(self._record[3]) 59 | 60 | @property 61 | def mapq(self) -> int: 62 | return int(self._record[4]) 63 | 64 | @property 65 | def cigar(self) -> str: 66 | return self._record[5] 67 | 68 | @property 69 | def rnext(self) -> str: 70 | return self._record[6] 71 | 72 | @property 73 | def pnext(self) -> int: 74 | return int(self._record[7]) 75 | 76 | @property 77 | def tlen(self) -> int: 78 | return int(self._record[8]) 79 | 80 | @property 81 | def seq(self) -> str: 82 | return self._record[9] 83 | 84 | @property 85 | def qual(self) -> str: 86 | return self._record[10] 87 | 88 | @property 89 | def optional_fields(self): 90 | flags_ = {} 91 | for f in self._record[11:]: 92 | k, _, v = f.split(":") 93 | flags_[k] = int(v) 94 | return flags_ 95 | 96 | def _parse_name_field(self): 97 | fields, name = self.qname.split(";") 98 | processed_fields = fields.split(":") 99 | processed_fields.append(name) 100 | self._parsed_name_field = self.NameField(*processed_fields) 101 | 102 | @property 103 | def pool(self) -> str: 104 | try: 105 | return self._parsed_name_field.pool 106 | except AttributeError: 107 | self._parse_name_field() 108 | return self._parsed_name_field.pool 109 | 110 | @property 111 | def rmt(self) -> str: 112 | try: 113 | return self._parsed_name_field.rmt 114 | except AttributeError: 115 | self._parse_name_field() 116 | return self._parsed_name_field.rmt 117 | 118 | @property 119 | def cell(self) -> str: 120 | try: 121 | return self._parsed_name_field.cell 122 | except AttributeError: 123 | self._parse_name_field() 124 | return self._parsed_name_field.cell 125 | 126 | @property 127 | def poly_t(self) -> str: 128 | try: 129 | return self._parsed_name_field.poly_t 130 | except AttributeError: 131 | self._parse_name_field() 132 | return self._parsed_name_field.poly_t 133 | 134 | @property 135 | def name(self): 136 | try: 137 | return self._parsed_name_field.name 138 | except AttributeError: 139 | self._parse_name_field() 140 | return self._parsed_name_field.name 141 | 142 | @property 143 | def is_mapped(self): 144 | return False if (int(self.flag) & 4) else True 145 | 146 | @property 147 | def is_unmapped(self): 148 | return not self.is_mapped 149 | 150 | @property 151 | def is_multimapped(self): 152 | return True if self.optional_fields["NH"] > 1 else False 153 | 154 | @property 155 | def is_uniquely_mapped(self): 156 | return True if self.optional_fields["NH"] == 1 else False 157 | 158 | @property 159 | def strand(self): 160 | minus_strand = int(self.flag) & 16 161 | return "-" if minus_strand else "+" 162 | 163 | # # todo this takes up 66% of the 
processing time for parsing the sam record 164 | # @property 165 | # def dust_low_complexity_score(self) -> int: 166 | # 167 | # # Counts of 3-mers in the sequence 168 | # counts = {} 169 | # for i in range(len(self.seq) - 2): 170 | # kmer = self.seq[i:i + 3] 171 | # counts[kmer] = counts.get(kmer, 0) + 1 172 | # 173 | # # Calculate dust score # todo this is 30% faster when vectorized 174 | # score = sum([i * (i - 1) / 2 for i in counts.values()]) / (len(self.seq) - 3) 175 | # 176 | # # Scale score (Max score possible is no. of 3mers/2) 177 | # score = int(score / ((len(self.seq) - 2) / 2) * 100) 178 | # 179 | # return score 180 | 181 | 182 | class Reader: 183 | """Simple sam reader, optimized for utility rather than speed""" 184 | 185 | def __init__(self, samfile: str): 186 | """ 187 | :param samfile: str, location of a .sam file 188 | 189 | usage: 190 | if rd = Reader(samfile) 191 | :method __iter__: iterate over the .sam file's records (also usable in for loop) 192 | :method __len__: return the number of alignments in the file 193 | :method itermultialignments: return tuples of multiple alignments, all from the 194 | same fastq record 195 | """ 196 | 197 | self._samfile = samfile 198 | try: 199 | samfile_iterator = iter(self) 200 | next(samfile_iterator) 201 | except RuntimeError as ex: 202 | raise ex 203 | except: 204 | raise ValueError( 205 | "%s is an invalid samfile. Please check file formatting." % samfile 206 | ) 207 | 208 | @property 209 | def samfile(self): 210 | return self._samfile 211 | 212 | def _open(self): 213 | """ 214 | seamlessly open self._samfile, whether gzipped or uncompressed 215 | :returns: open file object 216 | """ 217 | if self.samfile.endswith(".gz"): 218 | fobj = gzip.open(self.samfile, "rb") 219 | elif self.samfile.endswith(".bam"): 220 | if not shutil.which("samtools"): 221 | raise RuntimeError("samtools utility must be installed to run bamfiles") 222 | p = Popen(["samtools", "view", self.samfile], stdout=PIPE) 223 | fobj = p.stdout 224 | else: 225 | fobj = open(self.samfile, "rb") 226 | return fobj 227 | 228 | def __len__(self): 229 | return sum(1 for _ in self) 230 | 231 | def __iter__(self): 232 | """return an iterator over all non-header records in samfile""" 233 | fobj = self._open() 234 | try: 235 | for line in fobj: 236 | line = line.decode() 237 | # todo move this if statement to execute only until header is exhausted 238 | if line.startswith("@"): 239 | continue 240 | yield SamRecord(line.strip().split("\t")) 241 | finally: 242 | fobj.close() 243 | 244 | def iter_multialignments(self): 245 | """yields tuples of all alignments for each fastq record""" 246 | sam_iter = iter(self) 247 | fq = [next(sam_iter)] 248 | for record in sam_iter: 249 | if record.qname == fq[0].qname: 250 | fq.append(record) 251 | else: 252 | yield tuple(fq) 253 | fq = [record] 254 | yield tuple(fq) 255 | -------------------------------------------------------------------------------- /src/seqc/alignment/star.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from multiprocessing import cpu_count 3 | from os import makedirs 4 | import shlex 5 | 6 | 7 | def get_version(): 8 | 9 | proc = Popen(["STAR", "--version"], stderr=PIPE, stdout=PIPE) 10 | out, err = proc.communicate() 11 | if err: 12 | raise ChildProcessError(err) 13 | 14 | version = out.decode().strip() 15 | 16 | if version.startswith("STAR_"): 17 | # e.g. 
STAR_2.5.3a 18 | # --> 2.5.3a 19 | return out.decode().strip().split("_")[1] 20 | else: 21 | # e.g. 2.7.3a 22 | return version 23 | 24 | 25 | def default_alignment_args( 26 | fastq_records: str, n_threads: int or str, index: str, output_dir: str 27 | ) -> dict: 28 | """default arguments for STAR alignment 29 | 30 | To report unaligned reads, add '--outSAMunmapped': 'Within', 31 | 32 | :param fastq_records: str, name of fastq file 33 | :param n_threads: int or str, number of threads to allocate when calling STAR 34 | :param index: str, location of the STAR index 35 | :param output_dir: str, prefix for output files 36 | :return: dict, default alignment arguments 37 | """ 38 | default_align_args = { 39 | "--runMode": "alignReads", 40 | "--runThreadN": str(n_threads), 41 | "--genomeDir": index, 42 | "--outFilterType": "BySJout", 43 | "--outFilterMultimapNmax": "10", # require unique alignments 44 | "--limitOutSJcollapsed": "2000000", # deal with many splice variants 45 | "--alignSJDBoverhangMin": "8", 46 | "--outFilterMismatchNoverLmax": "0.04", 47 | "--alignIntronMin": "20", 48 | "--alignIntronMax": "1000000", 49 | "--readFilesIn": fastq_records, 50 | "--outSAMprimaryFlag": "AllBestScore", # all equal-scoring reads are primary 51 | "--outSAMtype": "BAM Unsorted", 52 | "--outFileNamePrefix": output_dir, 53 | } 54 | if fastq_records.endswith(".gz"): 55 | default_align_args["--readFilesCommand"] = "gunzip -c" 56 | if fastq_records.endswith(".bz2"): 57 | default_align_args["--readFilesCommand"] = "bunzip2 -c" 58 | return default_align_args 59 | 60 | 61 | def align( 62 | fastq_file: str, 63 | index: str, 64 | n_threads: int, 65 | alignment_dir: str, 66 | reverse_fastq_file: str or bool = None, 67 | **kwargs 68 | ) -> str: 69 | """align a fastq file, or a paired set of fastq files 70 | 71 | :param fastq_file: str, location of a fastq file 72 | :param index: str, folder containing the STAR index 73 | :param n_threads: int, number of parallel alignment processes to spawn 74 | :param alignment_dir: str, directory for output data 75 | :param reverse_fastq_file: optional, location of reverse paired-end fastq file 76 | :param kwargs: additional kwargs for STAR, passed without the leading '--' 77 | :return: str, .sam file location 78 | """ 79 | 80 | runtime_args = default_alignment_args(fastq_file, n_threads, index, alignment_dir) 81 | 82 | for k, v in kwargs.items(): # overwrite or add any arguments passed from cmdline 83 | if not isinstance(k, str): 84 | try: 85 | k = str(k) 86 | except ValueError: 87 | raise ValueError("arguments passed to STAR must be strings") 88 | if not isinstance(v, str): 89 | try: 90 | v = str(v) 91 | except ValueError: 92 | raise ValueError("arguments passed to STAR must be strings") 93 | runtime_args["--" + k] = v 94 | 95 | # construct command line arguments for STAR 96 | cmd = ["STAR"] 97 | if reverse_fastq_file: 98 | for key, value in runtime_args.items(): 99 | if key == "--readFilesIn": 100 | cmd.extend((key, value)) 101 | cmd.append(reverse_fastq_file) 102 | else: 103 | cmd.extend((key, value)) 104 | else: 105 | for pair in runtime_args.items(): 106 | cmd.extend(pair) 107 | 108 | cmd = shlex.split(" ".join(cmd)) 109 | aln = Popen(cmd, stderr=PIPE, stdout=PIPE) 110 | _, err = aln.communicate() 111 | if err: 112 | raise ChildProcessError(err) 113 | 114 | return alignment_dir + "Aligned.out.bam" 115 | 116 | 117 | def create_index( 118 | fasta: str, gtf: str, genome_dir: str, read_length: int = 75, **kwargs 119 | ) -> None: 120 | """Create a new STAR index 121 | 122 | :param 
fasta: complete filepath to fasta file 123 | :param gtf: complete filepath to gtf file 124 | :param genome_dir: directory in which new index should be constructed 125 | :param read_length: length of reads that will be aligned against this index 126 | :param kwargs: additional keyword arguments to pass to the genome construction call. 127 | to pass --sjdbFileChrStartEnd filename, pass sjdbFileChrStartEnd=filename (no --) 128 | :return: None 129 | """ 130 | ncpu = str(cpu_count()) 131 | makedirs(genome_dir, exist_ok=True) 132 | overhang = str(read_length - 1) 133 | 134 | # Popen is hard to work as far as process substitution is concerned. 135 | # let's just gunzip it before passing to STAR. 136 | if fasta.endswith(".gz"): 137 | proc_gunzip = Popen(["gunzip", fasta]) 138 | out, err = proc_gunzip.communicate() 139 | if err: 140 | raise ChildProcessError(err) 141 | fasta = fasta.replace(".gz", "") 142 | 143 | cmd = [ 144 | "STAR", 145 | "--runMode", 146 | "genomeGenerate", 147 | "--runThreadN", 148 | ncpu, 149 | "--genomeDir", 150 | genome_dir, 151 | "--genomeFastaFiles", 152 | fasta, 153 | "--sjdbGTFfile", 154 | gtf, 155 | "--sjdbOverhang", 156 | overhang, 157 | ] 158 | 159 | for k, v in kwargs.items(): 160 | cmd.append("--{}".format(k)) 161 | cmd.append(v) 162 | 163 | p = Popen(cmd, stderr=PIPE, stdout=PIPE) 164 | out, err = p.communicate() 165 | if err: 166 | raise ChildProcessError(err) 167 | -------------------------------------------------------------------------------- /src/seqc/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .progress import progress 2 | from .run import run 3 | from .index import index 4 | from .instances import instances 5 | from .terminate import terminate 6 | from .start import start 7 | from .notebook import notebook -------------------------------------------------------------------------------- /src/seqc/core/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | from seqc import io 3 | 4 | 5 | def s3_data(files_or_links, output_prefix): 6 | """downloads any data provided by s3 links, otherwise gets list of files. 7 | 8 | :param list files_or_links: str files or str s3 links to files 9 | :param str output_prefix: prefix to prepend files 10 | :returns list files: filename(s) of downloaded files 11 | """ 12 | files = [] 13 | for f in files_or_links: 14 | if not f.startswith("s3://"): 15 | if f.endswith("/"): 16 | files.extend(f + subfile for subfile in os.listdir(f)) 17 | else: 18 | files.append(f) 19 | else: 20 | recursive = True if f.endswith("/") else False 21 | files.extend( 22 | io.S3.download(f, output_prefix, overwrite=True, recursive=recursive) 23 | ) 24 | return files 25 | -------------------------------------------------------------------------------- /src/seqc/core/index.py: -------------------------------------------------------------------------------- 1 | def index(args): 2 | """create an index for SEQC. 3 | 4 | :param args: parsed arguments. 
This function is only called if subprocess_name is 5 | 'index' 6 | """ 7 | 8 | # functions to be pickled and run remotely must import all their own modules 9 | import sys 10 | import logging 11 | from seqc import ec2, log, io 12 | from seqc.sequence.index import Index 13 | from seqc.alignment import star 14 | from seqc import version 15 | 16 | logging.basicConfig( 17 | level=logging.DEBUG, 18 | handlers=[ 19 | logging.FileHandler(args.log_name), 20 | logging.StreamHandler(sys.stdout), 21 | ], 22 | ) 23 | 24 | log.info("SEQC=v{}".format(version.__version__)) 25 | log.info("STAR=v{}".format(star.get_version())) 26 | log.args(args) 27 | 28 | with ec2.instance_clean_up( 29 | email=args.email, 30 | upload=args.upload_prefix, 31 | log_name=args.log_name, 32 | debug=args.debug, 33 | terminate=args.terminate, 34 | running_remote=args.remote, 35 | ): 36 | 37 | idx = Index(args.organism, args.ids, args.folder) 38 | idx.create_index( 39 | s3_location=args.upload_prefix, 40 | ensemble_release=args.ensemble_release, 41 | read_length=args.read_length, 42 | valid_biotypes=args.valid_biotypes, 43 | ) 44 | 45 | # upload the log file (seqc_log.txt, nohup.log, Log.out) 46 | if args.upload_prefix: 47 | bucket, key = io.S3.split_link(args.upload_prefix) 48 | for item in [args.log_name, "./nohup.log", "./Log.out"]: 49 | try: 50 | ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key) 51 | log.info( 52 | "Successfully uploaded {} to {}".format( 53 | item, args.upload_prefix 54 | ) 55 | ) 56 | except FileNotFoundError: 57 | log.notify( 58 | "Item {} was not found! Continuing with upload...".format(item) 59 | ) 60 | 61 | log.info("DONE.") 62 | -------------------------------------------------------------------------------- /src/seqc/core/instances.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | 4 | 5 | def instances(args): 6 | """list instances and return 7 | 8 | :param args: namespace object from argparse, must contain args.rsa_key, the path to 9 | the rsa-key used to start the instances you want to list 10 | :return None: 11 | """ 12 | 13 | if args.rsa_key is None: 14 | raise ValueError('-k/--rsa-key does not point to a valid file object. ') 15 | if not os.path.isfile(args.rsa_key): 16 | raise ValueError('-k/--rsa-key does not point to a valid file object. 
') 17 | 18 | keyname = args.rsa_key.rpartition('.')[0].rpartition('/')[-1] 19 | 20 | ec2 = boto3.resource('ec2') 21 | all_instances = ec2.instances.filter( 22 | Filters=[ 23 | {'Name': 'key-name', 24 | 'Values': [keyname]}]) 25 | for i in all_instances.all(): 26 | print('id: %s, type: %s, launch-time: %s, state: %s, ip %s' % ( 27 | i.id, i.instance_type, str(i.launch_time), i.state, i.public_ip_address)) 28 | -------------------------------------------------------------------------------- /src/seqc/core/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import sys 4 | from seqc import core 5 | from seqc.core import parser, verify 6 | from seqc import ec2 7 | import boto3 8 | 9 | 10 | def clean_up_security_groups(): 11 | """ 12 | Clean up all the unused security groups that were created/started using SEQC, 13 | once the number of unused ones reaches 100 14 | """ 15 | ec2 = boto3.resource("ec2") 16 | sgs = list(ec2.security_groups.all()) 17 | insts = list(ec2.instances.all()) 18 | 19 | all_sgs = set([sg.group_name for sg in sgs]) # get all security groups 20 | all_inst_sgs = set( 21 | [sg["GroupName"] for inst in insts for sg in inst.security_groups] 22 | ) # get security groups associated with instances 23 | unused_sgs = all_sgs - all_inst_sgs # get ones without instance association 24 | 25 | if len(unused_sgs) >= 100: 26 | print("Cleaning up the unused security groups:") 27 | client = boto3.client("ec2") 28 | for g in unused_sgs: 29 | all_inst_sgs = set( 30 | [sg["GroupName"] for inst in insts for sg in inst.security_groups] 31 | ) # deletion takes a while, so re-check which groups are still in use 32 | if g.startswith("SEQC") and ( 33 | g not in all_inst_sgs 34 | ): # only delete groups that were created by SEQC and are still unused 35 | client.delete_security_group(GroupName=g) 36 | print(g + " deleted") 37 | 38 | 39 | def main(argv): 40 | """Check arguments, then call the appropriate sub-module 41 | 42 | Created to allow the main pipeline to be tested from the earliest entry point 43 | (command-line arguments). 44 | 45 | :param argv: output of sys.argv[1:] 46 | """ 47 | arguments = parser.parse_args(argv) 48 | 49 | func = getattr(core, arguments.subparser_name) 50 | assert func is not None 51 | 52 | # notebooks execute locally 53 | if arguments.subparser_name == "notebook": 54 | return func(arguments) 55 | 56 | if arguments.remote: 57 | # todo improve how verification works; it's not really necessary, what is needed 58 | # is a method to determine volume size for remote.
59 | verification_func = getattr(verify, arguments.subparser_name) 60 | verified_args = verification_func(arguments) 61 | remote_args = { 62 | k: getattr(verified_args, k) 63 | for k in ( 64 | "rsa_key", 65 | "instance_type", 66 | "spot_bid", 67 | "volume_size", 68 | "user_tags", 69 | "remote_update", 70 | "ami_id", 71 | ) 72 | if getattr(verified_args, k) 73 | } 74 | 75 | # store the command-line arguments supplied by the user 76 | # the same arguments will be used to run SEQC on EC2 77 | remote_args["argv"] = argv 78 | 79 | # clean up AWS security groups 80 | clean_up_security_groups() 81 | 82 | # start EC2 instance and run the function 83 | ec2.AWSInstance(synchronous=False, **remote_args)(func)(verified_args) 84 | else: 85 | # run the function locally 86 | func(arguments) 87 | 88 | 89 | if __name__ == "__main__": 90 | main(sys.argv[1:]) 91 | -------------------------------------------------------------------------------- /src/seqc/core/notebook.py: -------------------------------------------------------------------------------- 1 | from seqc.notebooks.notebooks import Notebook 2 | from seqc import log 3 | 4 | 5 | def notebook(args): 6 | if args.subsubparser_name == 'merge': 7 | # need to also take an output directory because this step writes files. 8 | # then merge the things 9 | # then return? 10 | n = Notebook(args.output_filename, *args.input_data) 11 | n.merge_data(merged_sample_name=args.output_filename) 12 | log.info('Merged samples written to %s' % args.input_data) 13 | elif args.subsubparser_name == 'generate': 14 | n = Notebook(args.output_stem, args.input_count_matrix) 15 | n.write_template() 16 | log.info('Notebook Template written to %s' % n.notebook_path) 17 | n.run_notebook() 18 | log.info('Notebook Run and written to %s' % n.notebook_path) 19 | 20 | -------------------------------------------------------------------------------- /src/seqc/core/progress.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | from seqc import ec2 3 | from paramiko.ssh_exception import AuthenticationException 4 | from botocore.exceptions import ClientError 5 | 6 | 7 | def progress(args): 8 | """print progress of requested seqc run(s) to less 9 | 10 | :param args: namespace object from argparse, must include rsa-key and instance-id 11 | :return None: 12 | """ 13 | if args.rsa_key is None: 14 | raise ValueError('User must supply -k/--rsa-key or set the environment variable ' 15 | 'AWS_RSA_KEY') 16 | 17 | if args.instance_ids is None: 18 | raise ValueError('No instances specified. Please supply an instance using the -i ' 19 | 'parameter.') 20 | 21 | for id_ in args.instance_ids: 22 | connection = ec2.SSHConnection(id_, args.rsa_key) 23 | try: 24 | out, err = connection.execute('cat ./seqc_log.txt') 25 | except AuthenticationException: 26 | raise ValueError('instance %s cannot be found.' % repr(id_)) 27 | except ClientError: 28 | raise ValueError('instance %s cannot be found.' % repr(id_)) 29 | p = Popen(['less'], stdin=PIPE) 30 | p.communicate(input='\n'.join(out).encode()) 31 | -------------------------------------------------------------------------------- /src/seqc/core/start.py: -------------------------------------------------------------------------------- 1 | from seqc import ec2 2 | import os 3 | 4 | 5 | def start(args): 6 | """start an aws instance""" 7 | 8 | if args.rsa_key is None: 9 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 10 | if not os.path.isfile(args.rsa_key): 11 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 12 | 13 | instance = ec2.AWSInstance( 14 | rsa_key=args.rsa_key, 15 | instance_type=args.instance_type, 16 | spot_bid=args.spot_bid, 17 | volume_size=args.volume_size, 18 | ami_id=args.ami_id, 19 | ) 20 | 21 | instance.start() 22 | -------------------------------------------------------------------------------- /src/seqc/core/terminate.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore.exceptions import ClientError 3 | 4 | 5 | def terminate(args): 6 | """terminate the requested ec2 instance(s) 7 | 8 | :param args: namespace object from argparse, must include instance-ids 9 | :return None: 10 | """ 11 | ec2 = boto3.resource("ec2") 12 | for id_ in args.instance_ids: 13 | instance = ec2.Instance(id=id_) 14 | try: 15 | response = instance.terminate() 16 | print("termination signal sent:\n%s" % response) 17 | except ClientError: 18 | print("instance %s does not exist" % id_) 19 | -------------------------------------------------------------------------------- /src/seqc/core/verify.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import inspect 4 | from math import ceil 5 | from seqc import io, platforms, ec2 6 | 7 | 8 | def filesize(filename): 9 | """return filesize of filename in bytes 10 | 11 | :param str filename: full path to file 12 | :return int: number of bytes in filename 13 | """ 14 | return os.stat(filename).st_size 15 | 16 | 17 | def validate_and_return_size(filename): 18 | """return the size of the file or directory that a filepath or s3 link points to 19 | 20 | :param str filename: filepath or s3 link 21 | :return int: size in bytes; raises errors if the path or link is invalid. 22 | """ 23 | if filename.startswith("s3://"): 24 | io.S3.check_links([filename]) 25 | return io.S3.obtain_size(filename) 26 | else: 27 | if os.path.isfile(filename): 28 | return filesize(filename) 29 | elif os.path.isdir(filename.rstrip("/")): 30 | return sum(filesize(filename + f) for f in os.listdir(filename)) 31 | else: 32 | print(filename) 33 | raise ValueError("%s does not point to a valid file" % filename) 34 | 35 | 36 | def estimate_required_volume_size(args): 37 | """estimate the size of volume that should be attached to an aws instance to run SEQC 38 | 39 | :param args: namespace object containing filepaths or download links to input data 40 | :return int: size of volume in gb 41 | """ 42 | # using worst-case estimates to make sure we don't run out of space, 35 = genome index 43 | total = (35 * 1e10) + sum(validate_and_return_size(f) for f in args.barcode_files) 44 | 45 | # todo stopped here; remove aws dependency 46 | if args.barcode_fastq and args.genomic_fastq: 47 | total += ( 48 | sum(validate_and_return_size(f) for f in args.barcode_fastq) * 14 + 9e10 49 | ) 50 | total += ( 51 | sum(validate_and_return_size(f) for f in args.genomic_fastq) * 14 + 9e10 52 | ) 53 | total += validate_and_return_size(args.index) 54 | 55 | elif args.alignment_file: 56 | total += (validate_and_return_size(args.alignment_file) * 2) + 4e10 57 | total += validate_and_return_size(args.index) 58 | 59 | elif args.merged_fastq: 60 | total += (validate_and_return_size(args.merged_fastq) * 13) + 9e10 61 | total += validate_and_return_size(args.index) 62 | 63 | elif args.read_array: 64 | total += validate_and_return_size(args.read_array) 65 | 66 | if args.basespace: 67 | if not args.basespace_token or args.basespace_token == "None": 68 | raise ValueError( 69 | "If the --basespace argument is used, the basespace token must be " 70 | "specified in the seqc config file or passed as --basespace-token" 71 | ) 72 | 73 | io.BaseSpace.check_sample(args.basespace, args.basespace_token) 74 | total += ( 75 | io.BaseSpace.check_size(args.basespace, args.basespace_token) * 14 + 9e10 76 | ) 77 | 78 | return ceil(total * 1e-9) 79 | 80 | 81 | def run(args): 82 | """ 83 | verifies data input through the command line arguments, fixes minor issues, and 84 | throws exceptions if invalid parameters are encountered 85 | 86 | additionally, this function obtains a rough estimate of how much 87 | volume storage is needed for a remote run. 88 | 89 | :param Namespace args: Namespace object, output from ArgumentParser.parse_args() 90 | :returns args: Namespace object with volume_size (in GB) set for a remote run. 91 | """ 92 | 93 | if args.rsa_key is None: 94 | raise ValueError("-k/--rsa-key does not point to a valid file object. ") 95 | if not os.path.isfile(args.rsa_key): 96 | raise ValueError("-k/--rsa-key does not point to a valid file object. 
") 97 | 98 | if args.output_prefix.endswith("/"): 99 | raise ValueError("output_stem should not be a directory.") 100 | if not args.index.endswith("/"): 101 | raise ValueError('index must be a directory, and must end with "/"') 102 | 103 | # check platform name; raises ValueError if invalid 104 | platform_name(args.platform) 105 | 106 | # check to make sure that --email-status is passed with remote run 107 | if args.remote and not args.email: 108 | raise ValueError("Please supply the --email-status flag for a remote SEQC run.") 109 | # if args.instance_type not in ['c3', 'c4', 'r3']: # todo fix this instance check 110 | # raise ValueError('All AWS instance types must be either c3, c4, or r3.') 111 | # if args.terminate not in ['True', 'true', 'False', 'false', 'on-success']: 112 | # raise ValueError('the --no-terminate flag must be either True, False, ' 113 | # 'or on-success.') 114 | 115 | # make sure at least one input has been passed 116 | valid_inputs = ( 117 | args.barcode_fastq, 118 | args.genomic_fastq, 119 | args.merged_fastq, 120 | args.alignment_file, 121 | args.basespace, 122 | args.read_array, 123 | ) 124 | if not any(valid_inputs): 125 | raise ValueError( 126 | "At least one input argument (-b/-g, -m, -s, -r, --basespace) must be passed " 127 | "to SEQC." 128 | ) 129 | if not args.barcode_files: # todo clean this up and fold into platform somehow 130 | if args.platform != "drop_seq": 131 | raise ValueError("--barcode-files is required for this platform.") 132 | 133 | # make sure at most one input type has been passed 134 | num_inputs = 0 135 | if args.barcode_fastq or args.genomic_fastq: 136 | if not all((args.barcode_fastq, args.genomic_fastq)): 137 | raise ValueError( 138 | "if either genomic or barcode fastq are provided, both must be provided" 139 | ) 140 | num_inputs += 1 141 | num_inputs += sum( 142 | 1 143 | for i in ( 144 | args.merged_fastq, 145 | args.alignment_file, 146 | args.basespace, 147 | args.read_array, 148 | ) 149 | if i 150 | ) 151 | if num_inputs > 1: 152 | raise ValueError( 153 | "user should provide at most one input argument (-b/-g, -m, -s, -r, " 154 | "--basespace" 155 | ) 156 | 157 | # if basespace is being used, make sure there is a valid basespace token 158 | if args.basespace and not hasattr(args, "basespace_token"): 159 | raise RuntimeError( 160 | "if --basespace input is selected, user must provide an OAuth " 161 | "token using the --basespace-token parameter." 162 | ) 163 | 164 | # check that spot-bid is correct 165 | if args.spot_bid is not None: 166 | if args.spot_bid < 0: 167 | raise ValueError("bid %f must be a non-negative float." % args.spot_bid) 168 | 169 | if args.upload_prefix and not args.upload_prefix.startswith("s3://"): 170 | raise ValueError("upload_prefix should be an s3 address beginning with s3://") 171 | 172 | if args.upload_prefix.startswith("s3://"): 173 | ec2.check_bucket(args.upload_prefix) 174 | 175 | if args.volume_size is None: 176 | setattr(args, "volume_size", estimate_required_volume_size(args)) 177 | 178 | return args 179 | 180 | 181 | def index(args): 182 | """add a default volume_size if it was not otherwise passed to seqc. 183 | 184 | :param args: namespace object from argparse 185 | :return: updated namespace object with volume_size set. 186 | """ 187 | if args.volume_size is None: 188 | setattr(args, "volume_size", 100) 189 | return args 190 | 191 | 192 | def executables(*execs): 193 | """ 194 | checks whether executables are installed on the machine of the 195 | current seqc run. 
196 | 197 | :param execs: Tuple of executables to check 198 | :returns : Tuple of boolean (True if a specific executable is installed). 199 | """ 200 | return tuple(map(lambda exe: shutil.which(exe) is not None, execs)) 201 | 202 | 203 | def platform_name(name: str): 204 | """ 205 | checks whether the platform name supplied by the user is supported by the current 206 | iteration of seqc. 207 | :param name: string of platform name to check 208 | :return: name (if supported by seqc). 209 | """ 210 | choices = [ 211 | x[0] 212 | for x in inspect.getmembers(platforms, inspect.isclass) 213 | if issubclass(x[1], platforms.AbstractPlatform) 214 | ][1:] 215 | if name not in choices: 216 | raise ValueError( 217 | "Please specify a valid platform name for SEQC. The available " 218 | "options are: {}".format(choices) 219 | ) 220 | # throw error for mars1_seq since we don't have the appropriate primer length yet 221 | if name == "mars1_seq": 222 | raise ValueError("Mars1-seq is currently not stable in this version of SEQC.") 223 | return name 224 | -------------------------------------------------------------------------------- /src/seqc/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def jsd(p, q) -> float: 5 | """Jensen Shannon distance of two variables normalized variables p and q 6 | 7 | Note that if p and q are not normalized, this function will not return a proper 8 | distance, so matrices should be normalized prior to use 9 | 10 | use with sklearn.NearestNeighbors: 11 | 12 | >>> from sklearn.neighbors import NearestNeighbors 13 | >>> # set some dummy variables 14 | >>> data = np.random.random((100, 100)) 15 | >>> data = data / data.sum(axis=1)[:, np.newaxis] # norm rows 16 | >>> assert(np.all(np.array(data.sum(axis=1) == 1)))3 17 | >>> k = 10 18 | >>> 19 | >>> nn = NearestNeighbors(k=k, metric='pyfunc', algorithm='ball_tree', 20 | >>> metric_params={'func': jsd}) 21 | >>> nn.fit(data) 22 | 23 | Parameters 24 | ---------- 25 | p, q : np.array 26 | 27 | Returns 28 | ------- 29 | float : kl divergence between p and q 30 | """ 31 | idx = np.logical_or(p != 0, q != 0) 32 | p = p[idx] 33 | q = q[idx] 34 | m = (p + q) / 2 35 | return np.sqrt((.5 * kldiv(p, m)) + (.5 * kldiv(q, m))) 36 | 37 | 38 | def kldiv(x: np.ndarray, m: np.ndarray) -> float: 39 | """Modified Kullback-Liebler divergence of two variables x and m. 40 | 41 | depends upon normalization done by jsd parent function, namely that (1) there are no 42 | zero-valued entries in m, and (2) both x and m are probability distributions that 43 | sum to 1 44 | 45 | Parameters 46 | ---------- 47 | x, m : normalized probability vectors 48 | 49 | Returns 50 | ------- 51 | float : kl divergence between p and q 52 | """ 53 | return np.nansum(x * np.log2(x / m)) 54 | -------------------------------------------------------------------------------- /src/seqc/email_.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | import os 3 | 4 | 5 | def email_user(attachment: str, email_body: str, email_address: str) -> None: 6 | """ 7 | sends an email to email address with text contents of email_body and attachment 8 | attached. 
Email will come from "ec2-User@ 9 | 10 | :param attachment: the file location of the attachment to append to the email 11 | :param email_body: text to send in the body of the email 12 | :param email_address: the address to which the email should be sent""" 13 | 14 | # todo if remote is sending double emails, add quotes around attachment. 15 | if isinstance(email_body, str): 16 | email_body = email_body.encode() 17 | email_args = ['mutt', '-e', 'set content_type="text/html"', '-a', attachment, '-s', 18 | 'Remote Process', '--', email_address] 19 | email_process = Popen(email_args, stdin=PIPE) 20 | email_process.communicate(email_body) 21 | -------------------------------------------------------------------------------- /src/seqc/exceptions.py: -------------------------------------------------------------------------------- 1 | class RetryLimitExceeded(Exception): 2 | pass 3 | 4 | 5 | class InstanceNotRunningError(Exception): 6 | pass 7 | 8 | 9 | class EC2RuntimeError(Exception): 10 | pass 11 | 12 | 13 | class ConfigurationError(Exception): 14 | pass 15 | 16 | 17 | class ArgumentParserError(Exception): 18 | pass 19 | 20 | 21 | class EmptyMatrixError(Exception): 22 | pass 23 | -------------------------------------------------------------------------------- /src/seqc/h5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | 6 | class H5: 7 | 8 | def __init__(self, archive_name: str): 9 | """Wrapper for the pandas HDFStore class which ensures that all interactions with 10 | the archive result in a closed, flushed archive. 11 | 12 | In order to ensure data usability, all data must be submitted in DataFrame format. 13 | This decision was made to encourage users to pair metadata with sequencing data, 14 | and reduce the incidence of unexpected data permutation. 15 | 16 | :param archive_name: name of the h5 archive to open. If the archive does not exist 17 | it will be created using a blosc5 filter 18 | 19 | :method ls: list contents of the archive 20 | :method save: save an object to the h5 archive 21 | :method load: load an object from the archive 22 | :method remove: remove a DataFrame from the archive 23 | :method is_open: returns True if the h5 archive is open, else False 24 | """ 25 | if os.path.isfile(archive_name): 26 | self._archive = pd.HDFStore(archive_name, mode='a') 27 | self._archive.close() 28 | else: 29 | self._archive = pd.HDFStore( 30 | archive_name, mode='a', complib='blosc', complevel=5) 31 | self._archive.close() 32 | 33 | def __repr__(self): 34 | self._archive.open() 35 | try: 36 | return repr(self._archive) 37 | finally: 38 | self._archive.close() 39 | 40 | def save(self, data: pd.DataFrame, location: str) -> None: 41 | """Save DataFrame data to the h5 archive in location. 42 | 43 | :param data: DataFrame object to store 44 | :param location: filepath to save the object in the h5 hierarchy 45 | """ 46 | if not isinstance(data, pd.DataFrame): 47 | if isinstance(data, np.ndarray): 48 | res = input('np.ndarray class detected. Save as pd.DataFrame with ' 49 | 'ascending integer indices? [y/n] ') 50 | if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 51 | data = pd.DataFrame(data) 52 | else: 53 | print('User elected not to save DataFrame, archive is unmodified.') 54 | return 55 | else: 56 | raise TypeError('only pd.DataFrame objects can be saved using this ' 57 | 'class. 
To save np.ndarray objects please see the tables ' 58 | 'package.') 59 | self._archive.open() 60 | try: 61 | self._archive[location] = data 62 | finally: 63 | self._archive.close() 64 | 65 | def load(self, location: str) -> None: 66 | """Load and return the dataframe found at location in the archive. 67 | 68 | :param location: str, location of object to retrieve from h5 69 | :return: pd.DataFrame, object found at location 70 | """ 71 | self._archive.open() 72 | try: 73 | return self._archive[location] 74 | finally: 75 | self._archive.close() 76 | 77 | def ls(self) -> None: 78 | """list archive contents""" 79 | try: 80 | self._archive.open() 81 | print(self._archive) 82 | finally: 83 | self._archive.close() 84 | 85 | def remove(self, location: str) -> None: 86 | """remove the DataFrame at location from the archive 87 | 88 | Note: removing a dataframe at a branch node will remove all leaves sharing this 89 | prefix. e.g. in an archive containing: 90 | 91 | /data 92 | /data/filtered 93 | /data/metadata 94 | /new_data/data 95 | 96 | removing /data would remove the first three DataFrame objects from the archive. 97 | 98 | :param location: location of DataFrame to remove 99 | :return: None 100 | """ 101 | 102 | self._archive.open() 103 | try: 104 | if location not in self._archive.keys(): 105 | raise ValueError( 106 | '{} not contained in archive, nothing to remove.'.format(location)) 107 | else: 108 | removed = [k for k in self._archive.keys() 109 | if k.startswith(location + '/')] 110 | if len(removed) != 0: 111 | res = input( 112 | 'Removing branch node {}, which is a prefix for {!a} will remove ' 113 | 'all listed DataFrames. Continue with removal? [y/n] '.format( 114 | location, removed)) 115 | if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']: 116 | print('returned without deletion.') 117 | return 118 | self._archive.remove(location) 119 | finally: 120 | self._archive.close() 121 | 122 | @property 123 | def is_open(self) -> bool: 124 | return self._archive.is_open 125 | -------------------------------------------------------------------------------- /src/seqc/multialignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools 3 | import time 4 | import seqc 5 | 6 | 7 | class UnionFind: 8 | """Union-find data structure. 9 | 10 | Each unionFind instance X maintains a family of disjoint sets of 11 | hashable objects, supporting the following two methods: 12 | 13 | - X[item] returns a name for the set containing the given item. 14 | Each set is named by an arbitrarily-chosen one of its members; as 15 | long as the set remains unchanged it will keep the same name. If 16 | the item is not yet part of a set in X, a new singleton set is 17 | created for it. 18 | 19 | - X.union(item1, item2, ...) merges the sets containing each item 20 | into a single larger set. If any item is not yet part of a set 21 | in X, it is added to X as one of the members of the merged set. 
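# --- editor's illustrative sketch (not part of multialignment.py) ------------
# The two operations described above, shown on plain integers.
from seqc.multialignment import UnionFind

uf = UnionFind()
uf.union(1, 2)          # 1 and 2 now share a set
uf.union(2, 3)          # 3 joins the same set
assert uf[1] == uf[3]   # same arbitrarily-chosen representative
assert uf[4] != uf[1]   # 4 is placed in its own new singleton set
# ------------------------------------------------------------------------------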
22 | """ 23 | 24 | def __init__(self): 25 | """Create a new empty union-find structure.""" 26 | self.weights = {} 27 | self.parents = {} 28 | 29 | def __getitem__(self, obj): 30 | """Find and return the name of the set containing the object.""" 31 | 32 | # check for previously unknown object 33 | if obj not in self.parents: 34 | self.parents[obj] = obj 35 | self.weights[obj] = 1 36 | return obj 37 | 38 | # find path of objects leading to the root 39 | path = [obj] 40 | root = self.parents[obj] 41 | while root != path[-1]: 42 | path.append(root) 43 | root = self.parents[root] 44 | 45 | # compress the path and return 46 | for ancestor in path: 47 | self.parents[ancestor] = root 48 | return root 49 | 50 | def __iter__(self): 51 | """Iterate through all items ever found or unioned by this structure.""" 52 | return iter(self.parents) 53 | 54 | def union(self, *objects): 55 | """Find the sets containing the objects and merge them all.""" 56 | roots = [self[x] for x in objects] 57 | heaviest = max([(self.weights[r], r) for r in roots])[1] 58 | for r in roots: 59 | if r != heaviest: 60 | self.weights[heaviest] += self.weights[r] 61 | self.parents[r] = heaviest 62 | 63 | def union_all(self, iterable): 64 | for i in iterable: 65 | self.union(*i) 66 | 67 | def find_all(self, vals): 68 | vals = [self.find_component(v) for v in vals] 69 | unique = set(vals) 70 | reindex = dict(zip(unique, range(len(unique)))) 71 | set_membership = np.array([reindex[v] for v in vals]) 72 | sets = np.array(list(reindex.values())) 73 | return set_membership, sets 74 | 75 | def find_component(self, iterable): 76 | """Return the set that obj belongs to 77 | 78 | If the iterable contains items that have been unioned, then any entry in the 79 | iterable will be sufficient to identify the set that obj belongs to. Use the 80 | first entry, and return the set associated with iterable. 
81 | 82 | If the iterable has not been entered into the structure, this method can yield 83 | incorrect results 84 | """ 85 | return self[next(iter(iterable))] 86 | 87 | 88 | def intersection(set_l): 89 | res = set_l[0] 90 | for s in set_l: 91 | res = set(set(res) & set(s)) 92 | return res 93 | 94 | 95 | # # Some constants 96 | # NO_DISAMBIGUATION = 0 97 | # RESOLVED_GENE = 1 98 | # NO_GENE_RESOLVED = 2 99 | # MULTIPLE_MODELS = 3 100 | 101 | 102 | # def reduce_coalignment_array(arr, threshold = 0.0001): 103 | # res = {} 104 | # for g in arr: 105 | # temp = {} 106 | # for k in arr[g]: 107 | # if arr[g][k] < threshold: 108 | # continue 109 | # temp[tuple(sorted(k))] = arr[g][k] 110 | # if len(temp)>0: 111 | # res[g] = temp 112 | # return res 113 | 114 | # #def strip(genes): 115 | # # return tuple(sorted([int(g[2:]) for g in genes])) 116 | # def strip(genes): 117 | # return tuple(sorted(genes)) 118 | # def strip_model(mod): 119 | # res = {} 120 | # for k in mod: 121 | # res[tuple(sorted(k))]=mod[k] 122 | # return res 123 | 124 | # def split_to_disjoint(obs): 125 | # res = [] 126 | # uf = UnionFind() 127 | # uf.union_all(obs.keys()) 128 | # set_membership, sets = uf.find_all(obs.keys()) 129 | 130 | # for s in sets: 131 | # d = {} 132 | # for k in np.array(list(obs.keys()))[set_membership == s]: 133 | # d[tuple(k)] = obs[tuple(k)] 134 | # res.append(d) 135 | # return res 136 | 137 | # def get_indices(inds, obs_subset): 138 | # res = [] 139 | # for genes in obs_subset: 140 | # res += inds[genes] 141 | # return res 142 | 143 | # def model_to_gene(model): 144 | # for g in model: 145 | # if model[g]==1: 146 | # return g 147 | 148 | 149 | # def get_combinations(l): 150 | # res = [] 151 | # for i in range(len(l)): 152 | # res += itertools.combinations(l,i+1) 153 | # return res 154 | 155 | # # rank the different possible models by their scores 156 | # def best_fit_model(obs_s, coalignment_mat): 157 | # #obs_s = strip_model(obs) 158 | # gene_l = single_gene_list(obs_s) # From the list of observation create a list of unique single genes from which different models can be inferred 159 | 160 | 161 | # if len(obs_s) == 1: 162 | # if len(list(obs_s.keys())[0]) == 1: 163 | # return [{gene_l[0]:1}], NO_DISAMBIGUATION 164 | 165 | # possible_genes = intersection(list(obs_s.keys())) 166 | 167 | # #There is one gene that resolve the disambiguation 168 | # if len(possible_genes) == 1: 169 | # model = {} 170 | # for g in gene_l: 171 | # model[g] = 0 172 | # model[list(possible_genes)[0]] = 1 173 | # return [model], RESOLVED_GENE 174 | 175 | # #There is more than one gene that can explain it, no model can be decided 176 | # if len(possible_genes) > 1: 177 | # return [], NO_GENE_RESOLVED 178 | 179 | # #There are multiple competing models. 
For now we don't decide bewteen them 180 | # return [], MULTIPLE_MODELS 181 | # # mod_score_list = [] 182 | # # for mod in get_combinations(gene_l): 183 | # # model = {} 184 | # # for k in gene_l: 185 | # # if k in mod: 186 | # # model[k] = 1 187 | # # else: 188 | # # model[k] = 0 189 | # # score = model_score(model, obs_s, coalignment_mat) 190 | # # mod_score_list.append((model,score)) 191 | 192 | # #Here to decide if there is one model that's obviously better 193 | # # return mod_score_list, MULTIPLE_MODELS 194 | 195 | # # get a model and returns its likelihood score comparing the expected number of reads and the observed 196 | # # model is basically just a bool dic of all the unique genes with flags of wether or not they're in model 197 | # # observed is a dictionary of all gene combinations and their expected proportion 198 | # # coalignment_mat is the coalignment matrix used to calculate the expected number of reads 199 | # # eg: 200 | # # model - {A:1, B:0} 201 | # # observed - {A: 100 B:50, AB: 30 } 202 | # # 203 | # def model_score(model, observed, coalignment_mat): 204 | # exp = {} 205 | # tot = {} 206 | # for gene in model: 207 | # # patch for SC000 208 | # if gene==0: 209 | # tot[gene] = model[gene]*observed[gene,] 210 | # # Theres a common edge case where a gene A will only be aligned with other genes as well, in this case we update our observation vector to include A:0 211 | # elif (gene, ) not in observed: 212 | # tot[gene] = 0 213 | # elif gene not in coalignment_mat: 214 | # raise KeyError('{} not found in coalignment matrix'.format(gene)) 215 | # elif (gene, ) not in coalignment_mat[gene]: 216 | # tot[gene] = 0 217 | # else: 218 | # tot[gene] = model[gene]*(observed[gene,]/coalignment_mat[gene][gene,]) 219 | 220 | # keys = get_combinations(model.keys()) #get a list of all possible molecule combinations 221 | 222 | # # key is a set of genes and the expected number of reads for it is the sum of expected reads from all genes shared by the key, 223 | # # these in turn are the total reads for a gene (extrapoletaed from the uniqely mapped) multiplied by the coalignment factor (present in the coalignment matrix) 224 | # # e.g. if A has 20% coalignment with B and there are 80 reads mapped uniquely to A, we expect 80/0.8 * 0.2 = 20 reads to be mapped to AB from A (and more from B) 225 | # for k in keys: 226 | # k = tuple(sorted(k)) 227 | # sum = 0 228 | # for gene in k: 229 | # #Patch for SC000 230 | # if gene==0: 231 | # if k==(0,): 232 | # sum=1 233 | # else: 234 | # sum = 0 235 | # ##### 236 | # elif k in coalignment_mat[gene]: 237 | # sum += tot[gene]*coalignment_mat[gene][k] 238 | # exp[k] = sum 239 | 240 | # score = calc_score(observed, exp) 241 | # return score 242 | 243 | # def calc_score(obs, exp): 244 | # sum = 0 245 | # for k in obs: 246 | # if k not in exp: 247 | # print(k) 248 | # k = tuple(sorted(k)) 249 | # print ('bad key') 250 | # diff = (obs[k]-exp[k])**2 251 | # if exp[k]!=0: 252 | # diff /= exp[k] 253 | # sum += diff 254 | # return sum 255 | 256 | # #Get a dictionary of observations per gene/s and return a list of single unique genes 257 | # def single_gene_list(obs): 258 | # l = [] 259 | # for genes in obs: 260 | # for g in genes: 261 | # l.append(g) 262 | # return list(set(l)) 263 | 264 | -------------------------------------------------------------------------------- /src/seqc/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | -------------------------------------------------------------------------------- /src/seqc/notebooks/notebooks.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, FileSystemLoader 2 | import os 3 | import pandas as pd 4 | import tempfile 5 | 6 | import nbformat 7 | from nbconvert.preprocessors import ExecutePreprocessor 8 | 9 | 10 | class Notebook: 11 | 12 | def __init__(self, output_stem: str, *data): 13 | 14 | # strip notebook affix if user provided it; this is a common error mode 15 | if output_stem.endswith('.ipynb'): 16 | output_stem = output_stem.replace('.ipynb', '') 17 | self._output_stem = output_stem 18 | 19 | self._data = data 20 | self._this_dir = os.path.dirname(os.path.abspath(__file__)) 21 | 22 | @property 23 | def notebook_path(self): 24 | return self._output_stem + '.ipynb' 25 | 26 | @property 27 | def merged_data(self): 28 | if isinstance(self._data, str): 29 | if os.path.isfile(self._data): 30 | return os.path.abspath(self._data) 31 | elif isinstance(self._data, (list, tuple)) and isinstance(self._data[0], str): 32 | if os.path.isfile(self._data[0]): 33 | return os.path.abspath(self._data[0]) 34 | raise TypeError('Data is not a 1-length iterable or string that contains a filepath') 35 | 36 | def merge_data(self, merged_sample_name=None, remove_unmerged=False): 37 | """ 38 | This function will merge any datasets provided as nested lists. 39 | Each top-level value is considered an input alias. 40 | Any second-level list is considered to be a group of files to be joined 41 | 42 | :param bool remove_unmerged: if True, this function will delete the unmerged files after 43 | completion 44 | :param str merged_sample_name: name of merged csv file 45 | :return None: The list of merged file names will replace the list passed to the class in 46 | self._datasets 47 | """ 48 | dfs = [pd.read_csv(csv, index_col=0) for csv in self._data] 49 | df = pd.concat( 50 | dfs, 51 | keys=list(range(len(self._data))), 52 | names=['sample_number', 'cell_id'] 53 | ) 54 | 55 | if not merged_sample_name: 56 | merged_sample_name = self._output_stem + '_merged_data.csv' 57 | df.to_csv(merged_sample_name) 58 | 59 | # delete original files, if requested 60 | if remove_unmerged: 61 | for csv in self._data: 62 | os.remove(csv) 63 | 64 | # update file urns 65 | self._data = merged_sample_name 66 | 67 | def write_template(self): 68 | """write a filled ipython notebook to disk 69 | 70 | :return: 71 | """ 72 | 73 | j2_env = Environment(loader=FileSystemLoader(self._this_dir), trim_blocks=True) 74 | rendered = j2_env.get_template('analysis_template.json').render( 75 | output_stem=self._output_stem, 76 | data=os.path.abspath(self.merged_data), 77 | ) 78 | with open(self._output_stem + '.ipynb', 'w') as fdw: 79 | fdw.write(rendered) 80 | 81 | def run_notebook(self, notebook_filename=None): 82 | 83 | if not notebook_filename: 84 | notebook_filename = self._output_stem + '.ipynb' 85 | 86 | dir_ = os.getcwd() 87 | with open(notebook_filename) as f: 88 | nb = nbformat.read(f, as_version=4) 89 | 90 | ep = ExecutePreprocessor(timeout=600, kernel_name='python3') 91 | ep.preprocess(nb, {'metadata': {'path': dir_}}) 92 | 93 | with open(notebook_filename, 'wt') as f: 94 | nbformat.write(nb, f) 95 | -------------------------------------------------------------------------------- /src/seqc/notebooks/test_notebooks.py: -------------------------------------------------------------------------------- 1 | from . 
import notebooks 2 | import tempfile 3 | import pytest 4 | import numpy as np 5 | import pandas as pd 6 | import uuid 7 | import os 8 | from seqc.core import main 9 | 10 | 11 | @pytest.fixture() 12 | def testing_data(): 13 | dir_ = tempfile.mkdtemp() 14 | test_data = [np.random.randint(10, 110, (100, 100)) for _ in range(4)] 15 | test_files = [] 16 | for f in test_data: 17 | filename = '{}/{}'.format(dir_, uuid.uuid4()) 18 | pd.DataFrame(f).to_csv(filename) 19 | test_files.append(filename) 20 | return test_files 21 | 22 | 23 | @pytest.fixture() 24 | def merged_data(testing_data): 25 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 26 | n = notebooks.Notebook(output_stem, *testing_data) 27 | n.merge_data() 28 | return n.merged_data 29 | 30 | 31 | def test_template_filling(testing_data): 32 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 33 | n = notebooks.Notebook(output_stem, *testing_data) 34 | n.merge_data() 35 | n.write_template() 36 | n.run_notebook() 37 | print(os.listdir(os.path.dirname(output_stem))) 38 | 39 | 40 | def test_merge_api(testing_data): 41 | output_filename = os.path.join(tempfile.mkdtemp(), 'test_notebooks.ipynb') 42 | args = ['notebook', 'merge', '-o', output_filename, '-i'] + testing_data 43 | main.main(args) 44 | 45 | 46 | def test_generate_api(merged_data): 47 | output_stem = os.path.join(tempfile.mkdtemp(), 'test_notebooks') 48 | args = ['notebook', 'generate', '-o', output_stem, '-i', merged_data] 49 | main.main(args) 50 | 51 | -------------------------------------------------------------------------------- /src/seqc/reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import bz2 4 | 5 | 6 | class Reader: 7 | """ 8 | Basic reader object that seamlessly loops over multiple input files 9 | 10 | Can be subclassed to create readers for specific file types (fastq, gtf, etc.) 11 | """ 12 | 13 | def __init__(self, files_): 14 | 15 | if isinstance(files_, list): 16 | self._files = files_ 17 | elif isinstance(files_, str): 18 | self._files = [files_] 19 | else: 20 | raise TypeError('files_ must be a string filename or a list of such names.') 21 | 22 | @property 23 | def filenames(self): 24 | return self._files 25 | 26 | def __len__(self): 27 | """ 28 | return the length of the Reader object. This depends on the implementation of 29 | self.__iter__(); it does not necessarily represent the length of the file in 30 | lines. 
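# --- editor's illustrative sketch (not part of reader.py) --------------------
# Reader accepts a filename or a list of filenames and transparently opens
# .gz, .bz2, or plain files, yielding raw bytes records from each in turn.
# The file names below are hypothetical.
from seqc.reader import Reader

r = Reader(['lane1.fastq.gz', 'lane2.fastq'])
# r.size                     # combined size of both files in bytes
# for record in r:           # bytes lines from lane1, then lane2
#     do_something(record)
# ------------------------------------------------------------------------------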
31 | """ 32 | return sum(1 for _ in self) 33 | 34 | def __iter__(self): 35 | for f in self._files: 36 | if f.endswith('.gz'): 37 | file_input = gzip.open(f, 'rb') 38 | elif f.endswith('.bz2'): 39 | file_input = bz2.open(f, 'rb') 40 | else: 41 | file_input = open(f, 'rb') 42 | for record in file_input: 43 | yield record 44 | file_input.close() 45 | 46 | @property 47 | def size(self) -> int: 48 | """return the collective size of all files being read in bytes""" 49 | return sum(os.stat(f).st_size for f in self._files) 50 | -------------------------------------------------------------------------------- /src/seqc/run_mast.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(MAST)) 2 | suppressPackageStartupMessages({library(data.table)}) 3 | options(mc.cores = 1) # gives me error messages when I use > 1 4 | 5 | loadData <- function(input_data) { 6 | df <- (read.csv(input_data, row.names=NULL)) 7 | } 8 | 9 | extractConditions <- function(df) { 10 | # extract conditions (sg) from column names of the df 11 | sg <- factor(unlist(df[1])) 12 | return(sg) 13 | } 14 | 15 | annotateDF <- function(df, sg) { 16 | df[1] <- NULL 17 | df <- t(df) 18 | names(df) <- sg 19 | return(df) 20 | } 21 | 22 | runMAST <- function(df, sg) { 23 | # extract columns and row information 24 | # add a cell number column to avoid duplicate row names 25 | wellKey <- seq_len(dim(df)[2]) 26 | wellKey <- lapply(wellKey, toString) 27 | condition <- as.numeric(unlist(as.list(sg))) 28 | cdata <- data.frame(cbind(wellKey=wellKey, condition=condition)) 29 | fdata <- data.frame(primerid=row.names(df)) 30 | 31 | # create the sca object. Note that we do filtering before 32 | # we create the test matrix, so no additional filtering of cells is added here 33 | exprsArray <- as.matrix(df) 34 | dimnames(exprsArray)[[2]] <- cdata$wellKey 35 | sca <- FromMatrix(exprsArray, cdata, fdata) 36 | 37 | # calculate cellular detection rate 38 | cdr2 <-colSums(assay(sca)>0) 39 | colData(sca)$cngeneson <- scale(cdr2) 40 | colData(sca)$cond <- as.numeric(unlist(as.list(sg))) 41 | 42 | # carry out DE analysis 43 | zlmCond <- zlm.SingleCellAssay(~cond + cngeneson, sca) 44 | #res <- lrTest(zlmCond, CoefficientHypothesis("cond")) 45 | 46 | #only test the cluster coefficient. 47 | summaryCond <- summary(zlmCond, doLRT=TRUE) 48 | summaryDt <- summaryCond$datatable 49 | fcHurdle <- merge(summaryDt[contrast=='cond' & component=='H',.(primerid, `Pr(>Chisq)`)], summaryDt[contrast=='cond' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') 50 | 51 | fcHurdle <- fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')] 52 | fcHurdleSig <- fcHurdle[(fdr<=0.05) & (abs(coef)>=log2(1.25)) ] 53 | setorder(fcHurdleSig, fdr) 54 | 55 | return(fcHurdleSig) 56 | } 57 | 58 | saveResult <- function(result, filename) { 59 | resultDf <- as.data.frame(result) 60 | colnames(resultDf)[1] = 'gene' 61 | colnames(resultDf)[2] = 'p' 62 | colnames(resultDf)[3] = 'logFC' 63 | colnames(resultDf)[6] = 'p.fdr.adj' 64 | resultDf <- resultDf[,c('gene','p','p.fdr.adj','logFC')] 65 | write.table(resultDf, file = filename, row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE) 66 | } 67 | 68 | testMAST <- function(input_filename, save_filename) { 69 | df <- loadData(input_filename) 70 | sg <- extractConditions(df) 71 | df <- annotateDF(df, sg) 72 | result <- runMAST(df, sg) 73 | saveResult(result, save_filename) 74 | } 75 | 76 | # args should be: 77 | # 1. input_filename 78 | # 2. 
output_filename 79 | 80 | args <- commandArgs(trailingOnly = TRUE) 81 | stopifnot(length(args) == 2) 82 | 83 | testMAST(args[1], args[2]) 84 | -------------------------------------------------------------------------------- /src/seqc/sequence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/sequence/__init__.py -------------------------------------------------------------------------------- /src/seqc/sequence/barcodes.py: -------------------------------------------------------------------------------- 1 | from seqc.sequence.encodings import DNA3Bit 2 | from sys import maxsize 3 | 4 | # todo document me 5 | def generate_hamming_dist_1(seq): 6 | """ Return a list of all sequences that are up to 1 hamming distance from seq 7 | :param seq: 8 | """ 9 | res = [] 10 | l = DNA3Bit.seq_len(seq) 11 | #=barcode 12 | 13 | # generate all sequences that are dist 1 14 | for i in range(l): 15 | mask = 0b111 << (i * 3) 16 | cur_chr = (seq & mask) >> (i * 3) 17 | res += [seq & (~mask) | (new_chr << (i * 3)) 18 | for new_chr in DNA3Bit.bin2strdict.keys() if new_chr != cur_chr] 19 | 20 | return res 21 | 22 | 23 | def find_correct_barcode(code, barcodes_list, exact_match=False): 24 | """ 25 | For a given barcode find the closest correct barcode to it from the list (limited to 26 | one ED), a string representing the error and the edit distance 27 | NOTE: for now this function looks for a barcode with ED==1 and does not bother 28 | looking for the minimum 29 | 30 | :param exact_match: 31 | :param barcodes_list: 32 | :param code: 33 | :returns: 34 | """ 35 | 36 | # Return the barcode if it exists 37 | if code in barcodes_list: 38 | return code, 0 39 | 40 | # If perfect match is required, return an error since the barcode does not appear 41 | # in the correct barcode list 42 | if exact_match: 43 | return 0, maxsize 44 | 45 | min_ed = maxsize 46 | cor_code = 0 47 | for bc in barcodes_list: 48 | hamm_d = hamming_dist_bin(code, bc) 49 | if hamm_d == 1: 50 | min_ed = 1 51 | cor_code = bc 52 | break 53 | if hamm_d < min_ed: 54 | min_ed = hamm_d 55 | cor_code = bc 56 | 57 | return cor_code, min_ed 58 | 59 | 60 | def hamming_dist_bin(c1, c2): 61 | """Return the hamming distance between two numbers representing a sequence (3 bits 62 | per base) 63 | 64 | :param c1: 65 | :param c2: 66 | :return: 67 | """ 68 | if DNA3Bit.seq_len(c1) != DNA3Bit.seq_len(c2): 69 | return maxsize 70 | d = 0 71 | while c1 > 0: 72 | if c1 & 0b111 != c2 & 0b111: 73 | d += 1 74 | c1 >>= 3 75 | c2 >>= 3 76 | return d 77 | 78 | 79 | def list_errors(s1, s2): 80 | """ 81 | Return the list of nucleotide transformations that turn s1 to s2. 82 | An error is a six bit int representing a two chr string of type "AG","CT", etc. 83 | 84 | :param s2: 85 | :param s1: 86 | 87 | :returns: 88 | """ 89 | 90 | # return the actual error 91 | err_list = [] 92 | while s1 > 0: 93 | if s1 & 0b111 != s2 & 0b111: 94 | err_list.append((s1 & 0b111, s2 & 0b111)) 95 | s1 >>= 3 96 | s2 >>= 3 97 | return err_list 98 | -------------------------------------------------------------------------------- /src/seqc/sequence/encodings.py: -------------------------------------------------------------------------------- 1 | 2 | class DNA3Bit(object): 3 | """ 4 | Compact 3-bit encoding scheme for sequence data. 
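# --- editor's illustrative sketch (not part of encodings.py) -----------------
# A round trip through the 3-bit encoding: encode() accepts str or bytes and
# packs the sequence with the first base in the most significant bits;
# decode() returns bytes.
from seqc.sequence.encodings import DNA3Bit

code = DNA3Bit.encode('ACGTN')
assert DNA3Bit.decode(code) == b'ACGTN'
assert DNA3Bit.seq_len(code) == 5
# ------------------------------------------------------------------------------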
5 | """ 6 | 7 | @staticmethod 8 | def bits_per_base(): 9 | return 3 10 | 11 | # TODO: The sam reader needs to be fixed so text files are read as text not binary 12 | str2bindict = {65: 0b100, 67: 0b110, 71: 0b101, 84: 0b011, 78: 0b111, 13 | 97: 0b100, 99: 0b110, 103: 0b101, 116: 0b011, 110: 0b111, 14 | 'A': 0b100, 'C': 0b110, 'G': 0b101, 'T': 0b011, 'N': 0b111, 15 | 'a': 0b100, 'c': 0b110, 'g': 0b101, 't': 0b011, 'n': 0b111} 16 | bin2strdict = {0b100: b'A', 0b110: b'C', 0b101: b'G', 0b011: b'T', 0b111: b'N'} 17 | 18 | @staticmethod 19 | def encode(b) -> int: 20 | """ 21 | Convert string nucleotide sequence into binary, note: string is stored so 22 | that the first nucleotide is in the MSB position 23 | 24 | :param bytes|str b: sequence containing nucleotides to be encoded 25 | """ 26 | res = 0 27 | for c in b: 28 | res <<= 3 29 | res += DNA3Bit.str2bindict[c] 30 | return res 31 | 32 | @staticmethod 33 | def decode(i: int) -> bytes: 34 | """ 35 | Convert binary nucleotide sequence into string 36 | 37 | :param i: int, encoded sequence to be converted back to nucleotides 38 | """ 39 | if i < 0: 40 | message = 'i must be an unsigned (positive) integer, not {0!s}'.format(i) 41 | raise ValueError(message) 42 | r = b'' 43 | while i > 0: 44 | r = DNA3Bit.bin2strdict[i & 0b111] + r 45 | i >>= 3 46 | return r 47 | 48 | # TODO: another ooption is to use i.bit_length and take into account preceding 0's 49 | @staticmethod 50 | def seq_len(i: int) -> int: 51 | """ 52 | Return the length of an encoded sequence based on its binary representation 53 | 54 | :param i: int, encoded sequence 55 | """ 56 | l = 0 57 | while i > 0: 58 | l += 1 59 | i >>= 3 60 | return l 61 | 62 | @staticmethod 63 | def contains(s: int, char: int) -> bool: 64 | """ 65 | return true if the char (bin representation) is contained in seq (binary 66 | representation) 67 | 68 | :param char: int, encoded character (one must be only one nucleotide) 69 | :param s: int, sequence of encoded nucleotides 70 | """ 71 | while s > 0: 72 | if char == (s & 0b111): 73 | return True 74 | s >>= 3 75 | return False 76 | 77 | @staticmethod 78 | def ints2int(ints): 79 | """ 80 | convert an iterable of sequences [i1, i2, i3] into a concatenated single integer 81 | 0bi1i2i3. In cases where the sequence is longer than 64 bits, python will 82 | transition seamlessly to a long int representation, however the user must be 83 | aware that downsteam interaction with numpy or other fixed-size representations 84 | may not function 85 | 86 | :param ints: iterable of encoded sequences to concatenate 87 | """ 88 | 89 | res = 0 90 | for num in ints: 91 | tmp = num 92 | # Get length of next number to concatenate (with enough room for leading 0's) 93 | while tmp > 0: 94 | res <<= 3 95 | tmp >>= 3 96 | res += num 97 | return res 98 | 99 | @staticmethod 100 | def count(seq, char_bin): 101 | """ 102 | count how many times char is in seq. 103 | char needs to be an encoded value of one of the bases. 
104 | """ 105 | if char_bin not in DNA3Bit.bin2strdict.keys(): 106 | raise ValueError("DNA3Bit.count was called with an invalid char code - " 107 | "{}".format(char_bin)) 108 | res = 0 109 | while seq > 0: 110 | if seq & 0b111 == char_bin: 111 | res += 1 112 | seq >>= 3 113 | return res 114 | 115 | 116 | # TODO: this was written for tests, not sure it's being used anymore 117 | # @staticmethod 118 | # def gc_content(i: int) -> float: 119 | # """ 120 | # calculates percentage of nucleotides in i that is G or C# 121 | # 122 | # :param i: int, encoded sequence 123 | # """ 124 | # gc = 0 125 | # length = 0 126 | # while i > 0: 127 | # length += 1 128 | # masked = i & 111 129 | # if masked == 0b100 or masked == 0b100: 130 | # gc += 1 131 | # i >>= 3 132 | # return gc / length 133 | -------------------------------------------------------------------------------- /src/seqc/sequence/fastq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from seqc import reader 4 | 5 | 6 | class FastqRecord: 7 | """Fastq record object 8 | 9 | Defines several properties for accessing fastq record information: 10 | :property name: name field 11 | :property sequence: sequence field 12 | :property name2: second name field 13 | :property quality: quality field 14 | 15 | Also defines several methods for accessing SEQC annotation fields: 16 | :property annotations: list of annotations 17 | :property metadata: dictionary of read metadata (if any present) 18 | :property average_quality: return the mean quality of FastqRecord 19 | """ 20 | 21 | __slots__ = ["_data"] 22 | 23 | def __init__(self, record: [bytes, bytes, bytes, bytes]): 24 | self._data = list(record) 25 | 26 | @property 27 | def name(self) -> bytes: 28 | return self._data[0] 29 | 30 | @name.setter 31 | def name(self, value: bytes): 32 | self._data[0] = value 33 | 34 | @property 35 | def sequence(self) -> bytes: 36 | return self._data[1] 37 | 38 | @sequence.setter 39 | def sequence(self, value: bytes): 40 | self._data[1] = value 41 | 42 | @property 43 | def name2(self) -> bytes: 44 | return self._data[2] 45 | 46 | @name2.setter 47 | def name2(self, value: bytes): 48 | self._data[2] = value 49 | 50 | @property 51 | def quality(self) -> bytes: 52 | return self._data[3] 53 | 54 | @quality.setter 55 | def quality(self, value: bytes): 56 | self._data[3] = value 57 | 58 | def __bytes__(self) -> bytes: 59 | return b"".join(self._data) 60 | 61 | def __str__(self) -> str: 62 | return bytes(self).decode() 63 | 64 | def __len__(self) -> int: 65 | return len(self.sequence) 66 | 67 | @property 68 | def annotations(self) -> list: 69 | """ 70 | returns: 71 | -------- 72 | list of annotations present in the fastq header 73 | """ 74 | try: 75 | end = self.name.index(b";") 76 | return self.name[:end].split(b":") 77 | except ValueError: 78 | return [] 79 | 80 | @property 81 | def metadata(self) -> dict: 82 | """ 83 | returns: 84 | -------- 85 | dictionary of annotations and fields, if any are present""" 86 | try: 87 | start = self.name.rindex(b"|") 88 | except ValueError: 89 | return {} 90 | fields = {} 91 | for field in self.name[start + 1 :].split(b":"): 92 | k, v = field.split(b"=") 93 | fields[k] = v 94 | return fields 95 | 96 | def add_annotation(self, values) -> None: 97 | """prepends a list of annotations to the name field of self.name 98 | :param values: 99 | """ 100 | self._data[0] = b"@" + b":".join(values) + b";" + self.name[1:] 101 | 102 | def add_metadata(self, values) -> None: 103 | 
"""appends a list of metadata fields to the name field of self.name 104 | :param values: 105 | """ 106 | self.name += b"|" + b":".join(k + "=" + v for k, v in values.items()) 107 | 108 | def average_quality(self) -> int: 109 | """""" 110 | return ( 111 | np.mean(np.frombuffer(self.quality, dtype=np.int8, count=len(self))).astype( 112 | int 113 | ) 114 | - 33 115 | ) 116 | 117 | 118 | class Reader(reader.Reader): 119 | """ 120 | Fastq Reader, defines some special methods for reading and summarizing fastq data: 121 | 122 | :method __iter__: Iterator over fastq Record objects 123 | :method __len__: return number of records in file 124 | :method estimate_sequence_length: estimate the length of fastq sequences in file 125 | """ 126 | 127 | @staticmethod 128 | def record_grouper(iterable): 129 | args = [iter(iterable)] * 4 130 | return zip(*args) 131 | 132 | def __iter__(self): 133 | for record in self.record_grouper(super().__iter__()): 134 | yield FastqRecord(record) 135 | 136 | def __len__(self): 137 | """ 138 | return the length of the Reader object. This depends on the implementation of 139 | self.__iter__(); it does not necessarily represent the length of the file in 140 | lines. 141 | """ 142 | return sum(1 for _ in self) / 4 143 | 144 | def estimate_sequence_length(self): 145 | """ 146 | estimate the sequence length of a fastq file from the first 10000 records of 147 | the file. 148 | 149 | :return: int mean, float standard deviation, (np.ndarray: observed lengths, 150 | np.ndarray: counts per length) 151 | """ 152 | i = 0 153 | records = iter(self) 154 | data = np.empty(10000, dtype=int) 155 | while i < 10000: 156 | try: 157 | seq = next(records).sequence 158 | except StopIteration: # for fastq files shorter than 10000 records 159 | data = data[:i] 160 | break 161 | data[i] = len(seq) - 1 # last character is a newline 162 | i += 1 163 | return np.mean(data), np.std(data), np.unique(data, return_counts=True) 164 | 165 | 166 | def merge_paired(merge_function, fout, genomic, barcode=None) -> (str, int): 167 | """ 168 | General function to annotate genomic fastq with barcode information from reverse read. 169 | Takes a merge_function which indicates which kind of platform was used to generate 170 | the data, and specifies how the merging should be done. 
171 | 172 | :param merge_function: function from merge_functions.py 173 | :param fout: merged output file name 174 | :param genomic: fastq containing genomic data 175 | :param barcode: fastq containing barcode data 176 | :return str fout, filename of merged fastq file 177 | 178 | """ 179 | directory, filename = os.path.split(fout) 180 | if directory and not os.path.isdir(directory): 181 | os.makedirs(directory, exist_ok=True) 182 | genomic = Reader(genomic) 183 | if barcode: 184 | barcode = Reader(barcode) 185 | with open(fout, "wb") as f: 186 | for g, b in zip(genomic, barcode): 187 | r = merge_function(g, b) 188 | f.write(bytes(r)) 189 | else: 190 | with open(fout, "wb") as f: 191 | for g in genomic: 192 | r = merge_function(g) 193 | f.write(bytes(r)) 194 | 195 | return fout 196 | 197 | 198 | def truncate(fastq_file, lengths): 199 | """ 200 | 201 | :param str fastq_file: the input fastq file 202 | :param [int] lengths: a list of integer lengths to truncate the input fastq file 203 | :return: 204 | """ 205 | # get sequence length of input file 206 | r = Reader(fastq_file) 207 | length = None 208 | for record in r: 209 | length = len(record.sequence) 210 | break 211 | 212 | print("sequence length in file is %d" % length) 213 | 214 | # remove any lengths longer than sequence length of file 215 | lengths = sorted([l for l in lengths if l < length])[::-1] # largest to smallest 216 | 217 | # open a bunch of files 218 | files = [] 219 | for l in lengths: 220 | name = ( 221 | fastq_file.replace(".gz", "").replace(".fastq", "") + "_%d_" % l + ".fastq" 222 | ) 223 | files.append(open(name, "wb")) 224 | 225 | i = 0 226 | indices = list(range(len(lengths))) 227 | for record in r: 228 | if i > 10e6: 229 | break 230 | for j in indices: 231 | record.sequence = record.sequence[:-1][: lengths[j]] + b"\n" 232 | record.quality = record.quality[:-1][: lengths[j]] + b"\n" 233 | files[j].write(bytes(record)) 234 | i += 1 235 | 236 | for f in files: 237 | f.close() 238 | -------------------------------------------------------------------------------- /src/seqc/sparse_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.sparse import coo_matrix 4 | from collections import OrderedDict 5 | from seqc.sequence.gtf import create_gene_id_to_official_gene_symbol_map 6 | from seqc.sequence.gtf import ensembl_gene_id_to_official_gene_symbol 7 | 8 | 9 | class SparseFrame: 10 | def __init__(self, data, index, columns): 11 | """ 12 | lightweight wrapper of scipy.stats.coo_matrix to provide pd.DataFrame-like access 13 | to index, column, and shape properties. 
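# --- editor's illustrative sketch (not part of sparse_frame.py) --------------
# Constructing a SparseFrame directly from a coo_matrix plus row/column labels;
# the tiny count matrix below is made up for illustration.
import numpy as np
from scipy.sparse import coo_matrix
from seqc.sparse_frame import SparseFrame

counts = coo_matrix(np.array([[1, 0], [0, 3]]))
sf = SparseFrame(counts, index=np.array([10, 11]), columns=np.array([100, 101]))
sf.shape          # (2, 2)
sf.sum(axis=0)    # per-column (gene) sums
# ------------------------------------------------------------------------------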
14 | 15 | :param data: scipy.stats.coo_matrix 16 | :param index: np.ndarray: row index 17 | :param columns: np.ndarray: column index 18 | 19 | :property data: scipy.stats.coo_matrix 20 | :property index: np.ndarray row index 21 | :property columns: np.ndarray column index 22 | :property shape: (int, int), number of rows and columns 23 | :method sum: wrapper of np.sum() 24 | """ 25 | 26 | if not isinstance(data, coo_matrix): 27 | raise TypeError("data must be type coo_matrix") 28 | if not isinstance(index, np.ndarray): 29 | raise TypeError("index must be type np.ndarray") 30 | if not isinstance(columns, np.ndarray): 31 | raise TypeError("columns must be type np.ndarray") 32 | 33 | self._data = data 34 | self._index = index 35 | self._columns = columns 36 | 37 | @property 38 | def data(self): 39 | return self._data 40 | 41 | @data.setter 42 | def data(self, item): 43 | if not isinstance(item, coo_matrix): 44 | raise TypeError("data must be type coo_matrix") 45 | self._data = item 46 | 47 | @property 48 | def index(self): 49 | return self._index 50 | 51 | @index.setter 52 | def index(self, item): 53 | try: 54 | self._index = np.array(item) 55 | except: 56 | raise TypeError("self.index must be convertible into a np.array object") 57 | 58 | @property 59 | def columns(self): 60 | return self._columns 61 | 62 | @columns.setter 63 | def columns(self, item): 64 | try: 65 | self._columns = np.array(item) 66 | except: 67 | raise TypeError("self.columns must be convertible into a np.array object") 68 | 69 | @property 70 | def shape(self): 71 | return len(self.index), len(self.columns) 72 | 73 | def sum(self, axis=0): 74 | """ 75 | sum over provided axis 76 | 77 | :param axis: options: 0 (rows) or 1 (columns) 78 | :return: np.ndarray vector of column or row sums 79 | """ 80 | return self.data.sum(axis=axis) 81 | 82 | @classmethod 83 | def from_dict(cls, dictionary, genes_to_symbols=False): 84 | """create a SparseFrame from a dictionary 85 | 86 | :param dict dictionary: dictionary in form (cell, gene) -> count 87 | :param str|bool genes_to_symbols: convert genes into symbols. If not False, user 88 | must provide the location of a .gtf file to carry out conversion. 
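# --- editor's illustrative sketch (not part of sparse_frame.py) --------------
# Building a SparseFrame from a (cell, gene) -> count dictionary, keeping the
# integer gene ids (genes_to_symbols left at its default of False). The keys
# and counts are made up for illustration.
from seqc.sparse_frame import SparseFrame

counts = {(0, 7): 2, (0, 9): 1, (3, 7): 5}
sf = SparseFrame.from_dict(counts)
sf.index      # unique cell ids: array([0, 3])
sf.columns    # unique gene ids: array([7, 9])
# ------------------------------------------------------------------------------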
Otherwise the 89 | column index will retain the original integer ids 90 | :return SparseFrame: SparseFrame containing dictionary data 91 | """ 92 | 93 | # todo this throws an uninformative error in the case that there are no active 94 | # reads in the ReadArray 95 | i, j = (np.array(v, dtype=int) for v in zip(*dictionary.keys())) 96 | data = np.fromiter(dictionary.values(), dtype=int) 97 | 98 | # map cells to small values 99 | uniq_i = np.unique(i) 100 | imap = OrderedDict(zip(uniq_i, np.arange(uniq_i.shape[0]))) 101 | 102 | uniq_j = np.unique(j) 103 | jmap = OrderedDict(zip(uniq_j, np.arange(uniq_j.shape[0]))) 104 | 105 | i_inds = np.fromiter((imap[v] for v in i), dtype=int) 106 | j_inds = np.fromiter((jmap[v] for v in j), dtype=int) 107 | 108 | coo = coo_matrix( 109 | (data, (i_inds, j_inds)), shape=(len(imap), len(jmap)), dtype=np.int32 110 | ) 111 | 112 | index = np.fromiter(imap.keys(), dtype=int) 113 | columns = np.fromiter(jmap.keys(), dtype=int) 114 | 115 | if genes_to_symbols: 116 | if not os.path.isfile(genes_to_symbols): 117 | raise ValueError( 118 | "genes_to_symbols argument %s is not a valid annotation " 119 | "file" % repr(genes_to_symbols) 120 | ) 121 | gmap = create_gene_id_to_official_gene_symbol_map(genes_to_symbols) 122 | columns = np.array( 123 | ensembl_gene_id_to_official_gene_symbol(columns, gene_id_map=gmap) 124 | ) 125 | 126 | return cls(coo, index, columns) 127 | -------------------------------------------------------------------------------- /src/seqc/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from .ttest import bootstrap_t as ttest 2 | from .gsea import GSEA as gsea 3 | from .correlation import correlation 4 | from .anova import ANOVA as anova 5 | from .graph_diffusion import GraphDiffusion as graph_diffusion 6 | from .smoothing import smoothing 7 | from .tree import Tree as tree 8 | from .pca import PCA as pca 9 | from .tsne import TSNE as tsne 10 | from .g_test import g_test 11 | from .mast import run_mast 12 | from .resampled_nonparametric import mannwhitneyu, kruskalwallis -------------------------------------------------------------------------------- /src/seqc/stats/anova.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import namedtuple 3 | import numpy as np 4 | import pandas as pd 5 | from functools import partial 6 | from scipy.stats.mstats import kruskalwallis, rankdata 7 | from scipy.stats import t 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | class ANOVA: 11 | 12 | def __init__(self, data, group_assignments, alpha=0.05): 13 | """ 14 | Carry out ANOVA between the groups of data 15 | 16 | :param data: n cells x k genes 2d array 17 | :param group_assignments: n cells 1d vector 18 | :param alpha: float (0, 1], acceptable type I error 19 | """ 20 | # make sure group_assignments and data have the same length 21 | warnings.warn('DeprecationWarning: This function is deprecated.') 22 | if not data.shape[0] == group_assignments.shape[0]: 23 | raise ValueError( 24 | 'Group assignments shape ({!s}) must equal the number of rows in data ' 25 | '({!s}).'.format(group_assignments.shape[0], data.shape[0])) 26 | 27 | # todo 28 | # may want to verify that each group has at least two observations 29 | # (else variance won't work) 30 | 31 | # store index if both data and group_assignments are pandas objects 32 | if isinstance(data, pd.DataFrame) and isinstance(group_assignments, pd.Series): 33 | # ensure 
assignments and data indices are aligned 34 | try: 35 | ordered_assignments = group_assignments[data.index] 36 | if not len(ordered_assignments) == data.shape[0]: 37 | raise ValueError( 38 | 'Index mismatch between data and group_assignments detected when ' 39 | 'aligning indices. check for duplicates.') 40 | except: 41 | raise ValueError('Index mismatch between data and group_assignments.') 42 | 43 | # sort data by cluster assignment 44 | idx = np.argsort(ordered_assignments.values) 45 | self.data = data.iloc[idx, :].values 46 | ordered_assignments = ordered_assignments.iloc[idx] 47 | self.group_assignments = ordered_assignments.values 48 | self.index = data.columns 49 | 50 | else: # get arrays from input values 51 | self.index = None # inputs were not all indexed pandas objects 52 | 53 | try: 54 | data = np.array(data) 55 | except: 56 | raise ValueError('data must be convertible to a np.ndarray') 57 | 58 | try: 59 | group_assignments = np.array(group_assignments) 60 | except: 61 | raise ValueError('group_assignments must be convertible to a np.ndarray') 62 | 63 | idx = np.argsort(group_assignments) 64 | self.data = data[idx, :] 65 | self.group_assignments = group_assignments[idx] 66 | 67 | self.post_hoc = None 68 | self.groups = np.unique(group_assignments) 69 | 70 | # get points to split the array, create slicers for each group 71 | self.split_indices = np.where(np.diff(self.group_assignments))[0] + 1 72 | # todo is this a faster way of calculating the below anova? 73 | # self.array_views = np.array_split(self.data, self.split_indices, axis=0) 74 | 75 | if not 0 < alpha <= 1: 76 | raise ValueError('Parameter alpha must fall within the interval (0, 1].') 77 | self.alpha = alpha 78 | 79 | self._anova = None 80 | 81 | def anova(self, min_mean_expr=None): 82 | """ 83 | carry out non-parametric ANOVA across the groups of self. 84 | 85 | :param min_mean_expr: minimum average gene expression value that must be reached 86 | in at least one cluster for the gene to be considered 87 | :return: 88 | """ 89 | if self._anova is not None: 90 | return self._anova 91 | 92 | # run anova 93 | f = lambda v: kruskalwallis(*np.split(v, self.split_indices))[1] 94 | pvals = np.apply_along_axis(f, 0, self.data) # todo could shunt to a multiprocessing pool 95 | 96 | # correct the pvals 97 | _, pval_corrected, _, _ = multipletests(pvals, self.alpha, method='fdr_tsbh') 98 | 99 | # store data & return 100 | if self.index is not None: 101 | self._anova = pd.Series(pval_corrected, index=self.index) 102 | else: 103 | self._anova = pval_corrected 104 | return self._anova 105 | 106 | def post_hoc_tests(self): 107 | """ 108 | carries out post-hoc tests between genes with significant ANOVA results using 109 | Welch's U-test on ranked data. 110 | """ 111 | if self._anova is None: 112 | self.anova() 113 | 114 | anova_significant = np.array(self._anova) < 1 # call array in case it is a Series 115 | 116 | # limit to significant data, convert to column-wise ranks. 
117 | data = self.data[:, anova_significant] 118 | rank_data = np.apply_along_axis(rankdata, 0, data) 119 | # assignments = self.group_assignments[anova_significant] 120 | 121 | split_indices = np.where(np.diff(self.group_assignments))[0] + 1 122 | array_views = np.array_split(rank_data, split_indices, axis=0) 123 | 124 | # get mean and standard deviations of each 125 | fmean = partial(np.mean, axis=0) 126 | fvar = partial(np.var, axis=0) 127 | mu = np.vstack(list(map(fmean, array_views))).T # transpose to get gene rows 128 | n = np.array(list(map(lambda x: x.shape[0], array_views))) 129 | s = np.vstack(list(map(fvar, array_views))).T 130 | s_norm = s / n # transpose to get gene rows 131 | 132 | # calculate T 133 | numerator = mu[:, np.newaxis, :] - mu[:, :, np.newaxis] 134 | denominator = np.sqrt(s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) 135 | statistic = numerator / denominator 136 | 137 | # calculate df 138 | s_norm2 = s**2 / (n**2 * n-1) 139 | numerator = (s_norm[:, np.newaxis, :] + s_norm[:, :, np.newaxis]) ** 2 140 | denominator = (s_norm2[:, np.newaxis, :] + s_norm2[:, :, np.newaxis]) 141 | df = np.floor(numerator / denominator) 142 | 143 | # get significance 144 | p = t.cdf(np.abs(statistic), df) # note, two tailed test 145 | 146 | # calculate fdr correction; because above uses 2-tails, alpha here is halved 147 | # because each test is evaluated twice due to the symmetry of vectorization. 148 | p_adj = multipletests(np.ravel(p), alpha=self.alpha, method='fdr_tsbh')[1] 149 | p_adj = p_adj.reshape(*p.shape) 150 | 151 | phr = namedtuple('PostHocResults', ['p_adj', 'statistic', 'mu']) 152 | self.post_hoc = phr(p_adj, statistic, mu) 153 | 154 | if self.index is not None: 155 | p_adj = pd.Panel( 156 | p_adj, items=self.index[anova_significant], major_axis=self.groups, 157 | minor_axis=self.groups) 158 | statistic = pd.Panel( 159 | statistic, items=self.index[anova_significant], major_axis=self.groups, 160 | minor_axis=self.groups) 161 | mu = pd.DataFrame(mu, self.index[anova_significant], columns=self.groups) 162 | 163 | return p_adj, statistic, mu 164 | 165 | def population_markers(self, p_crit=0.0): 166 | """ 167 | Return markers that are significantly differentially expressed in one 168 | population vs all others 169 | 170 | :param p_crit: float, fraction populations that may be indistinguishable from the 171 | highest expressing population for each gene. If zero, each marker gene is 172 | significantly higher expressed in one population relative to all others. 173 | If 0.1, 10% of populations may share high expression of a gene, and those 174 | populations will be marked as expressing that gene. 175 | 176 | """ 177 | if self.post_hoc is None: 178 | self.post_hoc_tests() 179 | 180 | # get highest mean for each gene 181 | top_gene_idx = np.argmax(self.post_hoc.mu, axis=1) 182 | 183 | # index p_adj first dimension with each sample, will reduce to 2d genes x samples 184 | top_gene_sig = self.post_hoc.p_adj[:, top_gene_idx, :] 185 | 186 | # for each gene, count the number of non-significant DE results. 187 | sig = np.array(top_gene_sig < self.alpha) 188 | num_sig = np.sum(sig, axis=2) 189 | 190 | # if this is greater than N - 1 * p_crit, discard the gene. 
191 | n = self.post_hoc.p_adj.shape[2] - 1 # number of genes, sub 1 for self 192 | idx_marker_genes = np.where(num_sig < n * (1 - p_crit)) 193 | marker_genes = sig[idx_marker_genes, :] 194 | 195 | # correctly index these genes 196 | if self.index: 197 | pass # todo fix this 198 | 199 | return marker_genes 200 | -------------------------------------------------------------------------------- /src/seqc/stats/correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class correlation: 6 | """Fast vectorized correlation methods 7 | 8 | :method vector(x, y): correlate each column in y with a vector in x 9 | :method map(x, y): correlate each column of x with each column in y 10 | :method eigv(evec, data): get pairwise correlations of eigenvectors 11 | with columns of data 12 | """ 13 | 14 | @staticmethod 15 | def vector(x: np.array, y: np.array): 16 | """ 17 | Correlate each column in y with a vector x 18 | 19 | :param x: np.ndarray vector of length n 20 | :param y: np.ndarray matrix of shape (n, k) 21 | :returns: vector of length n 22 | """ 23 | # x = x[:, np.newaxis] # for working with matrices 24 | mu_x = x.mean() # cells 25 | mu_y = y.mean(axis=0) # cells by gene --> cells by genes 26 | sigma_x = x.std() 27 | sigma_y = y.std(axis=0) 28 | 29 | return ((y * x).mean(axis=0) - mu_y * mu_x) / (sigma_y * sigma_x) 30 | 31 | @staticmethod 32 | def map(x: np.ndarray, y: np.ndarray): 33 | """Correlate each row of x with each row of y 34 | 35 | :param x: np.array; shape N x T. 36 | :param y: np.array; shape M x T. 37 | :returns: np.array; shape N x M in which each element is a correlation 38 | coefficient. 39 | """ 40 | assert(x.shape[1] == y.shape[1]) 41 | n = x.shape[1] 42 | x_diff = x - x.mean(axis=-1)[:, None] 43 | y_diff = y - y.mean(axis=-1)[:, None] 44 | x_std = x.std(axis=-1) 45 | y_std = y.std(axis=-1) 46 | return np.dot(x_diff, y_diff.T) / (n * x_std[:, np.newaxis] * y_std) 47 | 48 | @staticmethod 49 | def eigv(evec, data, components=tuple(), knn=10): 50 | """ 51 | get pairwise correlations of eigenvectors with columns in data 52 | 53 | :param evec: eigenvectors 54 | :param data: np.ndarray genes x cells data matrix 55 | :param components: which eigenvectors to select 56 | :param knn: number of neighbors to smooth gene expression values over 57 | :return: 58 | """ 59 | if isinstance(data, pd.DataFrame): 60 | D = data.values 61 | elif isinstance(data, np.ndarray): 62 | D = data 63 | else: 64 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 65 | 66 | # set components, remove zero if it was specified 67 | if not components: 68 | components = np.arange(evec.shape[1]) 69 | else: 70 | components = np.array(components) 71 | components = components[components != 0] 72 | 73 | eigv_corr = np.empty((D.shape[1], evec.shape[1]), dtype=np.float) 74 | 75 | for component_index in components: 76 | component_data = evec[:, component_index] 77 | 78 | order = np.argsort(component_data) 79 | x = pd.DataFrame(component_data[order]).rolling( 80 | window=knn, center=False).mean()[knn:].values 81 | # this fancy indexing will copy self.molecules 82 | vals = pd.DataFrame(D[order, :]).rolling( 83 | window=knn, center=False, axis=0).mean()[knn:].values 84 | eigv_corr[:, component_index] = correlation.vector(x, vals) 85 | 86 | # this is sorted by order, need it in original order (reverse the sort) 87 | eigv_corr = eigv_corr[:, components] 88 | if isinstance(data, pd.DataFrame): 89 | eigv_corr = 
pd.DataFrame(eigv_corr, index=data.columns, columns=components) 90 | return eigv_corr 91 | -------------------------------------------------------------------------------- /src/seqc/stats/experimental_yield.py: -------------------------------------------------------------------------------- 1 | class ExperimentalYield: 2 | 3 | output = ( 4 | '{divide}\nINPUT\n{divide}\n' 5 | 'Total input reads:\t{n_fastq}\n' 6 | '{divide}\nALIGNMENT (% FROM INPUT)\n{divide}\n' 7 | 'Total reads aligned:\t{n_sam} ({prop_al}%)\n' 8 | ' - Genomic alignments:\t{genomic} ({prop_gen}%)\n' 9 | ' - PhiX alignments:\t{phi_x} ({prop_phix}%)\n' 10 | ' - Transcriptome alignments:\t{trans} ({prop_trans}%)\n' 11 | '{divide}\nFILTERING (% FROM ALIGNMENT)\n{divide}\n' 12 | 'Genomic alignments:\t{genomic} ({bad_gen}%)\n' 13 | 'PhiX alignments:\t{phi_x} ({bad_phi}%)\n' 14 | 'Incorrect barcodes:\t{wrong_cb} ({bad_cb}%)\n' 15 | 'Missing cell barcodes/RMT:\t{no_cell} ({bad_cell}%)\n' 16 | 'N present in RMT:\t{rmt_N} ({bad_rmtN}%)\n' 17 | 'N present in CB:\t{cell_N} ({bad_cellN}%)\n' 18 | 'Insufficient poly(T):\t{poly_t} ({bad_polyt}%)\n' 19 | 'High dust score:\t{dust} ({bad_dust}%)\n' 20 | '{divide}\nCELL/MOLECULE COUNT DISTRIBUTION\n{divide}\n' 21 | 'Total molecules:\t\t{tot_mc}\n' 22 | 'Molecules lost:\t{mols_lost}\n' 23 | 'Cells lost:\t{cells_lost}\n' 24 | 'Cell description:\n{cell_desc}\n' 25 | '{divide}\nSUMMARY\n{divide}\n' 26 | 'Total retained reads:\t{n_good} ({prop_good}%)\n' 27 | 'Total reads unaligned:\t{lost_al} ({prop_un}%)\n' 28 | 'Total reads filtered:\t{n_bad} ({prop_bad}%)\n' 29 | '{divide}\n') 30 | 31 | @classmethod 32 | def construct_run_summary(cls, summary: dict): 33 | """ 34 | calculates basic loss statistics and constructs a summary 35 | that will be sent to the user after the SEQC run has completed. 
36 | 37 | :param summary: dictionary constructed during error correction 38 | :return: output of basic summary statistics 39 | """ 40 | if not summary: 41 | return 42 | 43 | # obtain values from summary 44 | n_fastq = summary['n_fastq'] 45 | n_sam = summary['n_sam'] 46 | genomic = summary['gene_0'] 47 | phix = summary['phi_x'] 48 | no_cell = summary['cell_0'] 49 | # no_rmt = summary['rmt_0'] 50 | rmt_N = summary['rmt_N'] 51 | cell_N = summary['cell_N'] 52 | dust = summary['dust'] 53 | poly_t = summary['poly_t'] 54 | tot_mc = summary['total_mc'] 55 | mols_lost = list(summary['mols_lost'].items()) 56 | cells_lost = list(summary['cells_lost'].items()) 57 | cell_desc = summary['cell_desc'].to_string() 58 | divide = '-' * 40 59 | 60 | # run summary will not be calculated if user started SEQC midway 61 | if n_fastq == 'NA' or n_sam == 'NA': 62 | return 63 | 64 | # calculate summary statistics 65 | trans = n_sam - genomic - phix 66 | prop_al = round((n_sam/n_fastq) * 100, 1) 67 | prop_gen = round((genomic/n_sam) * 100, 1) 68 | prop_phix = round((phix/n_sam) * 100, 1) 69 | prop_trans = round((trans/n_sam) * 100, 1) 70 | lost_al = n_fastq - n_sam 71 | prop_un = round(100 - prop_al, 1) 72 | n_bad = genomic + phix + no_cell + rmt_N + cell_N + poly_t + dust 73 | # n_bad = genomic + phix + no_cell + no_rmt + rmt_N + poly_t 74 | # wrong_cb does not apply to drop-seq 75 | try: 76 | wrong_cb = summary['cb_wrong'] 77 | n_bad += wrong_cb 78 | bad_cb = round((wrong_cb/n_bad) * 100, 1) 79 | except KeyError: 80 | wrong_cb = 0 81 | bad_cb = 0 82 | # continue with calculations 83 | n_good = n_sam - n_bad 84 | bad_gen = round((genomic/n_bad) * 100, 1) 85 | bad_phi = round((phix/n_bad) * 100, 1) 86 | bad_cell = round((no_cell/n_bad) * 100, 1) 87 | # bad_rmt = round((no_rmt/n_bad) * 100, 1) 88 | bad_rmtN = round((rmt_N/n_bad) * 100, 1) 89 | bad_cellN = round((cell_N/n_bad) * 100, 1) 90 | bad_polyt = round((poly_t/n_bad) * 100, 1) 91 | bad_dust = round((dust/n_bad) * 100, 1) 92 | prop_bad = round((n_bad/n_fastq) * 100, 1) 93 | prop_good = round((n_good/n_fastq) * 100, 1) 94 | 95 | # format output 96 | output = cls.output.format( 97 | n_fastq=n_fastq, n_sam=n_sam, genomic=genomic, phi_x=phix, no_cell=no_cell, 98 | wrong_cb=wrong_cb, rmt_N=rmt_N, poly_t=poly_t, divide=divide, 99 | prop_al=prop_al, prop_gen=prop_gen, prop_phix=prop_phix, lost_al=lost_al, 100 | n_bad=n_bad, n_good=n_good, prop_good=prop_good, prop_bad=prop_bad, 101 | prop_un=prop_un, bad_gen=bad_gen, bad_phi=bad_phi, bad_cb=bad_cb, 102 | bad_cell=bad_cell, bad_rmtN=bad_rmtN, bad_polyt=bad_polyt, trans=trans, 103 | cell_N=cell_N, bad_cellN=bad_cellN, dust=dust, bad_dust=bad_dust, 104 | prop_trans=prop_trans, tot_mc=tot_mc, mols_lost=mols_lost, 105 | cells_lost=cells_lost, cell_desc=cell_desc) 106 | return output 107 | -------------------------------------------------------------------------------- /src/seqc/stats/g_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from contextlib import closing 4 | from multiprocessing import Pool 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def _assign(d): 9 | """ 10 | 11 | :param np.ndarray d: 1d vector of scaled differences 12 | :return np.ndarray: 1d boolean gene-enrichment assignment vector 13 | """ 14 | km = KMeans(n_clusters=2) 15 | km.fit(d[:, np.newaxis]) 16 | assignments = km.labels_.astype(bool) 17 | if np.argmax(km.cluster_centers_) == 0: 18 | return assignments 19 | else: 20 | return ~assignments 21 | 22 | 23 | 
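# --- editor's illustrative sketch (not part of g_test.py) --------------------
# _assign above runs a two-cluster k-means on a 1-d vector of scaled
# differences and returns a boolean vector separating the two modes; the
# values below are hypothetical.
import numpy as np
from seqc.stats.g_test import _assign

d = np.array([0.1, 0.2, 0.15, 5.0, 5.2, 4.9])
groups = _assign(d)   # boolean array; the low and high modes receive opposite labels
# ------------------------------------------------------------------------------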
def g_test(data, labels, log=False): 24 | """ 25 | 26 | :param pd.DataFrame data: 27 | :param labels: 28 | :param log: 29 | :return: 30 | """ 31 | 32 | if log: 33 | data = np.log(data + 1) 34 | 35 | data = pd.DataFrame(data.values / data.values.sum(axis=1)[:, np.newaxis], 36 | index=labels, columns=data.columns) 37 | 38 | # calculate data that are useful for determining observed and expected values 39 | gene_sums = data.sum(axis=0) 40 | grouped = data.groupby(axis=0, level=0) # group only once 41 | category_sizes = grouped.size() 42 | category_fractions = category_sizes / category_sizes.sum() # normalize 43 | 44 | # get observed, expected 45 | expected = pd.DataFrame( 46 | data=np.dot(category_fractions.values[:, np.newaxis], 47 | gene_sums.values[np.newaxis, :]), 48 | index=category_sizes.index, 49 | columns=gene_sums.index) 50 | observed = grouped.sum() 51 | 52 | # scaled ratios are used in both g-test, and partitioning of expressed vs. not 53 | logratio = np.log(observed / expected) 54 | logratio.values[~np.isfinite(logratio.values)] = 0 55 | scaled_diff = observed * logratio 56 | 57 | g = 2 * np.sum(scaled_diff, axis=0) # g-test 58 | 59 | # todo only assign significant values 60 | # todo calculate significance 61 | with closing(Pool()) as pool: 62 | assignments = pool.map(_assign, scaled_diff.values.T) 63 | 64 | assignments = pd.DataFrame( 65 | data=np.vstack(assignments).T, 66 | index=category_sizes.index, 67 | columns=data.columns 68 | ) 69 | 70 | return g, assignments 71 | -------------------------------------------------------------------------------- /src/seqc/stats/graph_diffusion.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from scipy.sparse.linalg import eigs 4 | from numpy.linalg import norm 5 | from scipy.sparse import csr_matrix, find 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | 9 | class GraphDiffusion: 10 | def __init__(self, knn=10, normalization='smarkov', epsilon=1, 11 | n_diffusion_components=10): 12 | """ 13 | Run diffusion maps on the data. This implementation is based on the 14 | diffusion geometry library in Matlab: 15 | https://services.math.duke.edu/~mauro/code.html#DiffusionGeom and was implemented 16 | by Pooja Kathail 17 | 18 | :param knn: Number of neighbors for graph construction to determine distances 19 | between cells 20 | :param normalization: method for normalizing the matrix of weights 21 | 'bimarkov' force row and column sums to be 1 22 | 'markov' force row sums to be 1 23 | 'smarkov' symmetric conjugate to markov 24 | 'beltrami' Laplace-Beltrami normalization ala Coifman-Lafon 25 | 'sbeltrami' symmetric conjugate to beltrami 26 | 'FokkerPlanck' Fokker-Planck normalization 27 | 'sFokkerPlanck' symmetric conjugate to Fokker-Planck normalization 28 | :param epsilon: Gaussian standard deviation for converting distances to affinities 29 | :param n_diffusion_components: Number of diffusion components to generate 30 | """ 31 | if normalization not in ['bimarkov', 'smarkov', 'markov', 'sbeltrami', 'beltrami', 32 | 'FokkerPlanck', 'sFokkerPlanck']: 33 | raise ValueError( 34 | 'Unsupported normalization. 
Please refer to the docstring for the ' 35 | 'supported methods') 36 | 37 | self.knn = knn 38 | self.normalization = normalization 39 | self.epsilon = epsilon 40 | self.n_diffusion_components = n_diffusion_components 41 | self.eigenvectors = None 42 | self.eigenvalues = None 43 | self.diffusion_operator = None 44 | self.weights = None 45 | 46 | @staticmethod 47 | def keigs(T, k, P, take_diagonal=0): 48 | """ return k largest magnitude eigenvalues for the matrix T. 49 | :param T: Matrix to find eigen values/vectors of 50 | :param k: number of eigen values/vectors to return 51 | :param P: in the case of symmetric normalizations, 52 | this is the NxN diagonal matrix which relates the nonsymmetric 53 | version to the symmetric form via conjugation 54 | :param take_diagonal: if 1, returns the eigenvalues as a vector rather than as a 55 | diagonal matrix. 56 | """ 57 | D, V = eigs(T, k, tol=1e-4, maxiter=1000) 58 | D = np.real(D) 59 | V = np.real(V) 60 | inds = np.argsort(D)[::-1] 61 | D = D[inds] 62 | V = V[:, inds] 63 | if P is not None: 64 | V = P.dot(V) 65 | 66 | # Normalize 67 | for i in range(V.shape[1]): 68 | V[:, i] = V[:, i] / norm(V[:, i]) 69 | V = np.round(V, 10) 70 | 71 | if take_diagonal == 0: 72 | D = np.diag(D) 73 | 74 | return V, D 75 | 76 | @staticmethod # todo fix; what is S? 77 | def bimarkov(W, max_iters=100, abs_error=0.00001, **kwargs): 78 | """normalization method for GraphDiffusion""" 79 | 80 | if W.size == 0: 81 | return 82 | 83 | # process input 84 | if W.shape[0] != W.shape[1]: 85 | raise ValueError('Bimarkov.py: kernel must be NxN\n') 86 | 87 | N = W.shape[0] 88 | 89 | # initialize 90 | p = np.ones(N) 91 | 92 | # iterative 93 | for i in range(max_iters): 94 | 95 | S = np.ravel(W.sum(axis=1)) 96 | err = np.max(np.absolute(1.0 - np.max(S)), np.absolute(1.0 - np.min(S))) 97 | 98 | if err < abs_error: 99 | break 100 | 101 | D = csr_matrix((np.divide(1, np.sqrt(S)), (range(N), range(N))), shape=[N, N]) 102 | p *= S 103 | W = D.dot(W).dot(D) 104 | 105 | # iron out numerical errors 106 | T = (W + W.T) / 2 107 | return T, p 108 | 109 | @staticmethod 110 | def smarkov(D, N, W): 111 | """normalization method for GraphDiffusion""" 112 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 113 | P = D 114 | T = D.dot(W).dot(D) 115 | T = (T + T.T) / 2 116 | return T, P 117 | 118 | @staticmethod 119 | def markov(D, N, W): 120 | """normalization method for GraphDiffusion""" 121 | T = csr_matrix((D, (range(N), range(N))), shape=[N, N]).dot(W) 122 | return T, None 123 | 124 | @staticmethod 125 | def sbeltrami(D, N, W): 126 | """normalization method for GraphDiffusion""" 127 | P = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 128 | K = P.dot(W).dot(P) 129 | 130 | D = np.ravel(K.sum(axis=1)) 131 | D[D != 0] = 1 / D[D != 0] 132 | 133 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 134 | P = D 135 | T = D.dot(K).dot(D) 136 | 137 | T = (T + T.T) / 2 138 | return T, P 139 | 140 | @staticmethod 141 | def beltrami(D, N, W): 142 | """normalization method for GraphDiffusion""" 143 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 144 | K = D.dot(W).dot(D) 145 | 146 | D = np.ravel(K.sum(axis=1)) 147 | D[D != 0] = 1 / D[D != 0] 148 | 149 | V = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 150 | T = V.dot(K) 151 | return T, None 152 | 153 | @staticmethod 154 | def FokkerPlanck(D, N, W): 155 | """normalization method for GraphDiffusion""" 156 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 157 | K = D.dot(W).dot(D) 158 | 159 | D = 
np.ravel(K.sum(axis=1)) 160 | D[D != 0] = 1 / D[D != 0] 161 | 162 | D = csr_matrix((D, (range(N), range(N))), shape=[N, N]) 163 | T = D.dot(K) 164 | return T, None 165 | 166 | @staticmethod 167 | def sFokkerPlanck(D, N, W): 168 | """normalization method for GraphDiffusion""" 169 | print('(sFokkerPlanck) ... ') 170 | 171 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 172 | K = D.dot(W).dot(D) 173 | 174 | D = np.ravel(K.sum(axis=1)) 175 | D[D != 0] = 1 / D[D != 0] 176 | 177 | D = csr_matrix((np.sqrt(D), (range(N), range(N))), shape=[N, N]) 178 | P = D 179 | T = D.dot(K).dot(D) 180 | 181 | T = (T + T.T) / 2 182 | return T, P 183 | 184 | def fit(self, data, verbose=True): 185 | """ 186 | :param data: Data matrix of samples X features 187 | :param verbose: print progress report 188 | 189 | :return: Dictionary containing diffusion operator, weight matrix, 190 | diffusion eigen vectors, and diffusion eigen values 191 | """ 192 | if verbose: 193 | print('Running Diffusion maps with the following parameters:') 194 | print('Normalization: %s' % self.normalization) 195 | print('Number of nearest neighbors k: %d' % self.knn) 196 | print('Epsilon: %.4f' % self.epsilon) 197 | 198 | # Nearest neighbors 199 | start = time.process_time() 200 | N = data.shape[0] 201 | nbrs = NearestNeighbors(n_neighbors=self.knn).fit(data) 202 | distances, indices = nbrs.kneighbors(data) 203 | 204 | # Adjacency matrix 205 | rows = np.zeros(N * self.knn, dtype=np.int32) 206 | cols = np.zeros(N * self.knn, dtype=np.int32) 207 | dists = np.zeros(N * self.knn) 208 | location = 0 209 | for i in range(N): 210 | inds = range(location, location + self.knn) 211 | rows[inds] = indices[i, :] 212 | cols[inds] = i 213 | dists[inds] = distances[i, :] 214 | location += self.knn 215 | W = csr_matrix((dists, (rows, cols)), shape=[N, N]) 216 | 217 | # Symmetrize W 218 | W = W + W.T 219 | 220 | # Convert to affinity (with selfloops) 221 | rows, cols, dists = find(W) 222 | rows = np.append(rows, range(N)) 223 | cols = np.append(cols, range(N)) 224 | dists = np.append(dists / (self.epsilon ** 2), np.zeros(N)) 225 | W = csr_matrix((np.exp(-dists), (rows, cols)), shape=[N, N]) 226 | 227 | # Create D 228 | D = np.ravel(W.sum(axis=1)) 229 | D[D != 0] = 1 / D[D != 0] 230 | 231 | # Go through the various normalizations 232 | fnorm = getattr(self, self.normalization) 233 | T, P = fnorm(D=D, N=N, W=W) 234 | 235 | if self.normalization != 'bimarkov' and verbose: 236 | print('%.2f seconds' % (time.process_time() - start)) 237 | 238 | # Eigen value decomposition 239 | V, D = GraphDiffusion.keigs(T, self.n_diffusion_components, P, take_diagonal=1) 240 | self.eigenvectors = V 241 | self.eigenvalues = D 242 | self.diffusion_operator = T 243 | self.weights = W 244 | return {'operator': T, 'eigval': D, 'eigvec': V, 'weights': W} 245 | -------------------------------------------------------------------------------- /src/seqc/stats/gsea.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import shlex 4 | import glob 5 | import re 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.special import expit 9 | 10 | 11 | class GSEA: 12 | 13 | def __init__(self, correlations, output_stem=None): 14 | """initialize a gsea object 15 | :param pd.Series correlations: correlations in the range of [-1, 1] whose index 16 | contains gene names 17 | :param str output_stem: the filestem for the output data 18 | 19 | :method linear_scale: method to linearly scale a vector to lie 
on the interval 20 | [-1, 1] 21 | :method logisitc_scale: method to scale a vector by the logistic function to lie 22 | on the interval [-1, 1] 23 | :method run: run GSEA on these correlations 24 | """ 25 | if not isinstance(correlations, pd.Series): 26 | raise TypeError('correlations must be a pandas series') 27 | if not ((np.min(correlations) >= -1) & (np.max(correlations) <= 1)): 28 | raise RuntimeError( 29 | 'input correlations were not contained within the interval [-1, 1]. ' 30 | 'Please use JavaGSEA.linear_scale() or JavaGSEA.logistic_scale() to ' 31 | 'scale values to this interval before running.') 32 | self._correlations = correlations.sort_values() 33 | self._rnk = None 34 | if output_stem is None: 35 | self._output_stem = os.environ['TMPDIR'] + 'gsea_corr_{!s}'.format( 36 | np.random.randint(0, 1000000)) 37 | elif not isinstance(output_stem, str): 38 | raise TypeError('output stem must be a str reference to a file prefix') 39 | elif output_stem.find('-') > -1: 40 | raise ValueError('output_stem cannot contain the dash (-) character.') 41 | else: 42 | self._output_stem = output_stem 43 | self._results = {} 44 | 45 | @property 46 | def correlations(self): 47 | return self._correlations 48 | 49 | @correlations.setter 50 | def correlations(self): 51 | raise RuntimeError('Please create a new object to compare different correlations') 52 | 53 | @property 54 | def results(self): 55 | return self._results 56 | 57 | @staticmethod 58 | def linear_scale(data: pd.Series) -> pd.Series: 59 | """scale input vector to interval [-1, 1] using a linear scaling 60 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 61 | """ 62 | data = data.copy() 63 | data -= np.min(data, axis=0) 64 | data /= np.max(data, axis=0) / 2 65 | data -= 1 66 | return data 67 | 68 | @staticmethod 69 | def logistic_scale(data: pd.Series) -> pd.Series: 70 | """scale input vector to interval [-1, 1] using a sigmoid scaling 71 | :return correlations: pd.Series, data scaled to the interval [-1, 1] 72 | """ 73 | return pd.Series((expit(data.values) * 2) - 1, index=data.index) 74 | 75 | def _save_rank_file(self) -> None: 76 | """save the correlations to a .rnk file""" 77 | self._rnk = self._output_stem + '.rnk' 78 | df = pd.DataFrame(self._correlations).fillna(0) 79 | df.to_csv(self._rnk, sep='\t', header=False) 80 | 81 | @staticmethod 82 | def _gmt_options(): 83 | """ 84 | Private method. identifies GMT files available for mouse or human genomes 85 | :return: str, file options 86 | """ 87 | 88 | mouse_options = os.listdir(os.path.expanduser('~/.seqc/tools/mouse')) 89 | human_options = os.listdir(os.path.expanduser('~/.seqc/tools/human')) 90 | print('Available GSEA .gmt files:\n\nmouse:\n{m}\n\nhuman:\n{h}\n'.format( 91 | m='\n'.join(mouse_options), 92 | h='\n'.join(human_options))) 93 | print('Please specify the gmt_file parameter as gmt_file=(organism, filename)') 94 | 95 | def run(self, gmt_file): 96 | """ 97 | Helper function. Run GSEA on an already-ranked list of corrleations. 
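As a quick illustration of the two scaling helpers defined above, both of which map a correlation vector onto [-1, 1] as the constructor requires (toy values; the output_stem path is made up):

    import pandas as pd
    from seqc.stats.gsea import GSEA

    corr = pd.Series([3.0, -1.0, 0.5], index=['GeneA', 'GeneB', 'GeneC'])
    linear = GSEA.linear_scale(corr)        # min maps to -1, max to +1
    logistic = GSEA.logistic_scale(corr)    # sigmoid-squashed, then rescaled
    gsea = GSEA(linear, output_stem='/tmp/example_gsea')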
To see 98 | available files, leave gmt_file parameter empty 99 | 100 | :param (str, str) gmt_file: organism and filename of gmt file to use 101 | :return (pd.DataFrame, pd.DataFrame): positive and negative GSEA enrichments 102 | """ 103 | out_dir, out_prefix = os.path.split(self._output_stem) 104 | os.makedirs(out_dir, exist_ok=True) 105 | 106 | if self._rnk is None: 107 | self._save_rank_file() 108 | 109 | if not gmt_file: 110 | self._gmt_options() 111 | return 112 | else: 113 | if not len(gmt_file) == 2: 114 | raise ValueError('gmt_file should be a tuple of (organism, filename).') 115 | else: 116 | gmt_file = os.path.expanduser('~/.seqc/tools/{}/{}').format(*gmt_file) 117 | 118 | # Construct the GSEA call 119 | cmd = shlex.split( 120 | 'java -cp {user}/.seqc/tools/gsea2-2.2.1.jar -Xmx1g ' 121 | 'xtools.gsea.GseaPreranked -collapse false -mode Max_probe -norm meandiv ' 122 | '-nperm 1000 -include_only_symbols true -make_sets true -plot_top_x 0 ' 123 | '-set_max 500 -set_min 50 -zip_report false -gui false -rnk {rnk} ' 124 | '-rpt_label {out_prefix} -out {out_dir}/ -gmx {gmt_file}' 125 | ''.format(user=os.path.expanduser('~'), rnk=self._rnk, out_prefix=out_prefix, 126 | out_dir=out_dir, gmt_file=gmt_file)) 127 | 128 | # Call GSEA 129 | p = subprocess.Popen(cmd, stderr=subprocess.PIPE) 130 | _, err = p.communicate() 131 | 132 | # find the file that GSEA created 133 | if err: 134 | print(err.decode()) 135 | return 136 | else: 137 | pattern = '{p}.GseaPreranked.[0-9]*'.format(p=out_prefix) 138 | files = os.listdir(out_dir) 139 | folder = None 140 | for f in files: 141 | mo = re.match(pattern, f) 142 | if mo: 143 | folder = out_dir + '/' + mo.group(0) 144 | if folder is None: 145 | raise RuntimeError( 146 | 'seqc.JavaGSEA was not able to recover the output of the Java ' 147 | 'executable. This likely represents a bug.') 148 | 149 | # recover information from run 150 | names = ['size', 'es', 'nes', 'p', 'fdr_q', 'fwer_p', 'rank_at_max', 151 | 'leading_edge'] 152 | pos = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*pos*xls')[0], 153 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 154 | pos.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 155 | neg = pd.DataFrame.from_csv(glob.glob(folder + '/gsea*neg*xls')[0], 156 | sep='\t', infer_datetime_format=False, parse_dates=False).iloc[:, :-1] 157 | neg.drop(['GS
follow link to MSigDB', 'GS DETAILS'], axis=1, inplace=True) 158 | pos.columns, neg.columns = names, names 159 | self._results[gmt_file] = {'positive': pos, 'negative': neg} 160 | return list(self._results[gmt_file].values()) 161 | -------------------------------------------------------------------------------- /src/seqc/stats/mast.py: -------------------------------------------------------------------------------- 1 | import math 2 | import subprocess 3 | import imp 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def run_mast(counts_filtered, clustering_communities, output_prefix): 10 | # Differentially Expression Analysis using MAST 11 | log_counts = (counts_filtered + 1.0).applymap(math.log2) 12 | de_results = [] # array containing the differentially expression analysis for each cluster 13 | for c in range(np.max(clustering_communities) + 1): 14 | tmp_input_file = output_prefix + "_cluster_" + str(c) + "_mast_input.csv" 15 | tmp_output_file = output_prefix + "_cluster_" + str(c) + "_mast_results.csv" 16 | reduced_tdf1 = log_counts.iloc[np.where(clustering_communities == c)[0]] 17 | reduced_tdf2 = log_counts.iloc[np.where(clustering_communities != c)[0]] 18 | reduced_df = pd.concat([reduced_tdf1, reduced_tdf2]) 19 | reduced_df.index = pd.Index([1 if i < len(reduced_tdf1.index) else 0 for i in range(len(reduced_tdf1.index) + len(reduced_tdf2.index))]) 20 | reduced_df.to_csv(tmp_input_file) 21 | 22 | path_to_run_mast = imp.find_module('seqc')[1] 23 | args = 'Rscript {p} {i} {o}'.format(p=os.path.join(path_to_run_mast, 'run_mast.R'), i=tmp_input_file, o=tmp_output_file) 24 | with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as p: 25 | out, err = p.communicate() 26 | if os.path.isfile(tmp_output_file): 27 | de_gene_df = pd.read_csv(tmp_output_file) 28 | if len(de_gene_df.index) > 0: 29 | de_results.append(de_gene_df) 30 | else: # if no differentially expressed genes 31 | de_results.append(None) 32 | else: 33 | de_results.append(None) 34 | 35 | de_gene_list_file = output_prefix + "_de_gene_list.txt" 36 | with open(de_gene_list_file, "w") as f: 37 | f.write("Differential Expression Analysis Using MAST\n\n") 38 | c = 1 39 | for de_result in de_results: 40 | if de_result is not None: 41 | f.write("Differentially expressed genes for cluster %d:\n" % (c)) 42 | f.write("%-10s %-10s %-10s %-10s\n" % ("Gene", "p", "p.fdr", "logFC")) 43 | 44 | for i in range(len(de_result)): 45 | p_v = "%.2e" % de_result.loc[i][1] 46 | p_fdr = "%.2e" % de_result.loc[i][2] 47 | logFC = "%.2f" % de_result.loc[i][3] 48 | f.write("%-10s %-10s %-10s %-10s\n" % (de_result.loc[i][0], p_v, p_fdr, logFC)) 49 | else: 50 | f.write("No differentially expressed genes has been found for cluster %d.\n" % (c)) 51 | c += 1 52 | f.write("\n") 53 | f.close() 54 | return de_gene_list_file -------------------------------------------------------------------------------- /src/seqc/stats/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class PCA: 6 | 7 | def __init__(self, n_components=30): 8 | """ 9 | construct a model for Principle Component Analysis 10 | 11 | :param n_components: number of principle components to retain 12 | 13 | :property eigenvalues: stores the eigenvalues computed by fit() 14 | :property loadings: stores the eigenvectors of the pca decomposition computed by 15 | fit() 16 | :method fit: fit the model to the data 17 | :method transform: project the data onto a 
subset of the principle components 18 | (default: all components other than the first) 19 | :method fit_transform: fit and transform the data, returning the projected result 20 | """ 21 | self.n_components = n_components 22 | self.loadings = None 23 | self.eigenvalues = None 24 | 25 | def fit(self, data: np.ndarray or pd.DataFrame, fillna=0): 26 | """ 27 | Fit the model to data 28 | 29 | :param data: n observation x k feature data array 30 | :param fillna: fill np.NaN values with this value. If None, will not fill. 31 | :return: None 32 | """ 33 | 34 | if isinstance(data, pd.DataFrame): 35 | X = data.values 36 | elif isinstance(data, np.ndarray): 37 | X = data 38 | else: 39 | raise TypeError('data must be a pd.DataFrame or np.ndarray') 40 | 41 | if fillna is not None: 42 | X[np.where(np.isnan(X))] = fillna 43 | X[np.where(np.isinf(X))] = fillna 44 | 45 | # Compute covariance matrix 46 | if X.shape[1] < X.shape[0]: 47 | C = np.cov(X, rowvar=False) 48 | # if N > D, we better use this matrix for the eigendecomposition 49 | else: 50 | C = np.multiply((1 / X.shape[0]), np.dot(X, X.T)) 51 | 52 | # Perform eigendecomposition of C 53 | C[np.where(np.isnan(C))] = 0 54 | C[np.where(np.isinf(C))] = 0 55 | l, M = np.linalg.eig(C) 56 | 57 | # Sort eigenvectors in descending order 58 | ind = np.argsort(l)[::-1] 59 | l = l[ind] 60 | if self.n_components < 1: 61 | self.n_components = ( 62 | np.where(np.cumsum(np.divide(l, np.sum(l)), axis=0) >= 63 | self.n_components)[0][0] + 1) 64 | print('Embedding into ' + str(self.n_components) + ' dimensions.') 65 | elif self.n_components > M.shape[1]: 66 | self.n_components = M.shape[1] 67 | print('Target dimensionality reduced to ' + str(self.n_components) + '.') 68 | 69 | M = M[:, ind[:self.n_components]] 70 | l = l[:self.n_components] 71 | 72 | # Apply mapping on the data 73 | if X.shape[1] >= X.shape[0]: 74 | M = np.multiply(np.dot(X.T, M), (1 / np.sqrt(X.shape[0] * l)).T) 75 | 76 | self.loadings = M 77 | self.eigenvalues = l 78 | 79 | def transform(self, data, components=None) -> np.ndarray or pd.DataFrame: 80 | """ 81 | Transform data using the fit PCA model. 
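A minimal usage sketch on random data, assuming the class is importable as seqc.stats.pca.PCA (as tsne.py does below); note that transform() drops the first component unless components are given explicitly:

    import numpy as np
    import pandas as pd
    from seqc.stats.pca import PCA

    np.random.seed(0)
    data = pd.DataFrame(np.random.rand(100, 20))      # 100 cells x 20 features
    pca = PCA(n_components=5)
    projected = pca.fit_transform(data)               # components 1..4
    all_components = pca.transform(data, components=np.arange(5))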
82 | 83 | :param data: n observation x k feature data array 84 | :param components: components to retain when transforming 85 | data, if None, uses all components except for the first 86 | :return: np.ndarray containing transformed data 87 | """ 88 | 89 | if components is None: 90 | components = np.arange(1, self.n_components) 91 | 92 | projected = np.dot(data, self.loadings[:, components]) 93 | if isinstance(data, pd.DataFrame): 94 | return pd.DataFrame(projected, index=data.index, columns=components) 95 | else: 96 | return projected 97 | 98 | def fit_transform(self, data: np.ndarray or pd.DataFrame, n_components=None) -> \ 99 | np.ndarray or pd.DataFrame: 100 | """ 101 | Fit the model to data and transform the data using the fit model 102 | 103 | :param data: n observation x k feature data array 104 | :param n_components: number of components to retain when transforming 105 | data 106 | :return np.ndarray or pd.DataFrame: transformed data 107 | """ 108 | 109 | self.fit(data) 110 | return self.transform(data, components=n_components) 111 | -------------------------------------------------------------------------------- /src/seqc/stats/resampled_nonparametric.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | from multiprocessing import Pool 4 | from contextlib import closing 5 | from itertools import repeat 6 | import numpy as np 7 | import numpy.ma as ma 8 | import pandas as pd 9 | from scipy.stats.mstats import count_tied_groups, rankdata 10 | from scipy.stats.mstats import kruskalwallis as _kruskalwallis 11 | from scipy.special import erfc 12 | from statsmodels.sandbox.stats.multicomp import multipletests 13 | 14 | 15 | def get_memory(): 16 | """ 17 | """ 18 | return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 ** 3) 19 | 20 | 21 | def _mannwhitneyu(x, y, use_continuity=True): 22 | """ 23 | Computes the Mann-Whitney statistic 24 | Missing values in `x` and/or `y` are discarded. 25 | Parameters 26 | ---------- 27 | x : ndarray, 28 | Input, vector or observations x features matrix 29 | y : ndarray, 30 | Input, vector or observations x features matrix. If matrix, must have 31 | same number of features as x 32 | use_continuity : {True, False}, optional 33 | Whether a continuity correction (1/2.) should be taken into account. 34 | Returns 35 | ------- 36 | statistic : float 37 | The Mann-Whitney statistic 38 | approx z : float 39 | The normal-approximated z-score for U. 40 | pvalue : float 41 | Approximate p-value assuming a normal distribution. 42 | """ 43 | if x.ndim == 1 and y.ndim == 1: 44 | x, y = x[:, np.newaxis], y[:, np.newaxis] 45 | ranks = rankdata(np.concatenate([x, y]), axis=0) 46 | nx, ny = x.shape[0], y.shape[0] 47 | nt = nx + ny 48 | U = ranks[:nx].sum(0) - nx * (nx + 1) / 2. 49 | 50 | mu = (nx * ny) / 2. 51 | u = np.amin([U, nx*ny - U], axis=0) # get smaller U by convention 52 | 53 | sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12. 54 | 55 | for i in np.arange(len(sigsq)): 56 | ties = count_tied_groups(ranks[:, i]) 57 | sigsq[i] -= np.sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12. 58 | sigsq *= nx * ny / float(nt * (nt - 1)) 59 | 60 | if use_continuity: 61 | z = (U - 1 / 2. 
- mu) / np.sqrt(sigsq) 62 | else: 63 | z = (U - mu) / np.sqrt(sigsq) 64 | 65 | prob = erfc(abs(z) / np.sqrt(2)) 66 | return np.vstack([u, z, prob]).T 67 | 68 | 69 | def find_sampling_value(group_data, percentile): 70 | """ 71 | 72 | :param group_data: 73 | :param int percentile: 74 | :return: 75 | """ 76 | return min(np.percentile(g.sum(axis=1), percentile) for g in group_data) 77 | 78 | 79 | def normalize(data, downsample_value, upsample=False, labels=None): 80 | """ 81 | :param data: 82 | :param downsample_value: value to normalize cell counts to. In current implementation, 83 | a small number of cells (10%) are upsampled to this value. 84 | :param upsample: if False, all observations with size < downsample_value are excluded. 85 | if True, those cells are upsampled to downsample_value. 86 | :return: 87 | """ 88 | obs_size = data.sum(axis=1) 89 | if not upsample: 90 | keep = obs_size >= downsample_value 91 | data = data[keep, :] 92 | if labels is not None: 93 | labels = labels[keep] 94 | norm = (data * downsample_value) / data.sum(axis=1)[:, np.newaxis] 95 | if labels is not None: 96 | return norm, labels 97 | else: 98 | return norm 99 | 100 | 101 | def _draw_sample(normalized_data, n): 102 | """ 103 | :param normalized_data: 104 | :param n: 105 | """ 106 | np.random.seed() 107 | idx = np.random.randint(0, normalized_data.shape[0], n) 108 | sample = normalized_data[idx, :] 109 | p = np.random.sample(sample.shape) # round samples probabilistically 110 | 111 | return np.floor(sample) + (sample % 1 > p).astype(int) 112 | 113 | 114 | def _mw_sampling_function(norm_data, n_cell): 115 | """ 116 | :param norm_data: 117 | :param n_cell: 118 | :return: 119 | """ 120 | a, b = (_draw_sample(d, n_cell) for d in norm_data) 121 | return _mannwhitneyu(a, b) # dim = (n_genes, 3) 122 | 123 | 124 | def confidence_interval(z): 125 | """ 126 | 127 | :param z: 128 | :return: 129 | """ 130 | return np.percentile(z, [2.5, 97.5], axis=0).T 131 | 132 | 133 | def mannwhitneyu( 134 | x, y, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 135 | upsample=False): 136 | """ 137 | :param x: observations by features array or DataFrame (ndim must be 2, although there 138 | needn't be more than one feature) 139 | :param y: observations by features array or DataFrama. Features must be the same as x 140 | :param n_iter: number of times to sample x and y 141 | :param sampling_percentile: percentile to downsample to. observations with row sums 142 | lower than this value will be excluded 143 | :param alpha: significance threshold for FDR correction 144 | :param verbose: if True, report number of cells sampled in each iteration and the 145 | integer value to which cells are downsampled 146 | :param upsample: if False, cells with size lower than sampling_percentile are 147 | discarded. If True, those cells are upsampled. 148 | :return pd.DataFrame: DataFrame with columns: 149 | U: median u-statistic over the n_iter iterations of the test 150 | z_approx: median approximate tie-corrected z-score for the mann-whitney U-test 151 | z_lo: lower bound, 95% confidence interval over z 152 | z_hi: upper bound, 95% confidence interval over z 153 | p: p-value for z_approx 154 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 
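A small sketch of calling this resampled test on synthetic count matrices (a low n_iter keeps it fast; kruskalwallis below follows the same pattern but takes one matrix plus a label vector). The function uses the older pandas .ix indexer, so it assumes a correspondingly old pandas version:

    import numpy as np
    from seqc.stats.resampled_nonparametric import mannwhitneyu

    np.random.seed(0)
    a = np.random.poisson(5, size=(200, 50))    # 200 cells x 50 genes, condition A
    b = np.random.poisson(6, size=(300, 50))    # 300 cells x 50 genes, condition B
    results = mannwhitneyu(a, b, n_iter=10, verbose=True)
    print(results.head())                       # columns: U, z_approx, z_lo, z_hi, p, q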
155 | """ 156 | 157 | # do some sanity checks on input data 158 | if isinstance(x, pd.DataFrame) and isinstance(y, pd.DataFrame): 159 | assert np.array_equal(x.columns, y.columns) 160 | labels = x.columns 161 | x = x.values 162 | y = y.values 163 | elif x.ndim > 1: 164 | assert x.shape[1] == y.shape[1] 165 | labels = None 166 | else: 167 | labels = None 168 | 169 | # calculate sampling values 170 | v = find_sampling_value([x, y], sampling_percentile) 171 | norm_data = [normalize(d, v, upsample) for d in [x, y]] 172 | n_cell = min(d.shape[0] for d in norm_data) 173 | sampling_function = partial(_mw_sampling_function, n_cell=n_cell) 174 | 175 | if verbose: # report sampling values 176 | print('sampling %d cells (with replacement) per iteration' % n_cell) 177 | print('sampling %d molecules per cell' % v) 178 | 179 | with closing(Pool()) as pool: 180 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 181 | 182 | results = np.stack(results) # u, z, p 183 | 184 | ci = confidence_interval(results[:, :, 1]) 185 | results = pd.DataFrame( 186 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 187 | index=labels, 188 | columns=['U', 'z_approx', 'p', 'z_lo', 'z_hi']) 189 | 190 | # add multiple-testing correction 191 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 192 | 193 | # remove low-value genes whose median sampling value is -inf 194 | neginf = np.isneginf(results['z_approx']) 195 | results.ix[neginf, 'z_lo'] = np.nan 196 | results.ix[neginf, 'z_approx'] = 0 197 | results.ix[neginf, ['p', 'q']] = 1. 198 | 199 | results = results[['U', 'z_approx', 'z_lo', 'z_hi', 'p', 'q']].sort_values('q') 200 | results.iloc[:, 1:4] = np.round(results.iloc[:, 1:4], 2) 201 | 202 | return results 203 | 204 | 205 | def _kw_sampling_function(data, splits, n_cell): 206 | data = [_draw_sample(d, n_cell) for d in np.split(data, splits)] 207 | return _kruskal(data) 208 | 209 | 210 | def _kruskal(data): 211 | """ 212 | Compute the Kruskal-Wallis H-test for independent samples 213 | Parameters 214 | ---------- 215 | sample1, sample2, ... : array_like 216 | Two or more arrays with the sample measurements can be given as 217 | arguments. 218 | Returns 219 | ------- 220 | statistic : float 221 | The Kruskal-Wallis H statistic, corrected for ties 222 | pvalue : float 223 | The p-value for the test using the assumption that H has a chi 224 | square distribution 225 | Notes 226 | ----- 227 | For more details on `kruskal`, see `stats.kruskal`. 228 | """ 229 | results = [] 230 | for i in np.arange(data[0].shape[1]): 231 | args = [d[:, i] for d in data] 232 | try: 233 | results.append(_kruskalwallis(*args)) 234 | except ValueError: 235 | results.append([0, 1.]) 236 | return results 237 | 238 | 239 | def category_to_numeric(labels): 240 | """transform categorical labels to a numeric array""" 241 | labels = np.array(labels) 242 | if np.issubdtype(labels.dtype, np.integer): 243 | return labels 244 | else: 245 | cats = np.unique(labels) 246 | map_ = dict(zip(cats, np.arange(cats.shape[0]))) 247 | return np.array([map_[i] for i in labels]) 248 | 249 | 250 | def kruskalwallis( 251 | data, labels, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False, 252 | upsample=False): 253 | """ 254 | :param data: np.ndarray or pd.DataFrame of observations x features 255 | :param labels: observation labels for categories to be compared 256 | :param n_iter: number of times to sample x and y 257 | :param sampling_percentile: percentile to downsample to. 
observations with row sums 258 | lower than this value will be excluded 259 | :param alpha: significance threshold for FDR correction 260 | :param verbose: if True, report number of cells sampled in each iteration and the 261 | integer value to which cells are downsampled 262 | :param upsample: if False, cells with size lower than sampling_percentile are 263 | discarded. If True, those cells are upsampled. 264 | :return pd.DataFrame: DataFrame with columns: 265 | H: median u-statistic over the n_iter iterations of the test 266 | z_approx: median approximate tie-corrected z-score for the mann-whitney U-test 267 | z_lo: lower bound, 95% confidence interval over z 268 | z_hi: upper bound, 95% confidence interval over z 269 | p: p-value for z_approx 270 | q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR. 271 | """ 272 | 273 | if isinstance(data, pd.DataFrame): 274 | features = data.columns 275 | data = data.values 276 | elif isinstance(data, np.ndarray): 277 | features = None 278 | else: 279 | raise ValueError('data must be a np.ndarray or pd.DataFrame, not %s' % 280 | repr(type(data))) 281 | 282 | # if labels are not numeric, transform to numeric categories 283 | labels = category_to_numeric(labels) 284 | if not labels.shape[0] == data.shape[0]: 285 | raise ValueError('labels (shape=%s) must match dimension 0 of data (shape=%s)' % 286 | (repr(labels.shape), repr(labels.data))) 287 | 288 | idx = np.argsort(labels) 289 | data = data[idx, :] # will copy 290 | labels = labels[idx] 291 | 292 | splits = np.where(np.diff(labels))[0] + 1 293 | 294 | # calculate sampling values and downsample data 295 | v = find_sampling_value(np.split(data, splits), sampling_percentile) 296 | norm_data, labels = normalize(data, v, upsample, labels) 297 | 298 | splits = np.where(np.diff(labels))[0] + 1 # rediff, norm_data causes loss 299 | 300 | n_cell = min(d.shape[0] for d in np.split(norm_data, splits)) 301 | sampling_function = partial(_kw_sampling_function, n_cell=n_cell, splits=splits) 302 | 303 | if verbose: # report sampling values 304 | print('sampling %d cells (with replacement) per iteration' % n_cell) 305 | print('sampling %d molecules per cell' % v) 306 | 307 | with closing(Pool()) as pool: 308 | results = pool.map(sampling_function, repeat(norm_data, n_iter)) 309 | 310 | results = np.stack(results) # H, p 311 | 312 | ci = confidence_interval(results[:, :, 0]) # around H 313 | results = pd.DataFrame( 314 | data=np.concatenate([np.median(results, axis=0), ci], axis=1), 315 | index=features, 316 | columns=['H', 'p', 'H_lo', 'H_hi']) 317 | 318 | results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1] 319 | results = results[['H', 'H_lo', 'H_hi', 'p', 'q']] 320 | return results 321 | 322 | -------------------------------------------------------------------------------- /src/seqc/stats/smoothing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import multiprocessing 4 | from sklearn.neighbors import NearestNeighbors 5 | 6 | 7 | class smoothing: 8 | """Data smoothing kernels 9 | 10 | :method kneighbors: transforms each observation (row) of data by setting it 11 | equal to the average of its k-nearest neighbors 12 | """ 13 | 14 | @staticmethod 15 | def kneighbors(data: np.array or pd.DataFrame, n_neighbors=50, pca=None, **kwargs): 16 | """ 17 | Smooth gene expression values by setting the expression of each gene in each 18 | cell equal to the mean value of itself and its 
n_neighbors 19 | 20 | :param data: np.ndarray | pd.DataFrame; genes x cells array 21 | :param n_neighbors: int; number of neighbors to smooth over 22 | :param pca: dimensionality reduced matrix, knn will be run on this and applied 23 | to data (runs much faster) 24 | :param kwargs: keyword arguments to pass sklearn.NearestNeighbors 25 | :return: np.ndarray | pd.DataFrame; same as input 26 | """ 27 | 28 | if isinstance(data, pd.DataFrame): 29 | data_ = data.values 30 | elif isinstance(data, np.ndarray): 31 | data_ = data 32 | else: 33 | raise TypeError("data must be a pd.DataFrame or np.ndarray") 34 | 35 | knn = NearestNeighbors( 36 | n_neighbors=n_neighbors, 37 | n_jobs=multiprocessing.cpu_count() - 1, 38 | **kwargs) 39 | 40 | if pca is not None: 41 | knn.fit(pca) 42 | inds = knn.kneighbors(pca, return_distance=False) 43 | else: 44 | knn.fit(data_) 45 | inds = knn.kneighbors(data_, return_distance=False) 46 | 47 | # smoothing creates large intermediates; break up to avoid memory errors 48 | pieces = [] 49 | num_partitions = np.round(data_.shape[0] / 2000) + 1 50 | if num_partitions > 2: # 2 partitions produces start + end, need a third to split 51 | sep = np.linspace(0, data_.shape[0] + 1, num_partitions, dtype=int) 52 | for start, end in zip(sep, sep[1:]): 53 | pieces.append(data_[inds[start:end, :], :].mean(axis=1)) 54 | res = np.vstack(pieces) 55 | else: 56 | res = data_[inds, :].mean(axis=1) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | res = pd.DataFrame(res, index=data.index, columns=data.columns) 60 | 61 | return res 62 | -------------------------------------------------------------------------------- /src/seqc/stats/tree.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Tree: 4 | 5 | def __init__(self, id, left=None, right=None, dist=None): 6 | self.id = id 7 | self.left = left 8 | self.right = right 9 | self.dist = dist 10 | 11 | def __repr__(self): 12 | return '' % ( 13 | self.id, 14 | self.left.id if self.left is not None else None, 15 | self.right.id if self.left is not None else None, 16 | self.dist if self.dist is not None else None) 17 | 18 | @classmethod 19 | def from_linkage(cls, Z): 20 | current_id = Z.shape[0] * 2 21 | tree = {} 22 | for (left, right, dist, n_children) in Z[::-1]: 23 | tree[left] = Tree(id=left) 24 | tree[right] = Tree(id=right) 25 | if current_id not in tree: 26 | tree[current_id] = Tree(id=current_id, left=tree[left], right=tree[right], dist=dist) 27 | else: 28 | tree[current_id].left = tree[left] 29 | tree[current_id].right = tree[right] 30 | tree[current_id].dist = dist 31 | current_id -= 1 32 | return tree[max(tree.keys())] 33 | 34 | def is_leaf(self): 35 | return True if self.left is None and self.right is None else False 36 | 37 | @staticmethod 38 | def nodes2labels(nodes): 39 | return [n.id for n in nodes] 40 | 41 | def get_daughter(self, id_): 42 | for daughter in self.dfs(): 43 | if daughter.id == id_: 44 | return daughter 45 | return None 46 | 47 | def has_daughter(self, id_): 48 | for daughter in self.dfs(): 49 | if daughter.id == id_: 50 | return True 51 | return False 52 | 53 | def dfs(self): 54 | visited, stack = [], [self] 55 | while stack: 56 | vertex = stack.pop() 57 | yield vertex 58 | if vertex not in visited: 59 | visited.append(vertex) 60 | if vertex.left is not None: 61 | stack.append(vertex.left) 62 | if vertex.right is not None: 63 | stack.append(vertex.right) 64 | 65 | def bfs(self): 66 | visited, queue = [], [self] 67 | while queue: 68 | vertex = queue.pop(0) 69 | yield 
vertex 70 | if vertex not in visited: 71 | visited.append(vertex) 72 | if vertex.left is not None: 73 | queue.append(vertex.left) 74 | if vertex.right is not None: 75 | queue.append(vertex.right) 76 | -------------------------------------------------------------------------------- /src/seqc/stats/tsne.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import bhtsne 4 | from seqc.stats.pca import PCA 5 | 6 | class TSNE: 7 | 8 | def __init__(self, n_components: int=2, run_pca: bool=False, 9 | n_pca_components: int=20, fillna: float=None, **kwargs): 10 | """ 11 | t-stochastic neighbor embedding 12 | 13 | 14 | :param normalize: if True, scales features to unit size 15 | :param run_pca: if True, runs PCA on the input data and runs tSNE on the 16 | components retained by PCA. 17 | :param n_components: number of tSNE components to return 18 | :param n_pca_components: number of components to which data should be projected, 19 | if run_pca is True 20 | :param fillna: fills np.nan values with this float value 21 | :param kwargs: additional keyword arguments to pass tsne 22 | 23 | :method fit_transform: fits the tSNE model to data and returns the transformed 24 | result 25 | 26 | """ 27 | 28 | self.run_pca = run_pca 29 | self.n_components = n_components 30 | self.n_pca_components = n_pca_components 31 | self.kwargs = kwargs 32 | self.tsne = None 33 | self.pca = None 34 | self.fillna = fillna 35 | 36 | def fit_transform(self, data: np.ndarray or pd.DataFrame) -> None: 37 | """ 38 | fit the tSNE model to data given the parameters provided during 39 | initialization and transform the output 40 | 41 | :param data: n observation x k feature data array 42 | :return np.ndarray or pd.DataFrame: tsne results 43 | """ 44 | if isinstance(data, pd.DataFrame): 45 | data_ = data.values 46 | else: 47 | data_ = data 48 | 49 | if self.fillna is not None: 50 | data_[np.where(np.isnan(data_))] = self.fillna 51 | data_[np.where(np.isinf(data_))] = self.fillna 52 | if self.run_pca: 53 | self.pca = PCA(n_components=self.n_pca_components) 54 | data_ = self.pca.fit_transform(data_) 55 | 56 | res = bhtsne.tsne(data_.astype(float), dimensions=self.n_components, **self.kwargs) 57 | 58 | if isinstance(data, pd.DataFrame): 59 | self.tsne = pd.DataFrame(res, index=data.index) 60 | else: 61 | self.tsne = res 62 | return self.tsne 63 | -------------------------------------------------------------------------------- /src/seqc/stats/ttest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections.abc import Callable 3 | from multiprocessing import Pool, cpu_count 4 | from functools import partial 5 | from contextlib import closing 6 | from scipy.stats import t 7 | import pandas as pd 8 | from statsmodels.sandbox.stats.multicomp import multipletests 9 | 10 | 11 | def estimate_multinomial(x): 12 | """estimate empirical multinomial expectation for a set of cells with each cell 13 | normalized to contribute equally to the expectation. 
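For example, with two toy cells of very different sizes the small cell still contributes equally:

    import numpy as np

    x = np.array([[10.,  0., 10.],
                  [ 1.,  1.,  2.]])
    p = (x / x.sum(axis=1)[:, np.newaxis]).mean(axis=0)
    # p == array([0.375, 0.125, 0.5])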
14 | 15 | :param np.ndarray x: cell x gene array containing expression data 16 | :return np.ndarray: multinomial expectation over genes of x 17 | """ 18 | return (x / x.sum(axis=1)[:, np.newaxis]).mean(axis=0) 19 | 20 | 21 | def assert_input_non_negative(*args): 22 | """ 23 | :param [np.ndarray] args: input numpy arrays 24 | :return None: 25 | """ 26 | if any(np.any(np.less(a, 0)) for a in args): 27 | raise ValueError('input data must be non-negative') 28 | 29 | 30 | def _sampling_function(n_iter, n_molecules, theta, n_cells): 31 | """ 32 | 33 | :param n_iter: 34 | :param n_molecules: 35 | :param theta: 36 | :param n_cells: 37 | :return: 38 | """ 39 | 40 | def online_mean_var(nb, mu_b, var_b, na, mu_a, var_a): 41 | nx = na + nb 42 | delta = mu_b - mu_a 43 | mu_x_ = mu_a + delta * nb / nx 44 | var_x_ = (na * (var_a + mu_a ** 2) + nb * (var_b + mu_b ** 2)) / nx - mu_x_ ** 2 45 | return nx, mu_x_, var_x_ 46 | 47 | res_mu = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 48 | res_var = np.zeros((n_iter, theta.shape[0]), dtype=np.float32) 49 | n_cells //= 10 50 | for i in np.arange(n_iter): 51 | # break sampling (n_cells) into 10 pieces 52 | obs = np.random.multinomial(n_molecules, theta, n_cells) 53 | mu_x = np.mean(obs, axis=0) 54 | var_x = np.mean(obs, axis=0) 55 | n_x = obs.shape[0] 56 | for _ in np.arange(9): 57 | obs = np.random.multinomial(n_molecules, theta, n_cells) 58 | mu = np.mean(obs, axis=0) 59 | var = np.mean(obs, axis=0) 60 | n = obs.shape[0] 61 | n_x, mu_x, var_x = online_mean_var(n, mu, var, n_x, mu_x, var_x) 62 | res_mu[i, :] = mu_x 63 | res_var[i, :] = var_x / n_x 64 | return res_mu, res_var 65 | 66 | 67 | def sample_moments(mult_probability, n_samples, n_cells, n_molecules): 68 | """sample mean and variance of n_cells, each containing n_molecules. n_samples mean/ 69 | variance pairs are sampled on each call. 70 | 71 | :param mult_probability: 72 | :param n_samples: 73 | :param n_cells: 74 | :param n_molecules: 75 | :return: 76 | """ 77 | 78 | # parition iterations among available compute cores 79 | ncpu = cpu_count() 80 | if n_samples > ncpu: 81 | samples_per_process = np.array([n_samples // ncpu] * ncpu) 82 | samples_per_process[:n_samples % ncpu] += 1 83 | else: 84 | samples_per_process = np.ones((n_samples,)) 85 | 86 | # map iterations across compute cores 87 | sampler = partial( 88 | _sampling_function, n_molecules=n_molecules, theta=mult_probability, 89 | n_cells=n_cells) 90 | with closing(Pool(ncpu)) as pool: 91 | results = pool.map(sampler, samples_per_process) 92 | mu, var = (np.vstack(mats) for mats in zip(*results)) 93 | 94 | # all means should be finite 95 | assert np.sum(np.isnan(mu)) == 0 96 | 97 | # in cases where variance is np.nan, we can safely set the variance to zero since the 98 | # mean for that tissue will also be zero; this will eliminate singularities caused by 99 | # one tissue never expressing a protein. 
100 | var[np.isnan(var)] = 0 101 | 102 | return mu, var 103 | 104 | 105 | def whelch_satterthwaite_df(a_var, b_var, a_n, b_n): 106 | t1 = a_var.mean(axis=0) 107 | t2 = b_var.mean(axis=0) 108 | numerator = (t1 / a_n + t2 / b_n) ** 2 109 | denominator = t1 ** 2 / (a_n ** 2 * (a_n - 1)) + t2 ** 2 / (b_n ** 2 * (b_n - 1)) 110 | df = numerator / denominator 111 | return df 112 | 113 | 114 | def whelchs_t(a_mu, a_var, b_mu, b_var, a_n, b_n): 115 | """ 116 | 117 | :param np.ndarray a_mu: 118 | :param np.ndarray a_var: 119 | :param np.ndarray b_mu: 120 | :param np.ndarray b_var: 121 | :param int a_n: 122 | :param int b_n: 123 | :return float, float: statistic and p-value 124 | """ 125 | df = whelch_satterthwaite_df(a_var, b_var, a_n, b_n) 126 | numerator = a_mu - b_mu # (samples, genes) 127 | denominator = np.sqrt(a_var + b_var) # (samples, genes) 128 | statistic = numerator / denominator # (samples, genes) 129 | 130 | # statistic has NaNs where there are no observations of a or b (DivideByZeroError) 131 | statistic[np.isnan(statistic)] = 0 132 | median_statistic = np.median(np.abs(statistic), axis=0) 133 | p = (1 - t.cdf(median_statistic, df)) * 2 # p-value 134 | ci_95 = np.percentile(np.abs(statistic), [2.5, 97.5], axis=0).T 135 | 136 | return median_statistic, p, ci_95 137 | 138 | 139 | def bootstrap_t(a, b, n_samples=100, n_cells=None, alpha=0.05, 140 | downsample_value_function=np.median, labels=None): 141 | """ 142 | 143 | :param np.ndarray a: 144 | :param np.ndarray b: 145 | :param int n_samples: 146 | :param int n_cells: 147 | :param float alpha: acceptable type-I error (default = 0.05) 148 | :param Callable downsample_value_function: function that identifies the number of 149 | molecules n to sample from a and b. the sampling number will be the minimum of the 150 | result across a and b. default = np.median. Other values include np.mean and np.max. 151 | :param labels: feature labels for columns of a & b 152 | :return (int, int) statistic, q_val: 153 | """ 154 | assert_input_non_negative(a, b) 155 | mult_a = estimate_multinomial(a) 156 | mult_b = estimate_multinomial(b) 157 | 158 | # get number of molecules to sample 159 | a_sizes = a.sum(axis=1) 160 | b_sizes = b.sum(axis=1) 161 | n_molecules = min( 162 | map(lambda x: downsample_value_function(x).astype(int), [a_sizes, b_sizes])) 163 | 164 | # set n_cells to the smaller of the two passed samples (e.g. if comparing two sets, 165 | # one with 130 cells, and one with 1902 cells, n_cells = 130). 
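A minimal end-to-end sketch of bootstrap_t on synthetic counts (default n_samples; scipy and statsmodels are assumed to be installed, and the optional gene labels are omitted):

    import numpy as np
    from seqc.stats.ttest import bootstrap_t

    np.random.seed(0)
    a = np.random.poisson(5, size=(130, 40))    # 130 cells x 40 genes
    b = np.random.poisson(6, size=(190, 40))    # 190 cells x 40 genes
    res = bootstrap_t(a, b)
    print(res.sort_values('q').head())          # columns: t, t_ci95_low, t_ci95_high, p, q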
166 | if n_cells is None: 167 | n_cells = min(a.shape[0], b.shape[0]) 168 | 169 | a_mu, a_var = sample_moments(mult_a, n_samples, n_cells, n_molecules) 170 | b_mu, b_var = sample_moments(mult_b, n_samples, n_cells, n_molecules) 171 | 172 | statistic, p, ci_95 = whelchs_t(a_mu, a_var, b_mu, b_var, a.shape[0], b.shape[0]) 173 | 174 | q = multipletests(p, alpha=alpha, method='fdr_tsbh')[1] 175 | 176 | results = pd.DataFrame( 177 | data=np.vstack([statistic, ci_95.T, p, q]).T, 178 | index=labels, 179 | columns=['t', 't_ci95_low', 't_ci95_high', 'p', 'q']) 180 | 181 | return results 182 | -------------------------------------------------------------------------------- /src/seqc/summary/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/css/simple-sidebar.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Start Bootstrap - Simple Sidebar (http://startbootstrap.com/) 3 | * Copyright 2013-2016 Start Bootstrap 4 | * Licensed under MIT (https://github.com/BlackrockDigital/startbootstrap/blob/gh-pages/LICENSE) 5 | */ 6 | 7 | body { 8 | overflow-x: hidden; 9 | } 10 | 11 | /* Toggle Styles */ 12 | 13 | #wrapper { 14 | padding-left: 0; 15 | -webkit-transition: all 0.5s ease; 16 | -moz-transition: all 0.5s ease; 17 | -o-transition: all 0.5s ease; 18 | transition: all 0.5s ease; 19 | } 20 | 21 | #wrapper.toggled { 22 | padding-left: 250px; 23 | } 24 | 25 | #sidebar-wrapper { 26 | z-index: 1000; 27 | position: fixed; 28 | left: 250px; 29 | width: 0; 30 | height: 100%; 31 | margin-left: -250px; 32 | overflow-y: auto; 33 | background: #000; 34 | -webkit-transition: all 0.5s ease; 35 | -moz-transition: all 0.5s ease; 36 | -o-transition: all 0.5s ease; 37 | transition: all 0.5s ease; 38 | } 39 | 40 | #wrapper.toggled #sidebar-wrapper { 41 | width: 250px; 42 | } 43 | 44 | #page-content-wrapper { 45 | width: 100%; 46 | position: absolute; 47 | padding: 15px; 48 | } 49 | 50 | #wrapper.toggled #page-content-wrapper { 51 | position: absolute; 52 | margin-right: -250px; 53 | } 54 | 55 | /* Sidebar Styles */ 56 | 57 | .sidebar-nav { 58 | position: absolute; 59 | top: 0; 60 | width: 250px; 61 | margin: 0; 62 | padding: 0; 63 | list-style: none; 64 | } 65 | 66 | .sidebar-nav li { 67 | text-indent: 20px; 68 | line-height: 40px; 69 | } 70 | 71 | .sidebar-nav li a { 72 | display: block; 73 | text-decoration: none; 74 | color: #999999; 75 | } 76 | 77 | .sidebar-nav li a:hover { 78 | text-decoration: none; 79 | color: #fff; 80 | background: rgba(255,255,255,0.2); 81 | } 82 | 83 | .sidebar-nav li a:active, 84 | .sidebar-nav li a:focus { 85 | text-decoration: none; 86 | } 87 | 88 | .sidebar-nav > .sidebar-brand { 89 | height: 65px; 90 | font-size: 18px; 91 | line-height: 60px; 92 | } 93 | 94 | .sidebar-nav > .sidebar-brand a { 95 | color: #999999; 96 | } 97 | 98 | .sidebar-nav > .sidebar-brand a:hover { 99 | color: #fff; 100 | background: none; 101 | } 102 | 103 | @media(min-width:768px) { 104 | #wrapper { 105 | padding-left: 250px; 106 | } 107 | 108 | #wrapper.toggled { 109 | padding-left: 0; 110 | } 111 | 112 | #sidebar-wrapper { 113 | width: 250px; 114 | } 115 | 116 | #wrapper.toggled #sidebar-wrapper { 117 | width: 0; 118 | } 119 | 120 | #page-content-wrapper { 121 | padding: 20px; 122 | position: relative; 123 | } 124 
| 125 | #wrapper.toggled #page-content-wrapper { 126 | position: relative; 127 | margin-right: 0; 128 | } 129 | } -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/seqc/summary/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/seqc/summary/html_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/html_/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/img/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/img/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/static/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/summary/static/__init__.py -------------------------------------------------------------------------------- /src/seqc/summary/templates/mini_summary_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {{output_prefix}} Mini Summary 4 | 5 | 6 | 7 |

{{output_prefix}} Mini Summary

8 |

Overall Statistics

9 | 10 | 11 | {% if mini_summary_d['uniqmapped_pct'] == 'N/A' %} 12 | 13 | 14 | 15 | 16 | {% else %} 17 | 18 | 19 | 20 | 21 | {% endif %} 22 | 23 | 24 | 25 | 26 | 27 | 28 | {% if 'mt_rna_fraction' in mini_summary_d %} 29 | 30 | {% endif %} 31 |
# Reads:{{mini_summary_d['n_reads']}}
% of uniquely mapped reads:N/A
% of multi-mapped reads:N/A
% of unmapped reads:N/A
% of filtered reads mapping to genome:N/A
% of uniquely mapped reads:{{'%.2f%%' % mini_summary_d['uniqmapped_pct']}}
% of multi-mapped reads:{{'%.2f%%' % mini_summary_d['multimapped_pct']}}
% of unmapped reads:{{'%.2f%%' % mini_summary_d['unmapped_pct']}}
% of filtered reads mapping to genome:{{'%.2f%%' % mini_summary_d['genomic_read_pct']}}
Sequencing saturation rate:{{'%.2f%%' % mini_summary_d['seq_sat_rate']}}
 
# Cells:{{'%d' % mini_summary_d['n_cells']}}
Median molecules per cell:{{'%d' % mini_summary_d['med_molcs_per_cell']}}
Average reads per cell:{{'%d' % mini_summary_d['avg_reads_per_cell']}}
Average reads per molecule:{{'%.2f' % mini_summary_d['avg_reads_per_molc']}}
% of cells filtered by high mt-RNA content:{{'%.2f%%' % mini_summary_d['mt_rna_fraction']}}
32 | 33 |

Cell Size Distribution

34 |
35 | 36 |
37 |

Filtering

38 | Indian red indicates cells that have been filtered
39 |
40 | 41 | 42 |
43 |

PCA Components

44 |
45 | 46 |
47 |

Phenograph Clustering

48 | Library size has been regressed out of all PCA components. We ran the Phenograph clustering algorithm on the dataset with the revised PCA components and 80 nearest neighbors.

49 |
50 | 51 |

Warnings

52 | 53 | {% for w,m in warning_d.items() %} 54 | 55 | {% endfor %} 56 |
{{w}}:{{m}}
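These are Jinja-style templates; a sketch of rendering the mini summary with the jinja2 package, filling only the variables visible above (the stripped markup may reference additional variables such as embedded figures, and every value below is illustrative):

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('src/seqc/summary/templates'))
    template = env.get_template('mini_summary_base.html')
    html = template.render(
        output_prefix='my_run',
        mini_summary_d={
            'n_reads': 1_000_000, 'uniqmapped_pct': 'N/A', 'seq_sat_rate': 45.2,
            'n_cells': 1200, 'med_molcs_per_cell': 2500,
            'avg_reads_per_cell': 800, 'avg_reads_per_molc': 1.6,
        },
        warning_d={'High mt-RNA content': 'check cell viability'},
    )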
57 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | SEQC report 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | 33 | 34 | 46 | 47 | 48 | 49 |
50 | {% block content %}{% endblock %} 51 |
52 | 53 |
54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /src/seqc/summary/templates/section_content.html: -------------------------------------------------------------------------------- 1 | {% extends "section_base.html" %} 2 | {% block content %} 3 |

{{section.name}}

4 | 5 |
6 | {% for name, c in section.content.items() %} 7 | 8 |

{{name}}

9 | 10 | {% if c.keys is defined %} 11 |
12 | {% for k in c.keys %} 13 | {{k}}
14 | {% endfor %} 15 |
16 |
17 | {% for v in c.values %} 18 | {{v}}
19 | {% endfor %} 20 |
21 | {% elif c.text is defined %} 22 |
23 | {{c.text}} 24 |
25 | {% elif c.image is defined %} 26 |
27 | {{c.caption}} 28 |
29 |
30 | {{c.legend}} 31 |
32 | {% endif %} 33 | 34 | {% endfor %} 35 |
36 | {% endblock %} -------------------------------------------------------------------------------- /src/seqc/summary/test.py: -------------------------------------------------------------------------------- 1 | import nose2 2 | import unittest 3 | from seqc.summary import summary 4 | from collections import OrderedDict 5 | 6 | 7 | class TestSummary(unittest.TestCase): 8 | 9 | def test_render_section(self): 10 | s1 = summary.Section.from_alignment_summary( 11 | '/var/folders/y3/ysxvl2w921d881nfpvx5ypvh0000gn/T/seqc/test_no_aws_in_drop_v2' 12 | '/alignment_summary.txt') 13 | s1.render('./src/seqc/summary/test_summary.html') 14 | 15 | if __name__ == "__main__": 16 | nose2.main() 17 | -------------------------------------------------------------------------------- /src/seqc/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpeerlab/seqc/d07836e430d56d2304e70bc042b483e8cbe22e00/src/seqc/tests/__init__.py -------------------------------------------------------------------------------- /src/seqc/tests/test_args.py: -------------------------------------------------------------------------------- 1 | import nose2 2 | import unittest 3 | 4 | import seqc 5 | from seqc.core import main 6 | 7 | 8 | # class TestSEQC(unittest.TestCase): 9 | # def setUp(self): 10 | # pass 11 | 12 | # def tearDown(self): 13 | # pass 14 | 15 | # def test_args(self): 16 | 17 | # argv = ["start", "-k", "/Users/dchun/dpeerlab-chunj.pem", "-t", "t2.micro"] 18 | 19 | # self.assertRaises(ValueError, lambda: main.main(argv)) 20 | 21 | # class MyUnitTest(unittest.TestCase): 22 | # def setUp(self): 23 | # pass 24 | 25 | # def tearDown(self): 26 | # pass 27 | 28 | # def test_args(self): 29 | 30 | # # argv = [ 31 | # # "run", "ten_x_v2", "--local", 32 | # # "--index", "s3://seqc-public/genomes/hg38_chr19/", 33 | # # "--barcode-files", "s3://seqc-public/barcodes/ten_x_v2/flat/", 34 | # # "--genomic-fastq", "./test-data/genomic/", 35 | # # "--barcode-fastq", "./test-data/barcode/", 36 | # # "--output-prefix", "./test-data/seqc-results/", 37 | # # "--email", "jaeyoung.chun@gmail.com", 38 | # # "--star-args", "\"runRNGseed=0\"" 39 | # # ] 40 | 41 | # argv = [ 42 | # "run" 43 | # ] 44 | 45 | # try: 46 | # main.main(argv) 47 | # # self.assertRaises(BaseException, lambda: main.main(argv)) 48 | # except: 49 | # pass 50 | # # self.assertRaises(ValueError, lambda: main.main(argv)) 51 | 52 | 53 | # class TestSEQC(unittest.TestCase): 54 | # def setUp(self): 55 | # pass 56 | 57 | # def tearDown(self): 58 | # pass 59 | 60 | # def test_args(self): 61 | 62 | # from seqc.sequence import gtf 63 | 64 | # # remove any invalid ids from the annotation file 65 | # gr = gtf.Reader("./test-data/homo_sapiens.gtf.gz") 66 | 67 | # for line_fields in gr: 68 | # record = gtf.Record(line_fields) 69 | # print(record) 70 | # biotype = record.attribute("gene_biotype") 71 | # print(biotype) 72 | 73 | # # self.assertRaises(ValueError, lambda: main.main(argv)) 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /src/seqc/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | TestDataset = namedtuple( 4 | "datasets", 5 | ["barcode_fastq", "genomic_fastq", "merged_fastq", "bam", "index", "barcodes",], 6 | ) 7 | 8 | dataset_s3 = TestDataset( 9 | barcode_fastq="s3://seqc-public/test/%s/barcode/", 
# platform 10 | genomic_fastq="s3://seqc-public/test/%s/genomic/", # platform 11 | merged_fastq="s3://seqc-public/test/%s/%s_merged.fastq.gz", # platform, platform 12 | bam="s3://seqc-public/test/%s/Aligned.out.bam", # platform 13 | index="s3://seqc-public/genomes/hg38_chr19/", 14 | barcodes="s3://seqc-public/barcodes/%s/flat/", # platform 15 | ) 16 | 17 | dataset_local = TestDataset( 18 | barcode_fastq="test-data/datasets/%s/barcode/", # platform 19 | genomic_fastq="test-data/datasets/%s/genomic/", # platform 20 | merged_fastq=None, 21 | bam="test-data/datasets/%s/Aligned.out.bam", # platform 22 | index="test-data/datasets/genomes/hg38_chr19/", 23 | barcodes="test-data/datasets/barcodes/%s/flat/", # platform 24 | ) 25 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_e2e_local.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import uuid 4 | import shutil 5 | import subprocess 6 | import re 7 | from nose2.tools import params 8 | from seqc.core import main 9 | from test_dataset import dataset_local, dataset_s3 10 | 11 | 12 | def get_output_file_list(test_id, test_folder): 13 | 14 | proc = subprocess.Popen( 15 | ["find", test_folder, "-type", "f"], 16 | stdout=subprocess.PIPE, 17 | stderr=subprocess.PIPE, 18 | ) 19 | stdout, _ = proc.communicate() 20 | files = stdout.decode().splitlines() 21 | 22 | # extract only filenames (i.e. remove directory hierarchy) 23 | # convert to a set for easy comparison 24 | files = set(map(lambda filename: filename.replace(test_folder + "/", ""), files)) 25 | 26 | return files 27 | 28 | 29 | def expected_output_files(file_prefix): 30 | 31 | files = set( 32 | [ 33 | f"{file_prefix}.h5", 34 | f"{file_prefix}_alignment_summary.txt", 35 | f"{file_prefix}_cell_filters.png", 36 | f"{file_prefix}_de_gene_list.txt", 37 | f"{file_prefix}_dense.csv", 38 | f"{file_prefix}_merged.fastq.gz", 39 | f"{file_prefix}_mini_summary.json", 40 | f"{file_prefix}_mini_summary.pdf", 41 | f"{file_prefix}_seqc_log.txt", 42 | f"{file_prefix}_sparse_counts_barcodes.csv", 43 | f"{file_prefix}_sparse_counts_genes.csv", 44 | f"{file_prefix}_sparse_molecule_counts.mtx", 45 | f"{file_prefix}_sparse_read_counts.mtx", 46 | f"{file_prefix}_summary.tar.gz", 47 | f"{file_prefix}_Aligned.out.bam", 48 | ] 49 | ) 50 | 51 | return files 52 | 53 | 54 | class TestRunLocal(unittest.TestCase): 55 | @classmethod 56 | def setUp(cls): 57 | cls.test_id = str(uuid.uuid4()) 58 | cls.path_temp = os.path.join( 59 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 60 | ) 61 | os.makedirs(cls.path_temp, exist_ok=True) 62 | with open("seqc_log.txt", "wt") as f: 63 | f.write("Dummy log.\n") 64 | f.write("nose2 captures input, so no log is produced.\n") 65 | f.write("This causes pipeline errors.\n") 66 | 67 | @classmethod 68 | def tearDown(self): 69 | if os.path.isdir(self.path_temp): 70 | shutil.rmtree(self.path_temp, ignore_errors=True) 71 | 72 | def test_using_dataset_in_s3(self, platform="ten_x_v2"): 73 | # must NOT end with a slash 74 | file_prefix = "test" 75 | output_prefix = os.path.join(self.path_temp, file_prefix) 76 | 77 | params = [ 78 | ("run", platform), 79 | ("--local",), 80 | ("--output-prefix", output_prefix), 81 | ("--index", dataset_s3.index), 82 | ("--barcode-files", dataset_s3.barcodes % platform), 83 | ("--barcode-fastq", dataset_s3.barcode_fastq % platform), 84 | ("--genomic-fastq", dataset_s3.genomic_fastq % platform), 85 | ("--star-args", "runRNGseed=0"), 86 | ] 
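        # Illustrative note, not part of the original test: each params entry is a
        # (flag, value) tuple, and the "%s" templates from test_dataset.py are filled
        # with the platform name first, e.g.
        #     dataset_s3.barcodes % "ten_x_v2"  ->  "s3://seqc-public/barcodes/ten_x_v2/flat/"
        # The list of tuples is then flattened into a single CLI-style argv below, e.g.
        #     [("run", "ten_x_v2"), ("--local",)]  ->  ["run", "ten_x_v2", "--local"]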
87 | 88 | argv = [element for tupl in params for element in tupl] 89 | 90 | if platform != "drop_seq": 91 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 92 | 93 | main.main(argv) 94 | 95 | # get output file list 96 | files = get_output_file_list(self.test_id, self.path_temp) 97 | 98 | # check if each expected file is found in the list of files generated 99 | for file in expected_output_files(file_prefix): 100 | self.assertIn(file, files) 101 | 102 | def test_using_local_dataset(self, platform="ten_x_v2"): 103 | # must NOT end with a slash 104 | file_prefix = "test" 105 | output_prefix = os.path.join(self.path_temp, file_prefix) 106 | 107 | params = [ 108 | ("run", platform), 109 | ("--local",), 110 | ("--output-prefix", output_prefix), 111 | ("--index", dataset_local.index), 112 | ("--barcode-files", dataset_local.barcodes % platform), 113 | ("--barcode-fastq", dataset_local.barcode_fastq % platform), 114 | ("--genomic-fastq", dataset_local.genomic_fastq % platform), 115 | ("--star-args", "runRNGseed=0"), 116 | ] 117 | 118 | argv = [element for tupl in params for element in tupl] 119 | 120 | if platform != "drop_seq": 121 | argv += ["--barcode-files", dataset_local.barcodes % platform] 122 | 123 | main.main(argv) 124 | 125 | # get output file list 126 | files = get_output_file_list(self.test_id, self.path_temp) 127 | 128 | # check if each expected file is found in the list of files generated 129 | for file in expected_output_files(file_prefix): 130 | self.assertIn(file, files) 131 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_e2e_remote.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import uuid 4 | import shutil 5 | import re 6 | from seqc.core import main 7 | from seqc import io 8 | import boto3 9 | from nose2.tools import params 10 | from test_dataset import dataset_s3 11 | 12 | 13 | def get_instance_by_test_id(test_id): 14 | 15 | ec2 = boto3.resource("ec2") 16 | instances = ec2.instances.filter( 17 | Filters=[{"Name": "tag:TestID", "Values": [test_id]}] 18 | ) 19 | instances = list(instances) 20 | 21 | if len(instances) != 1: 22 | raise Exception("Test ID is not found or not unique!") 23 | 24 | return instances[0] 25 | 26 | 27 | def expected_output_files(output_prefix): 28 | 29 | files = set( 30 | [ 31 | f"{output_prefix}.h5", 32 | f"{output_prefix}_Aligned.out.bam", 33 | f"{output_prefix}_alignment_summary.txt", 34 | f"{output_prefix}_cell_filters.png", 35 | f"{output_prefix}_de_gene_list.txt", 36 | f"{output_prefix}_dense.csv", 37 | f"{output_prefix}_merged.fastq.gz", 38 | f"{output_prefix}_mini_summary.json", 39 | f"{output_prefix}_mini_summary.pdf", 40 | f"{output_prefix}_seqc_log.txt", 41 | f"{output_prefix}_sparse_counts_barcodes.csv", 42 | f"{output_prefix}_sparse_counts_genes.csv", 43 | f"{output_prefix}_sparse_molecule_counts.mtx", 44 | f"{output_prefix}_sparse_read_counts.mtx", 45 | f"{output_prefix}_summary.tar.gz", 46 | f"seqc_log.txt", 47 | ] 48 | ) 49 | 50 | return files 51 | 52 | 53 | def expected_output_files_run_from_merged(output_prefix): 54 | 55 | files = expected_output_files(output_prefix) 56 | 57 | excludes = set([f"{output_prefix}_merged.fastq.gz"]) 58 | 59 | return files - excludes 60 | 61 | 62 | def expected_output_files_run_from_bam(output_prefix): 63 | 64 | files = expected_output_files(output_prefix) 65 | 66 | excludes = set( 67 | [ 68 | f"{output_prefix}_Aligned.out.bam", 69 | 
f"{output_prefix}_alignment_summary.txt", 70 | f"{output_prefix}_merged.fastq.gz", 71 | ] 72 | ) 73 | 74 | return files - excludes 75 | 76 | 77 | def get_output_file_list(test_id, s3_bucket, test_folder): 78 | 79 | # get instance and wait until terminated 80 | instance = get_instance_by_test_id(test_id) 81 | instance.wait_until_terminated() 82 | 83 | # check files generated in S3 84 | files = io.S3.listdir(s3_bucket, test_folder) 85 | 86 | # extract only filenames (i.e. remove directory hierarchy) 87 | # convert to a set for easy comparison 88 | files = set(map(lambda filename: filename.replace(test_folder, ""), files)) 89 | 90 | return files 91 | 92 | 93 | def check_for_success_msg(s3_seqc_log_uri, path_temp): 94 | 95 | # download seqc_log.txt 96 | io.S3.download( 97 | link=s3_seqc_log_uri, prefix=path_temp, overwrite=True, recursive=False 98 | ) 99 | 100 | # check if seqc_log.txt has a successful message 101 | with open(os.path.join(path_temp, "seqc_log.txt"), "rt") as fin: 102 | logs = fin.read() 103 | match = re.search(r"Execution completed successfully", logs, re.MULTILINE) 104 | 105 | return True if match else False 106 | 107 | 108 | class TestRunRemote(unittest.TestCase): 109 | 110 | email = os.environ["SEQC_TEST_EMAIL"] 111 | rsa_key = os.environ["SEQC_TEST_RSA_KEY"] 112 | ami_id = os.environ["SEQC_TEST_AMI_ID"] 113 | 114 | s3_bucket = "dp-lab-cicd" 115 | 116 | @classmethod 117 | def setUp(cls): 118 | cls.test_id = str(uuid.uuid4()) 119 | cls.path_temp = os.path.join( 120 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 121 | ) 122 | os.makedirs(cls.path_temp, exist_ok=True) 123 | 124 | @classmethod 125 | def tearDown(self): 126 | if os.path.isdir(self.path_temp): 127 | shutil.rmtree(self.path_temp, ignore_errors=True) 128 | 129 | @params("in_drop_v2", "ten_x_v2") 130 | def test_remote_from_raw_fastq(self, platform="ten_x_v2"): 131 | output_prefix = "from-raw-fastq" 132 | # must end with a slash 133 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 134 | 135 | params = [ 136 | ("run", platform), 137 | ("--output-prefix", "from-raw-fastq"), 138 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 139 | ("--index", dataset_s3.index), 140 | ("--email", self.email), 141 | ("--barcode-fastq", dataset_s3.barcode_fastq % platform), 142 | ("--genomic-fastq", dataset_s3.genomic_fastq % platform), 143 | ("--instance-type", "r5.2xlarge"), 144 | ("--spot-bid", "1.0"), 145 | ("--rsa-key", self.rsa_key), 146 | ("--debug",), 147 | ("--remote-update",), 148 | ("--ami-id", self.ami_id), 149 | ("--user-tags", f"TestID:{self.test_id}"), 150 | ] 151 | 152 | argv = [element for tupl in params for element in tupl] 153 | 154 | if platform != "drop_seq": 155 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 156 | 157 | main.main(argv) 158 | 159 | # wait until terminated 160 | # get output file list 161 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 162 | 163 | # check for the exact same filenames 164 | self.assertSetEqual(files, expected_output_files(output_prefix)) 165 | 166 | # check for success message in seqc_log.txt 167 | has_success_msg = check_for_success_msg( 168 | s3_seqc_log_uri="s3://{}/{}".format( 169 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 170 | ), 171 | path_temp=self.path_temp, 172 | ) 173 | 174 | self.assertTrue( 175 | has_success_msg, msg="Unable to find the success message in the log" 176 | ) 177 | 178 | def test_remote_from_merged(self, platform="in_drop_v2"): 179 | output_prefix = "from-merged" 180 | # must 
end with a slash 181 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 182 | 183 | params = [ 184 | ("run", platform), 185 | ("--output-prefix", output_prefix), 186 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 187 | ("--index", dataset_s3.index), 188 | ("--email", self.email), 189 | ("--merged-fastq", dataset_s3.merged_fastq % (platform, platform)), 190 | ("--rsa-key", self.rsa_key), 191 | ("--instance-type", "r5.2xlarge"), 192 | ("--ami-id", self.ami_id), 193 | ("--remote-update",), 194 | ("--user-tags", f"TestID:{self.test_id}") 195 | # ('--spot-bid', '1.0') 196 | ] 197 | 198 | argv = [element for tupl in params for element in tupl] 199 | 200 | if platform != "drop_seq": 201 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 202 | 203 | main.main(argv) 204 | 205 | # wait until terminated 206 | # get output file list 207 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 208 | 209 | # check for the exact same filenames 210 | self.assertSetEqual(files, expected_output_files_run_from_merged(output_prefix)) 211 | 212 | # check for success message in seqc_log.txt 213 | has_success_msg = check_for_success_msg( 214 | s3_seqc_log_uri="s3://{}/{}".format( 215 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 216 | ), 217 | path_temp=self.path_temp, 218 | ) 219 | 220 | self.assertTrue( 221 | has_success_msg, msg="Unable to find the success message in the log" 222 | ) 223 | 224 | def test_remote_from_bamfile(self, platform="in_drop_v2"): 225 | output_prefix = "from-bamfile" 226 | # must end with a slash 227 | test_folder = f"seqc/run-{platform}-{self.test_id}/" 228 | 229 | params = [ 230 | ("run", platform), 231 | ("--output-prefix", output_prefix), 232 | ("--upload-prefix", f"s3://{self.s3_bucket}/{test_folder}"), 233 | ("--index", dataset_s3.index), 234 | ("--email", self.email), 235 | ("--alignment-file", dataset_s3.bam % platform), 236 | ("--rsa-key", self.rsa_key), 237 | ("--instance-type", "r5.2xlarge"), 238 | ("--debug",), 239 | ("--ami-id", self.ami_id), 240 | ("--remote-update",), 241 | ("--user-tags", f"TestID:{self.test_id}") 242 | # ('--spot-bid', '1.0') 243 | ] 244 | 245 | argv = [element for tupl in params for element in tupl] 246 | 247 | if platform != "drop_seq": 248 | argv += ["--barcode-files", dataset_s3.barcodes % platform] 249 | 250 | main.main(argv) 251 | 252 | # wait until terminated 253 | # get output file list 254 | files = get_output_file_list(self.test_id, self.s3_bucket, test_folder) 255 | 256 | # check for the exact same filenames 257 | self.assertSetEqual(files, expected_output_files_run_from_bam(output_prefix)) 258 | 259 | # check for success message in seqc_log.txt 260 | has_success_msg = check_for_success_msg( 261 | s3_seqc_log_uri="s3://{}/{}".format( 262 | self.s3_bucket, os.path.join(test_folder, "seqc_log.txt") 263 | ), 264 | path_temp=self.path_temp, 265 | ) 266 | 267 | self.assertTrue( 268 | has_success_msg, msg="Unable to find the success message in the log" 269 | ) 270 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_gtf.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import os 3 | import uuid 4 | import shutil 5 | import nose2 6 | from seqc.sequence import gtf 7 | from test_dataset import dataset_local 8 | 9 | 10 | class TestGtf(TestCase): 11 | @classmethod 12 | def setUp(cls): 13 | cls.test_id = str(uuid.uuid4()) 14 | cls.path_temp = os.path.join( 15 | 
os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 16 | ) 17 | cls.annotation = os.path.join(dataset_local.index, "annotations.gtf") 18 | 19 | @classmethod 20 | def tearDown(self): 21 | if os.path.isdir(self.path_temp): 22 | shutil.rmtree(self.path_temp, ignore_errors=True) 23 | 24 | def test_construct_translator(self): 25 | translator = gtf.GeneIntervals(self.annotation) 26 | self.assertIsNotNone(translator) 27 | 28 | def test_num_of_transcripts(self): 29 | rd = gtf.Reader(self.annotation) 30 | num_transcripts = sum(1 for _ in rd.iter_transcripts()) 31 | # awk -F'\t' '$3=="transcript" { print $0 }' annotations.gtf | wc -l 32 | self.assertEqual(num_transcripts, 12747) 33 | 34 | def test_iter_transcripts(self): 35 | rd = gtf.Reader(self.annotation) 36 | (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next( 37 | rd.iter_transcripts() 38 | ) 39 | 40 | # this should give us 3 exons of the first transcript of the first gene found in inverse order: 41 | # 42 | # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; 43 | # chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 44 | # chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 45 | # chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 46 | # chr19 HAVANA exon 60951 61894 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; 47 | 48 | self.assertEqual(transcript_chromosome, "chr19") 49 | self.assertEqual(transcript_strand, "-") 50 | self.assertEqual(transcript_gene_id, 282458) 51 | self.assertEqual(len(exons), 3) 52 | 53 | # 8th column has exon ID 54 | self.assertIn("ENSE00003783010.1", exons[0][8]) # exon number 3 55 | self.assertIn("ENSE00003783498.1", exons[1][8]) # exon number 2 56 | self.assertIn("ENSE00003781173.1", exons[2][8]) # exon number 1 57 | 58 | def test_translate(self): 59 | translator = gtf.GeneIntervals(self.annotation) 60 | # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; 61 | gene_id = translator.translate("chr19", "-", 60951) 62 | self.assertEqual(gene_id, 282458) 63 | 64 | 65 | if __name__ == "__main__": 66 | nose2.main() 67 | -------------------------------------------------------------------------------- /src/seqc/tests/test_run_readarray.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import os 3 | import uuid 4 | import shutil 5 | import nose2 6 | from test_dataset import dataset_local 7 | from seqc.sequence.encodings import DNA3Bit 8 | from seqc.read_array import ReadArray 9 | from seqc.sequence import gtf 10 | 11 | 12 | class TestReadArray(TestCase): 13 | @classmethod 14 | def setUp(cls): 15 | cls.test_id = str(uuid.uuid4()) 16 | cls.path_temp = os.path.join( 17 | os.environ["TMPDIR"], "seqc-test", str(uuid.uuid4()) 18 | ) 19 | cls.annotation = os.path.join(dataset_local.index, "annotations.gtf") 20 | cls.translator = gtf.GeneIntervals(cls.annotation, 10000) 21 | 22 | @classmethod 23 | def tearDown(self): 24 | if os.path.isdir(self.path_temp): 25 | shutil.rmtree(self.path_temp, ignore_errors=True) 26 | 27 | def test_read_array_creation(self, platform="ten_x_v2"): 28 | ra, _ = ReadArray.from_alignment_file( 29 | dataset_local.bam % platform, self.translator, required_poly_t=0 30 | ) 31 | self.assertIsNotNone(ra) 32 | 33 | def test_read_array_rmt_decode_10x_v2(self): 34 | platform = "ten_x_v2" 35 | 36 | # create a readarray 37 | ra, _ = ReadArray.from_alignment_file( 38 | dataset_local.bam % platform, self.translator, required_poly_t=0 39 | ) 40 | 41 | # see if we can decode numeric UMI back to nucleotide sequence 42 | dna3bit = DNA3Bit() 43 | for rmt in ra.data["rmt"]: 44 | decoded = dna3bit.decode(rmt).decode() 45 | # ten_x_v2 UMI length = 10 nt 46 | self.assertEqual(len(decoded), 10) 47 | 48 | def test_read_array_rmt_decode_10x_v3(self): 49 | platform = "ten_x_v3" 50 | 51 | # create a readarray 52 | ra, _ = ReadArray.from_alignment_file( 53 | dataset_local.bam % platform, self.translator, required_poly_t=0 54 | ) 55 | 56 | # see if we can decode numeric UMI back to nucleotide sequence 57 | dna3bit = DNA3Bit() 58 | for rmt in ra.data["rmt"]: 59 | decoded = dna3bit.decode(rmt).decode() 60 | # ten_x_v3 UMI length = 12 nt 61 | self.assertEqual(len(decoded), 12) 62 | 63 | 64 | if __name__ == "__main__": 65 | nose2.main() 66 | 
-------------------------------------------------------------------------------- /src/seqc/tests/test_run_rmt_correction.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, mock 2 | import nose2 3 | import os 4 | import numpy as np 5 | from seqc.read_array import ReadArray 6 | from seqc import rmt_correction 7 | 8 | 9 | class TestRmtCorrection(TestCase): 10 | @classmethod 11 | def setUp(self): 12 | # pre-allocate arrays 13 | n_barcodes = 183416337 14 | data = np.recarray((n_barcodes,), ReadArray._dtype) 15 | genes = np.zeros(n_barcodes, dtype=np.int32) 16 | positions = np.zeros(n_barcodes, dtype=np.int32) 17 | self.ra = ReadArray(data, genes, positions) 18 | 19 | @classmethod 20 | def tearDown(self): 21 | pass 22 | 23 | def test_should_return_correct_ra_size(self): 24 | 25 | ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes 26 | 27 | self.assertEqual(4768824762, ra_size) 28 | 29 | # mock ~50 GiB of available memory 30 | @mock.patch( 31 | "seqc.rmt_correction._get_available_memory", return_value=50 * 1024 ** 3 32 | ) 33 | def test_should_return_correct_max_workers(self, mock_mem): 34 | 35 | n_workers = rmt_correction._calc_max_workers(self.ra) 36 | 37 | self.assertEqual(n_workers, 5) 38 | 39 | # mock ~1 TB of available memory 40 | @mock.patch("seqc.rmt_correction._get_available_memory", return_value=1079354630144) 41 | def test_should_return_correct_max_workers2(self, mock_mem): 42 | 43 | n_workers = rmt_correction._calc_max_workers(self.ra) 44 | 45 | self.assertEqual(n_workers, 119) 46 | 47 | # less available memory than the ReadArray itself 48 | @mock.patch("seqc.rmt_correction._get_available_memory") 49 | def test_should_return_one_if_ra_larger_than_mem(self, mock_mem): 50 | 51 | ra_size = self.ra.data.nbytes + self.ra.genes.nbytes + self.ra.positions.nbytes 52 | 53 | # assume the available memory is half the ReadArray size 54 | mock_mem.return_value = int(ra_size) / 2 55 | 56 | n_workers = rmt_correction._calc_max_workers(self.ra) 57 | 58 | self.assertEqual(n_workers, 1) 59 | 60 | 61 | class TestRmtCorrection2(TestCase): 62 | @classmethod 63 | def setUp(self): 64 | # pre-allocate arrays 65 | n_barcodes = 183416337 66 | data = np.recarray((n_barcodes,), ReadArray._dtype) 67 | genes = np.zeros(n_barcodes, dtype=np.int32) 68 | positions = np.zeros(n_barcodes, dtype=np.int32) 69 | self.ra = ReadArray(data, genes, positions) 70 | 71 | import pickle 72 | 73 | with open("pre-correction-ra.pickle", "wb") as fout: 74 | pickle.dump(self.ra, fout) 75 | 76 | @classmethod 77 | def tearDown(self): 78 | import os 79 | 80 | try: 81 | os.remove("pre-correction-ra.pickle") 82 | except FileNotFoundError: 83 | pass 84 | 85 | @mock.patch("seqc.rmt_correction._correct_errors_by_cell_group", return_value=0) 86 | def test_correct_errors_by_chunks(self, mock_correct): 87 | cell_group = [1, 2, 3] 88 | x = rmt_correction._correct_errors_by_cell_group_chunks( 89 | self.ra, cell_group, 0.02, 0.05 90 | ) 91 | mock_correct.assert_called() 92 | self.assertEqual(len(cell_group), mock_correct.call_count) 93 | self.assertEqual([0, 0, 0], x) 94 | 95 | 96 | if __name__ == "__main__": 97 | nose2.main() 98 | -------------------------------------------------------------------------------- /src/seqc/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.11" 2 | --------------------------------------------------------------------------------
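A quick sanity check on the 4768824762-byte figure asserted in TestRmtCorrection.test_should_return_correct_ra_size: the two int32 side arrays (genes, positions) contribute 8 bytes per record, so the assertion implies an 18-byte structured record for ReadArray._dtype (an inference from the numbers in the test, not a documented value). A minimal sketch under that assumption:

    # sketch only: reproduce the expected ReadArray memory footprint from the test
    n_barcodes = 183416337
    record_bytes = 18            # implied by the assertion: 26 bytes total minus 2 * 4
    int32_bytes = 4              # genes and positions arrays
    total = n_barcodes * (record_bytes + 2 * int32_bytes)
    assert total == 4768824762   # roughly 4.4 GiB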