├── _config.yml
├── testing
│   ├── matplotlibrc
│   └── environment_no_python.yml
├── setup.cfg
├── anchor
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── test_bayesian.py
│   │   ├── test_infotheory.py
│   │   └── test_model.py
│   ├── names.py
│   ├── __init__.py
│   ├── util.py
│   ├── binning.py
│   ├── model.py
│   ├── simulate.py
│   ├── infotheory.py
│   ├── visualize.py
│   └── bayesian.py
├── logo
│   └── v1
│       ├── logo.ai
│       ├── logo.pdf
│       ├── logo.png
│       ├── logo-24.png
│       ├── logo-32.png
│       ├── logo-64.png
│       ├── logo-128.png
│       ├── logo-256.png
│       └── logo.svg
├── requirements.txt
├── conda_requirements.txt
├── AUTHORS.md
├── MANIFEST.in
├── .editorconfig
├── .gitignore
├── environment.yml
├── LICENSE
├── setup.py
├── .travis.yml
├── Makefile
├── README.md
└── CONTRIBUTING.md

--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
--------------------------------------------------------------------------------
/testing/matplotlibrc:
--------------------------------------------------------------------------------
1 | backend : Agg
2 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [wheel]
2 | universal = 1
3 | 
--------------------------------------------------------------------------------
/anchor/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
--------------------------------------------------------------------------------
/logo/v1/logo.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo.ai
--------------------------------------------------------------------------------
/logo/v1/logo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo.pdf
--------------------------------------------------------------------------------
/logo/v1/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo.png
--------------------------------------------------------------------------------
/logo/v1/logo-24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo-24.png
--------------------------------------------------------------------------------
/logo/v1/logo-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo-32.png
--------------------------------------------------------------------------------
/logo/v1/logo-64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo-64.png
--------------------------------------------------------------------------------
/logo/v1/logo-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo-128.png
--------------------------------------------------------------------------------
/logo/v1/logo-256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo-256.png
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | flake8 3 | pandas 4 | matplotlib 5 | seaborn 6 | pytest-cov 7 | scikit-learn 8 | -------------------------------------------------------------------------------- /conda_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | flake8 3 | pandas 4 | matplotlib 5 | seaborn>=0.6 6 | pytest-cov 7 | scikit-learn 8 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Credits 2 | ======= 3 | 4 | Development Lead 5 | ---------------- 6 | 7 | - Olga Botvinnik <> 8 | 9 | Contributors 10 | ------------ 11 | 12 | None yet. Why not be the first? 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.md 2 | include CONTRIBUTING.md 3 | include LICENSE 4 | include README.md 5 | include requirements.txt 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /anchor/names.py: -------------------------------------------------------------------------------- 1 | """ 2 | Names of the modalities 3 | """ 4 | 5 | # Set constants of the names of the models so they can always be referenced 6 | # as variables rather than strings 7 | 8 | # Most of the density is at 0 9 | NEAR_ZERO = 'excluded' 10 | 11 | # Old "middle" modality - most of the density is at 0.5 12 | NEAR_HALF = 'middle' 13 | 14 | # Most of the density is at 1 15 | NEAR_ONE = 'included' 16 | 17 | # The density is split between 0 and 1 18 | BIMODAL = 'bimodal' 19 | 20 | # Cannot decide on one of the above models (the null model fits better) so use 21 | # this model instead 22 | NULL_MODEL = 'uncategorized' 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build 44 | 45 | # PyCharm 46 | .idea 47 | 48 | 
49 | # Miniconda 50 | miniconda.sh 51 | 52 | # Emacs 53 | *~ -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: anchor-env 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - pycodestyle 9 | - python-dateutil 10 | - coverage 11 | - cycler 12 | - flake8 13 | - freetype 14 | - icu 15 | - libpng 16 | - matplotlib 17 | - mccabe 18 | - mkl 19 | - numpy 20 | - openssl 21 | - pandas 22 | - pip 23 | - py 24 | - pyflakes 25 | - pyparsing 26 | - pyqt 27 | - pytest 28 | - python 29 | - pytz 30 | - qt 31 | - readline 32 | - scikit-learn 33 | - scipy 34 | - seaborn 35 | - setuptools 36 | - sip 37 | - six 38 | - sqlite 39 | - tk 40 | - wheel 41 | - xz 42 | - zlib 43 | - pip: 44 | - anchor-bio 45 | - python-coveralls 46 | - pyyaml 47 | - requests 48 | 49 | -------------------------------------------------------------------------------- /testing/environment_no_python.yml: -------------------------------------------------------------------------------- 1 | name: anchor-env 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - pycodestyle 9 | - python-dateutil 10 | - coverage 11 | - cycler 12 | - flake8 13 | - freetype 14 | - icu 15 | - libpng 16 | - matplotlib 17 | - mccabe 18 | - mkl 19 | - numpy 20 | - openssl 21 | - pandas 22 | - pip 23 | - py 24 | - pyflakes 25 | - pyparsing 26 | - pyqt 27 | - pytest 28 | - pytz 29 | - qt 30 | - readline 31 | - scikit-learn 32 | - scipy 33 | - seaborn 34 | - setuptools 35 | - sip 36 | - six 37 | - sqlite 38 | - tk 39 | - wheel 40 | - xz 41 | - zlib 42 | - pip: 43 | - anchor-bio 44 | - python-coveralls 45 | - pyyaml 46 | - requests 47 | 48 | -------------------------------------------------------------------------------- /anchor/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .bayesian import BayesianModalities 3 | from .binning import BinnedModalities 4 | from .model import ModalityModel 5 | from .names import NEAR_ZERO, NEAR_HALF, NEAR_ONE, BIMODAL, \ 6 | NULL_MODEL 7 | from .simulate import add_noise 8 | from .visualize import MODALITY_TO_COLOR, MODALITY_ORDER, MODALITY_PALETTE,\ 9 | MODALITY_TO_CMAP, ModalitiesViz, violinplot, barplot 10 | 11 | __author__ = 'Olga Botvinnik' 12 | __email__ = 'olga.botvinnik@gmail.com' 13 | __version__ = '1.1.1' 14 | 15 | 16 | __all__ = ['ModalityModel', 'BayesianModalities', 'MODALITY_ORDER', 17 | 'MODALITY_PALETTE', 'MODALITY_TO_COLOR', 'ModalitiesViz', 18 | 'violinplot', 'MODALITY_TO_CMAP', 'BinnedModalities', 19 | 'add_noise', 'BIMODAL', 'NEAR_HALF', 'NEAR_ONE', 'NEAR_ZERO', 20 | 'barplot', 'NULL_MODEL'] 21 | -------------------------------------------------------------------------------- /anchor/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture(params=['no_na', 'with_na']) 7 | def event(request): 8 | x = np.arange(0, 1.1, .1) 9 | if request.param == 'no_na': 10 | return x 11 | elif request.param == 'with_na': 12 | x[x < 0.5] = np.nan 13 | return x 14 | 15 | 16 | @pytest.fixture 17 | def positive_control(): 18 | """Exact, known positive controls for modality estimation""" 19 | size = 20 20 | half = int(size/2) 21 | psi0 = pd.Series(np.zeros(size), name='excluded') 22 | psi1 = 
pd.Series(np.ones(size), name='included')
23 |     middle = pd.Series(0.5 * np.ones(size), name='middle')
24 |     bimodal = pd.Series(np.concatenate([np.ones(half),
25 |                                         np.zeros(half)]),
26 |                         name='bimodal')
27 |     uncategorized = pd.Series(np.linspace(0, 1, size),
28 |                               name='uncategorized')
29 |     df = pd.concat([psi0, psi1, middle, bimodal, uncategorized], axis=1)
30 |     return df
31 | 
--------------------------------------------------------------------------------
/anchor/util.py:
--------------------------------------------------------------------------------
1 | from .visualize import MODALITY_ORDER
2 | 
3 | n_events = 'Number of alternative events'
4 | 
5 | 
6 | def tidify_modalities(modality_assignments, name='event_id'):
7 |     modalities_tidy = modality_assignments.stack().reset_index()
8 |     modalities_tidy = modalities_tidy.rename(
9 |         columns={'level_1': name, 0: "modality"})
10 |     return modalities_tidy
11 | 
12 | 
13 | def count_modalities(tidy_modalities, name='event_id', group_name='phenotype'):
14 |     modalities_counts = tidy_modalities.groupby(
15 |         [group_name, 'modality']).count().reset_index()
16 |     modalities_counts = modalities_counts.rename(
17 |         columns={name: n_events})
18 |     n_events_grouped = modalities_counts.groupby(group_name)[n_events]
19 |     modalities_counts['percentage'] = 100*n_events_grouped.apply(
20 |         lambda x: x/x.sum())
21 |     return modalities_counts
22 | 
23 | 
24 | def twodee_counts(modality_counts, group_name='phenotype', group_order=None):
25 |     modalities_counts_2d = modality_counts.pivot(
26 |         index=group_name, columns='modality', values=n_events)
27 |     modalities_counts_2d = modalities_counts_2d.reindex(
28 |         columns=MODALITY_ORDER, index=group_order)
29 |     modalities_counts_2d = modalities_counts_2d.T
30 |     return modalities_counts_2d
31 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Olga Botvinnik
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | 
6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | 
8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 | 
10 | * Neither the name of anchor: Modality Estimator nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | 
5 | try:
6 |     from setuptools import setup
7 | except ImportError:
8 |     from distutils.core import setup
9 | 
10 | 
11 | with open('README.md') as readme_file:
12 |     readme = readme_file.read()
13 | 
14 | # with open('HISTORY.rst') as history_file:
15 | #     history = history_file.read().replace('.. :changelog:', '')
16 | 
17 | with open('requirements.txt') as requirements_file:
18 |     requirements = requirements_file.read()
19 | 
20 | test_requirements = [
21 |     'pytest'
22 | ]
23 | 
24 | setup(
25 |     name='anchor-bio',
26 |     version='1.1.1',
27 |     description="Anchor is a Python package to estimate modality of splicing, percent methylation, or any other data that is normalized between 0 and 1",
28 |     long_description=readme,
29 |     author="Olga Botvinnik",
30 |     author_email='olga.botvinnik@gmail.com',
31 |     url='https://github.com/YeoLab/anchor',
32 |     packages=[
33 |         'anchor',
34 |     ],
35 |     package_dir={'anchor-bio':
36 |                  'anchor'},
37 |     include_package_data=True,
38 |     install_requires=requirements,
39 |     license="BSD",
40 |     zip_safe=False,
41 |     keywords='anchor',
42 |     classifiers=[
43 |         'Intended Audience :: Developers',
44 |         'License :: OSI Approved :: BSD License',
45 |         'Natural Language :: English',
46 |         "Programming Language :: Python :: 2",
47 |         'Programming Language :: Python :: 2.7',
48 |         'Programming Language :: Python :: 3',
49 |         'Programming Language :: Python :: 3.4',
50 |         'Programming Language :: Python :: 3.5',
51 |         'Programming Language :: Python :: 3.6',
52 |     ],
53 |     test_suite='tests',
54 |     tests_require=test_requirements
55 | )
56 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - '2.7'
4 | - '3.4'
5 | - '3.5'
6 | - '3.6'
7 | before_install:
8 | # http://conda.pydata.org/docs/travis.html#the-travis-yml-file
9 | - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
10 | - bash miniconda.sh -b -p $HOME/miniconda
11 | - export PATH="$HOME/miniconda/bin:$PATH"
12 | - hash -r
13 | - conda config --set always_yes yes --set changeps1 no
14 | - conda update -q conda
15 | - conda info -a
16 | install:
17 | - conda create -n testenv --yes python=$TRAVIS_PYTHON_VERSION pip
18 | - source activate testenv
19 | - conda install --file conda_requirements.txt
20 | - pip install -r requirements.txt
21 | before_script:
22 | - git config --global user.email "olga.botvinnik@gmail.com"
23 | - git config --global user.name "YeoLab"
24 | - git config --global push.default simple
25 | - export REPO_URL_GITHUB="https://$GH_TOKEN@github.com/$GH_REPO.git"
26 | - export LANGUAGE=en_US.UTF-8
27 | - export LANG=en_US.UTF-8
28 | - export LC_ALL=en_US.UTF-8
29 | - locale-gen en_US.UTF-8
30 | - sudo dpkg-reconfigure locales
31 | script:
32 | - make coverage
33 | - make lint
34 | deploy:
35 |   provider: pypi
36 |   user: obot
37 |   password:
38 |     secure: 
KLIIx6HQkgj3wkhLrcX06Ry9jztDl/mz5SV5xo1ZMmqRkNbME3UxVIF1SwlCLhdZvycVh6MvLBLn/8Uq8xGGxtFqBYGf3y1+Coalmmvv4X/fGUby0F8PkmegULwh6h65tqt5KtGIh9M0eknUYnx/Ib/r3ontd1yCrvul14LGyAcKOR9oP6nmrLOwIqMPXeD5s5zlrGvEKnTulxowOgKOGavnyxaFfBqmdRlGokbhXxUGGMmW3PNRx2jlSFTPozZwzp2mfCkKg/7hslwIYln4ovkTNXiFIiOMm4WRH9/Nzae3nSxTcMw7SB53lrgcL3HoA2sBS/sjh623VdWbuzUX9FQw3We9lb2AfJglsZgkW4xQ1wfKS9WwrgTwHN1Q7490//J4xk39SnTvm8sdNQYIKviNJLhmqPTEFJoxRZnbtmuLMrc5qi14663RLYhL/veTqHGi3J3iNwlK1tZax15CayXUeasuRjHlUcBvOnfpK6YYSuNvlFUGmzFNkAYPaX/BJ/J9il39LNTByregWzJThuct+hl2nG2V0VBFOSlzJgIVphQchcl5JMTF8dboR3BuuvxTMbb7LyU1mwQBO85jaT7ikJqZpG9o+cG2uJ5KYE4gt+LWaMReHVN9xWaWA+HsJginyP5WrnjSnivIetyw0f/oR5C1sCaVmb8DY80Ap0Q=
39 |   on:
40 |     tags: true
41 |     repo: YeoLab/anchor
42 | after_success:
43 | - pip install codecov
44 | - codecov
45 | 
--------------------------------------------------------------------------------
/anchor/binning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | from .infotheory import binify, bin_range_strings, jsd
5 | from .visualize import MODALITY_ORDER
6 | 
7 | 
8 | class BinnedModalities(object):
9 | 
10 |     modalities = MODALITY_ORDER
11 |     score_name = 'Jensen-Shannon Divergence'
12 | 
13 |     def __init__(self, bins=(0, 1./3, 2./3, 1)):
14 |         if len(bins) != 4:
15 |             raise ValueError('Length of "bins" must be exactly 4 bin edges')
16 |         self.bins = bins
17 | 
18 |         self.bin_ranges = bin_range_strings(self.bins)
19 |         uniform_probabilities = [stop-start for start, stop in
20 |                                  zip(bins, bins[1:])]
21 | 
22 |         self.desired_distributions = pd.DataFrame(
23 |             np.array([[1, 0, 0], [0.5, 0, 0.5],
24 |                       [0, 0, 1], [0, 1, 0], uniform_probabilities]).T,
25 |             index=self.bin_ranges, columns=self.modalities)
26 | 
27 |     def fit(self, data):
28 |         binned = binify(data, bins=self.bins)
29 |         if isinstance(binned, pd.DataFrame):
30 |             fitted = binned.apply(lambda x: self.desired_distributions.apply(
31 |                 lambda y: jsd(x, y)))
32 |         else:
33 |             fitted = self.desired_distributions.apply(lambda x: jsd(x, binned))
34 |         fitted.name = self.score_name
35 |         return fitted
36 | 
37 |     def predict(self, fitted):
38 |         """Assign the most likely modality given the fitted data
39 | 
40 |         Parameters
41 |         ----------
42 |         fitted : pandas.DataFrame or pandas.Series
43 |             Either a (n_modalities, features) DataFrame or (n_modalities,)
44 |             Series, either of which will return the best modality for each
45 |             feature.
46 |         """
47 |         if fitted.shape[0] != len(self.modalities):
48 |             raise ValueError("This data doesn't look like it had the distance "
49 |                              "between it and the five modalities calculated")
50 |         return fitted.idxmin()
51 | 
52 |     def fit_predict(self, data):
53 |         return self.predict(self.fit(data))
54 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := /bin/bash
2 | 
3 | help:
4 | 	@echo "lint - check code style with flake8"
5 | 	@echo "test - run tests quickly"
6 | 	@echo "coverage - check code coverage quickly"
7 | 	@echo "install - Maybe get Miniconda, and install this package into an environment"
8 | 	@echo "          using conda. Requires 'PYTHON_VERSION' argument (see example below)"
9 | 	@echo "          Example:"
10 | 	@echo "          'make PYTHON_VERSION=2.7 install' will install anchor into an environment"
11 | 	@echo "          called 'anchor_py2.7', cowardly not overwriting any environment that"
12 | 	@echo "          existed there before."
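13 | 	@echo "clean-build - remove build artifacts (see the clean targets below)"
14 | 	@echo "clean-pyc - remove Python bytecode artifacts"
15 | 	@echo "clean-test - remove test, tox, and coverage artifacts"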
16 | 
17 | clean-build:
18 | 	rm -fr build/
19 | 	rm -fr dist/
20 | 	rm -fr .eggs/
21 | 	find . -name '*.egg-info' -exec rm -fr {} +
22 | 	find . -name '*.egg' -exec rm -f {} +
23 | 
24 | clean-pyc:
25 | 	find . -name '*.pyc' -exec rm -f {} +
26 | 	find . -name '*.pyo' -exec rm -f {} +
27 | 	find . -name '*~' -exec rm -f {} +
28 | 	find . -name '__pycache__' -exec rm -fr {} +
29 | 
30 | clean-test:
31 | 	rm -fr .tox/
32 | 	rm -f .coverage
33 | 	rm -fr htmlcov/
34 | 
35 | test:
36 | 	cp testing/matplotlibrc .
37 | 	py.test
38 | 	rm matplotlibrc
39 | 
40 | coverage: clean-pyc
41 | 	cp testing/matplotlibrc .
42 | 	py.test --cov=./
43 | 	rm matplotlibrc
44 | 
45 | lint:
46 | 	flake8 anchor
47 | 
48 | get_miniconda:
49 | 	# http://conda.pydata.org/docs/travis.html#the-travis-yml-file
50 | 	wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
51 | 	bash miniconda.sh -b -p $$HOME/miniconda
52 | 	export PATH="$$HOME/miniconda/bin:$$PATH"
53 | 	hash -r
54 | 	conda config --set always_yes yes --set changeps1 no
55 | 	conda update -q conda
56 | 	conda info -a
57 | 
58 | install:
59 | 	# --- Create anchor environment
60 | 	conda create -n anchor_py${PYTHON_VERSION} --yes python=${PYTHON_VERSION} pip
61 | 	# --- Update conda again
62 | 	conda update conda
63 | 	# --- Activate the environment, then install the conda requirements
64 | 	# --- first, the rest by pip, and finally anchor itself. The commands
65 | 	# --- are chained with && in one shell so "source activate" persists.
66 | 	source activate anchor_py${PYTHON_VERSION} && \
67 | 	conda install --yes --file conda_requirements.txt && \
68 | 	pip install -r requirements.txt && \
69 | 	pip install .
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Anchor logo](https://raw.githubusercontent.com/YeoLab/anchor/master/logo/v1/logo.png)
2 | 
3 | [![](https://img.shields.io/travis/YeoLab/anchor.svg)](https://travis-ci.org/YeoLab/anchor)[![](https://img.shields.io/pypi/v/anchor.svg)](https://pypi.python.org/pypi/anchor)[![codecov](https://codecov.io/gh/YeoLab/anchor/branch/master/graph/badge.svg)](https://codecov.io/gh/YeoLab/anchor)
4 | 
5 | ## What is `anchor`?
6 | 
7 | Anchor is a python package to find unimodal, bimodal, and multimodal features in any data that is normalized between 0 and 1, for example alternative splicing or other percent-based units.
8 | 
9 | * Free software: BSD license
10 | * Documentation: https://YeoLab.github.io/anchor
11 | 
12 | ## Installation
13 | 
14 | To install `anchor`, we recommend using the
15 | [Anaconda Python Distribution](http://anaconda.org/) and creating an
16 | environment, so the `anchor` code and dependencies don't interfere with
17 | anything else. Here is the command to create an environment:
18 | 
19 | 
20 | ```
21 | conda create -n anchor-env pandas scipy numpy matplotlib seaborn
22 | ```
23 | 
24 | ### Stable (recommended)
25 | 
26 | 
27 | To install this code from the Python Package Index, you'll need to specify ``anchor-bio`` (``anchor`` was already taken - boo).
28 | 
29 | ```
30 | pip install anchor-bio
31 | ```
32 | 
33 | ### Bleeding-edge (for the brave)
34 | 
35 | If you want the latest and greatest version, clone this github repository and use `pip` to install
36 | 
37 | ```
38 | git clone git@github.com:YeoLab/anchor
39 | cd anchor
40 | pip install .  # The "." 
means "install *this*, the folder where I am now" 41 | ``` 42 | 43 | 44 | ## Usage 45 | 46 | `anchor` was structured like `scikit-learn`, where if you want the "final 47 | answer" of your estimator, you use `fit_transform()`, but if you want to see the 48 | intermediates, you use `fit()`. 49 | 50 | If you want the modality assignments for your data, first make sure that you 51 | have a `pandas.DataFrame`, here it is called `data`, in the format (samples, 52 | features). This uses a log2 Bayes Factor cutoff of 5, and the default Beta 53 | distribution parameterizations (shown [here]()) 54 | 55 | ```python 56 | import anchor 57 | 58 | bm = anchor.BayesianModalities() 59 | modalities = bm.fit_transform(data) 60 | ``` 61 | 62 | If you want to see all the intermediate Bayes factors, then you can do: 63 | 64 | ```python 65 | import anchor 66 | 67 | bm = anchor.BayesianModalities() 68 | bayes_factors = bm.fit(data) 69 | ``` 70 | 71 | 72 | ## History 73 | 74 | ### 1.1.1 (2017-06-29) 75 | 76 | - In `infotheory.binify`, round the decimal numbers before they are written as strings 77 | 78 | ### 1.0.1 (2017-06-28) 79 | 80 | - Documentation and build fixes 81 | 82 | ### 1.0.0 (2017-06-28) 83 | 84 | * Updated to Python 3.5, 3.6 85 | 86 | ### 0.1.0 (2015-07-08) 87 | 88 | * First release on PyPI. 89 | -------------------------------------------------------------------------------- /anchor/tests/test_bayesian.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import numpy.testing as npt 4 | import pandas as pd 5 | import pandas.util.testing as pdt 6 | import pytest 7 | 8 | 9 | class TestModalityEstimator(object): 10 | @pytest.fixture 11 | def step(self): 12 | return 1. 13 | 14 | @pytest.fixture 15 | def vmax(self): 16 | return 20. 
17 | 
18 |     @pytest.fixture(params=[2, 3])
19 |     def logbf_thresh(self, request):
20 |         return request.param
21 | 
22 |     @pytest.fixture
23 |     def estimator(self, logbf_thresh):
24 |         from anchor.bayesian import BayesianModalities, ONE_PARAMETER_MODELS, \
25 |             TWO_PARAMETER_MODELS
26 | 
27 |         return BayesianModalities(
28 |             one_parameter_models=ONE_PARAMETER_MODELS,
29 |             two_parameter_models=TWO_PARAMETER_MODELS,
30 |             logbf_thresh=logbf_thresh)
31 | 
32 |     def test_init(self, logbf_thresh):
33 |         from anchor import BayesianModalities, ModalityModel
34 |         from anchor.bayesian import ONE_PARAMETER_MODELS, \
35 |             TWO_PARAMETER_MODELS
36 | 
37 |         estimator = BayesianModalities(
38 |             one_parameter_models=ONE_PARAMETER_MODELS,
39 |             two_parameter_models=TWO_PARAMETER_MODELS,
40 |             logbf_thresh=logbf_thresh)
41 | 
42 |         true_one_param_models = {k: ModalityModel(**v)
43 |                                  for k, v in ONE_PARAMETER_MODELS.items()}
44 | 
45 |         true_two_param_models = {k: ModalityModel(**v)
46 |                                  for k, v in TWO_PARAMETER_MODELS.items()}
47 | 
48 |         npt.assert_equal(estimator.logbf_thresh, logbf_thresh)
49 |         pdt.assert_dict_equal(estimator.one_param_models,
50 |                               true_one_param_models)
51 |         pdt.assert_dict_equal(estimator.two_param_models,
52 |                               true_two_param_models)
53 | 
54 |     @pytest.mark.xfail
55 |     def test_fit_transform_greater_than1(self, estimator):
56 |         nrows = 10
57 |         ncols = 5
58 |         data = pd.DataFrame(
59 |             np.abs(np.random.randn(nrows, ncols).reshape(nrows, ncols))+10)
60 |         estimator.fit(data)
61 | 
62 |     @pytest.mark.xfail
63 |     def test_fit_transform_less_than1(self, estimator):
64 |         nrows = 10
65 |         ncols = 5
66 |         data = pd.DataFrame(
67 |             np.abs(np.random.randn(nrows, ncols).reshape(nrows, ncols))-10)
68 |         estimator.fit(data)
69 | 
70 |     def test_positive_control(self, estimator, positive_control):
71 |         """Make sure estimator correctly assigns modalities to known events"""
72 |         log2bf = estimator.fit(positive_control.copy())
73 |         test = estimator.predict(log2bf)
74 | 
75 |         pdt.assert_numpy_array_equal(test.values, test.index.values)
76 | 
77 |     def test_violinplot(self, estimator):
78 |         estimator.violinplot(n=100)
79 |         fig = plt.gcf()
80 |         assert len(fig.axes) == len(estimator.models)
81 |         plt.close('all')
82 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributing
2 | ============
3 | 
4 | Contributions are welcome, and they are greatly appreciated! Every little bit helps, and credit will always be given.
5 | 
6 | You can contribute in many ways:
7 | 
8 | Types of Contributions
9 | ----------------------
10 | 
11 | ### Report Bugs
12 | 
13 | Report bugs at <https://github.com/YeoLab/anchor/issues>.
14 | 
15 | If you are reporting a bug, please include:
16 | 
17 | - Your operating system name and version.
18 | - Any details about your local setup that might be helpful in troubleshooting.
19 | - Detailed steps to reproduce the bug.
20 | 
21 | ### Fix Bugs
22 | 
23 | Look through the GitHub issues for bugs. Anything tagged with "bug" is open to whoever wants to implement it.
24 | 
25 | ### Implement Features
26 | 
27 | Look through the GitHub issues for features. Anything tagged with "feature" is open to whoever wants to implement it.
28 | 
29 | ### Write Documentation
30 | 
31 | anchor: Modality Estimator could always use more documentation, whether as part of the official anchor: Modality Estimator docs, in docstrings, or even on the web in blog posts, articles, and such.
32 | 
33 | ### Submit Feedback
34 | 
35 | The best way to send feedback is to file an issue at <https://github.com/YeoLab/anchor/issues>.
36 | 
37 | If you are proposing a feature:
38 | 
39 | - Explain in detail how it would work.
40 | - Keep the scope as narrow as possible, to make it easier to implement.
41 | - Remember that this is a volunteer-driven project, and that contributions are welcome :)
42 | 
43 | Get Started!
44 | ------------
45 | 
46 | Ready to contribute? Here's how to set up anchor for local development.
47 | 
48 | 1. Fork the anchor repo on GitHub.
49 | 2. Clone your fork locally:
50 | 
51 |         $ git clone git@github.com:your_name_here/anchor.git
52 | 
53 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:
54 | 
55 |         $ mkvirtualenv anchor
56 |         $ cd anchor/
57 |         $ python setup.py develop
58 | 
59 | 4. Create a branch for local development:
60 | 
61 |         $ git checkout -b name-of-your-bugfix-or-feature
62 | 
63 |     Now you can make your changes locally.
64 | 
65 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:
66 | 
67 |         $ flake8 anchor tests
68 |         $ py.test
69 |         $ tox
70 | 
71 |     To get flake8 and tox, just pip install them into your virtualenv.
72 | 
73 | 6. Commit your changes and push your branch to GitHub:
74 | 
75 |         $ git add .
76 |         $ git commit -m "Your detailed description of your changes."
77 |         $ git push origin name-of-your-bugfix-or-feature
78 | 
79 | 7. Submit a pull request through the GitHub website.
80 | 
81 | Pull Request Guidelines
82 | -----------------------
83 | 
84 | Before you submit a pull request, check that it meets these guidelines:
85 | 
86 | 1. The pull request should include tests.
87 | 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.md.
88 | 3. The pull request should work for Python 2.7, 3.4, 3.5, and 3.6, the versions tested on Travis. Check and make sure that the tests pass for all supported Python versions, as in the example below.
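    For example, to check the code against a single interpreter with tox before pushing (assuming a matching environment such as `py36` is defined in `tox.ini`):

        $ tox -e py36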
89 | 90 | Tips 91 | ---- 92 | 93 | To run a subset of tests: 94 | 95 | $ python -m unittest tests.test_anchor 96 | -------------------------------------------------------------------------------- /anchor/tests/test_infotheory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas.util.testing as pdt 3 | import pandas as pd 4 | import pytest 5 | import six 6 | 7 | 8 | @pytest.fixture 9 | def size(): 10 | return 10 11 | 12 | 13 | @pytest.fixture 14 | def data(size): 15 | df = pd.DataFrame(np.tile(np.arange(size), (size, 1))) 16 | df.index = df.index.astype(str) 17 | df.columns = df.columns.astype(str) 18 | return df 19 | 20 | 21 | @pytest.fixture 22 | def df1(data): 23 | return data 24 | 25 | 26 | @pytest.fixture 27 | def df2(data): 28 | return data.T 29 | 30 | 31 | @pytest.fixture 32 | def p(df1, bins): 33 | from anchor.infotheory import binify 34 | 35 | return binify(df1, bins) 36 | 37 | 38 | @pytest.fixture 39 | def q(df2, bins): 40 | from anchor.infotheory import binify 41 | 42 | return binify(df2, bins) 43 | 44 | 45 | @pytest.fixture 46 | def bins(size): 47 | return np.linspace(0, size, num=5) 48 | 49 | 50 | @pytest.fixture( 51 | params=((None, ['0-2.5', '2.5-5', '5-7.5', '7.5-10']), 52 | (':.2f', ['0.00-2.50', '2.50-5.00', '5.00-7.50', '7.50-10.00']))) 53 | def fmt_true(request): 54 | return request.param 55 | 56 | 57 | def test_bin_range_strings(bins, fmt_true): 58 | from anchor.infotheory import bin_range_strings 59 | 60 | fmt, true = fmt_true 61 | 62 | if fmt is None: 63 | test = bin_range_strings(bins) 64 | else: 65 | test = bin_range_strings(bins, fmt=fmt) 66 | 67 | assert test == true 68 | 69 | 70 | @pytest.fixture( 71 | params=(pytest.mark.xfail(-np.ones(10)), 72 | pytest.mark.xfail(np.zeros(10)), 73 | pytest.mark.xfail(np.ones(10)))) 74 | def x(request): 75 | return request.param 76 | 77 | 78 | def test__check_prob_dist(x): 79 | from anchor.infotheory import _check_prob_dist 80 | 81 | # All the tests should raise an error 82 | _check_prob_dist(x) 83 | 84 | 85 | def test_binify(df1, bins): 86 | from anchor.infotheory import binify 87 | 88 | test = binify(df1, bins) 89 | 90 | s = ''',0,1,2,3,4,5,6,7,8,9 91 | 0-2.5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 92 | 2.5-5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0 93 | 5-7.5,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0 94 | 7.5-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0''' 95 | true = pd.read_csv(six.StringIO(s), index_col=0) 96 | pdt.assert_frame_equal(test, true) 97 | 98 | 99 | def test_kld(p, q): 100 | from anchor.infotheory import kld 101 | test = kld(p, q) 102 | 103 | s = '''0,1.7369655941662063 104 | 1,1.7369655941662063 105 | 2,1.7369655941662063 106 | 3,2.321928094887362 107 | 4,2.321928094887362 108 | 5,1.7369655941662063 109 | 6,1.7369655941662063 110 | 7,1.7369655941662063 111 | 8,2.321928094887362 112 | 9,2.321928094887362''' 113 | true = pd.read_csv(six.StringIO(s), index_col=0, squeeze=True, header=None) 114 | true.index.name = None 115 | true.name = None 116 | true.index = true.index.astype(str) 117 | 118 | pdt.assert_series_equal(test, true) 119 | 120 | 121 | def test_jsd(p, q): 122 | from anchor.infotheory import jsd 123 | test = jsd(p, q) 124 | 125 | s = '''0,0.49342260576014463 126 | 1,0.49342260576014463 127 | 2,0.49342260576014463 128 | 3,0.6099865470109875 129 | 4,0.6099865470109875 130 | 5,0.49342260576014463 131 | 6,0.49342260576014463 132 | 7,0.49342260576014463 133 | 8,0.6099865470109875 134 | 9,0.6099865470109875''' 135 | true = 
pd.read_csv(six.StringIO(s), index_col=0, squeeze=True, header=None) 136 | true.index.name = None 137 | true.name = None 138 | true.index = true.index.astype(str) 139 | 140 | pdt.assert_series_equal(test, true) 141 | -------------------------------------------------------------------------------- /anchor/tests/test_model.py: -------------------------------------------------------------------------------- 1 | from collections import Iterable 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import numpy.testing as npt 6 | import pytest 7 | from scipy import stats 8 | from scipy.misc import logsumexp 9 | 10 | 11 | class TestModalityModel(object): 12 | @pytest.fixture() 13 | def x(self): 14 | return np.arange(0, 1.1, 0.1) 15 | 16 | @pytest.fixture(params=[1, np.arange(1, 5)]) 17 | def alphas(self, request): 18 | return request.param 19 | 20 | @pytest.fixture(params=[1, np.arange(1, 5)]) 21 | def betas(self, request): 22 | return request.param 23 | 24 | @pytest.fixture() 25 | def alpha(self): 26 | return np.arange(1, 5) 27 | 28 | @pytest.fixture() 29 | def beta(self): 30 | return 1. 31 | 32 | @pytest.fixture() 33 | def model(self, alpha, beta): 34 | from anchor import ModalityModel 35 | 36 | return ModalityModel(alpha, beta) 37 | 38 | def test_init(self, alphas, betas): 39 | from anchor import ModalityModel 40 | 41 | model = ModalityModel(alphas, betas) 42 | 43 | true_alphas = alphas 44 | true_betas = betas 45 | if not isinstance(alphas, Iterable) and not isinstance(betas, 46 | Iterable): 47 | true_alphas = [alphas] 48 | true_betas = [betas] 49 | 50 | true_alphas = np.array(true_alphas) \ 51 | if isinstance(true_alphas, Iterable) else np.ones( 52 | len(true_betas)) * true_alphas 53 | true_betas = np.array(true_betas) \ 54 | if isinstance(true_betas, Iterable) else np.ones( 55 | len(true_alphas)) * true_betas 56 | 57 | true_rvs = [stats.beta(a, b) for a, b in 58 | zip(true_alphas, true_betas)] 59 | true_scores = np.ones(true_alphas.shape).astype(float) 60 | true_scores = true_scores / true_scores.max() 61 | true_prob_parameters = true_scores / true_scores.sum() 62 | 63 | npt.assert_array_equal(model.alphas, true_alphas) 64 | npt.assert_array_equal(model.betas, true_betas) 65 | npt.assert_array_equal(model.scores, true_scores) 66 | npt.assert_array_equal(model.prob_parameters, true_prob_parameters) 67 | for test_rv, true_rv in zip(model.rvs, true_rvs): 68 | npt.assert_array_equal(test_rv.args, true_rv.args) 69 | 70 | def test_logliks(self, x, model): 71 | test_logliks = model.logliks(x) 72 | 73 | true_x = x.copy() 74 | true_x[true_x == 0] = 0.001 75 | true_x[true_x == 1] = 0.999 76 | true_logliks = np.array([np.log(prob) + rv.logpdf(true_x).sum() 77 | for prob, rv in zip(model.prob_parameters, 78 | model.rvs)]) 79 | npt.assert_array_equal(test_logliks, true_logliks) 80 | 81 | def test_logsumexp_logliks(self, x, model): 82 | test_logsumexp_logliks = model.logsumexp_logliks(x) 83 | 84 | npt.assert_array_equal(test_logsumexp_logliks, 85 | logsumexp(model.logliks(x))) 86 | 87 | def test_eq(self, alphas, betas): 88 | from anchor import ModalityModel 89 | 90 | model1 = ModalityModel(alphas, betas) 91 | model2 = ModalityModel(alphas, betas) 92 | assert model1 == model2 93 | 94 | def test_ne(self, alphas, betas): 95 | from anchor import ModalityModel 96 | 97 | if np.all(alphas == betas): 98 | assert 1 99 | return 100 | 101 | model1 = ModalityModel(alphas, betas) 102 | model2 = ModalityModel(betas, alphas) 103 | assert model1 != model2 104 | 105 | def test_violinplot(self, model): 106 | 
| model.violinplot(n=100)
107 |         ax = plt.gca()
108 |         assert len(ax.collections) == len(model.rvs)
109 |         plt.close('all')
110 | 
--------------------------------------------------------------------------------
/anchor/model.py:
--------------------------------------------------------------------------------
1 | """
2 | Model splicing events as beta distributions
3 | """
4 | 
5 | from collections import Iterable
6 | 
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import pandas as pd
10 | from scipy import stats
11 | from scipy.misc import logsumexp
12 | import seaborn as sns
13 | 
14 | from .visualize import violinplot
15 | 
16 | 
17 | VERY_SMALL_NUMBER = 0.001
18 | SINGLE_FEATURE_COLUMNS = [r'$\log$ Likelihood', r'$\alpha$', r'$\beta$']
19 | 
20 | 
21 | class ModalityModel(object):
22 |     """Object to model modalities from beta distributions"""
23 | 
24 |     def __init__(self, alphas, betas, ylabel='$\Psi$'):
25 |         """Model a family of beta distributions
26 | 
27 |         Parameters
28 |         ----------
29 |         alphas : float or list-like
30 |             List of values for the alpha parameter of the Beta distribution. If
31 |             this is a single value (not a list), it will be assumed that this
32 |             value is constant, and will be propagated through to have as many
33 |             values as the "betas" parameter
34 |         betas : float or list-like
35 |             List of values for the beta parameter of the Beta distribution. If
36 |             this is a single value (not a list), it will be assumed that this
37 |             value is constant, and will be propagated through to have as many
38 |             values as the "alphas" parameter
39 |         ylabel : str, optional
40 |             Name of the value you're estimating. Originally developed for
41 |             alternative splicing "percent spliced in"/"Psi" scores, the default
42 |             is the Greek letter Psi
43 |         """
44 |         if not isinstance(alphas, Iterable) and not isinstance(betas,
45 |                                                                 Iterable):
46 |             alphas = [alphas]
47 |             betas = [betas]
48 | 
49 |         self.ylabel = ylabel
50 | 
51 |         self.alphas = np.array(alphas) if isinstance(alphas, Iterable) \
52 |             else np.ones(len(betas)) * alphas
53 |         self.betas = np.array(betas) if isinstance(betas, Iterable) \
54 |             else np.ones(len(alphas)) * betas
55 | 
56 |         self.rvs = [stats.beta(a, b) for a, b in
57 |                     zip(self.alphas, self.betas)]
58 |         self.scores = np.ones(self.alphas.shape).astype(float)
59 |         self.prob_parameters = self.scores/self.scores.sum()
60 | 
61 |     def __eq__(self, other):
62 |         """Test equality with other model"""
63 |         return np.all(self.alphas == other.alphas) \
64 |             and np.all(self.betas == other.betas) \
65 |             and np.all(self.prob_parameters == other.prob_parameters)
66 | 
67 |     def __ne__(self, other):
68 |         """Test not equality with other model"""
69 |         return not self.__eq__(other)
70 | 
71 |     def logliks(self, x):
72 |         """Calculate log-likelihood of a feature x for each model
73 | 
74 |         Converts all values that are exactly 1 or exactly 0 to 0.999 and 0.001
75 |         because they are out of range of the beta distribution.
76 | 
77 |         Parameters
78 |         ----------
79 |         x : numpy.array-like
80 |             A single vector to estimate the log-likelihood of the models on
81 | 
82 |         Returns
83 |         -------
84 |         logliks : numpy.array
85 |             Log-likelihood of these data in each member of the model's family
86 |         """
87 |         x = x.copy()
88 | 
89 |         # Replace values of exactly 0 and exactly 1 with a small offset
90 |         # (VERY_SMALL_NUMBER = 0.001) because the Beta log-density is not
91 |         # finite at exactly 0 or 1.
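        # e.g. psi = 0 becomes 0.001 and psi = 1 becomes 0.999, so every
        # observation stays strictly inside the open interval (0, 1)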
92 |         x[x == 0] = VERY_SMALL_NUMBER
93 |         x[x == 1] = 1 - VERY_SMALL_NUMBER
94 | 
95 |         return np.array([np.log(prob) + rv.logpdf(x[np.isfinite(x)]).sum()
96 |                          for prob, rv in
97 |                          zip(self.prob_parameters, self.rvs)])
98 | 
99 |     def single_feature_logliks(self, feature):
100 |         data = zip(self.logliks(feature), self.alphas, self.betas)
101 |         return pd.DataFrame(data, columns=SINGLE_FEATURE_COLUMNS)
102 | 
103 |     def logsumexp_logliks(self, x):
104 |         """Calculate how well this model fits these data
105 | 
106 |         Parameters
107 |         ----------
108 |         x : numpy.array-like
109 |             A single vector to estimate the log-likelihood of the models on
110 | 
111 |         Returns
112 |         -------
113 |         logsumexp_logliks : float
114 |             Total log-likelihood of this model given this data
115 |         """
116 |         return logsumexp(self.logliks(x))
117 | 
118 |     @staticmethod
119 |     def nice_number_string(number, decimal_places=2):
120 |         """Convert floats to either integers or a nice looking fraction"""
121 |         if number == np.round(number):
122 |             return str(int(number))
123 |         if 0 < number < 1:
124 |             inverse = 1 / number
125 |             if int(inverse) == np.round(inverse):
126 |                 return r'\frac{{1}}{{{}}}'.format(int(inverse))
127 |         # Fall back to fixed-precision formatting for any other value
128 |         template = '{{:.{0}}}'.format(decimal_places)
129 |         return template.format(number)
130 | 
131 |     def violinplot(self, n=1000, **kwargs):
132 |         """Plot violins of each distribution in the model family
133 | 
134 |         Parameters
135 |         ----------
136 |         n : int
137 |             Number of random variables to generate
138 |         kwargs : dict or keywords
139 |             Any keyword arguments to seaborn.violinplot
140 | 
141 |         Returns
142 |         -------
143 |         ax : matplotlib.Axes object
144 |             Axes object with violins plotted
145 |         """
146 |         kwargs.setdefault('palette', 'Purples')
147 | 
148 |         dfs = []
149 | 
150 |         for rv in self.rvs:
151 |             psi = rv.rvs(n)
152 |             df = pd.Series(psi, name=self.ylabel).to_frame()
153 |             alpha, beta = rv.args
154 |             alpha = self.nice_number_string(alpha, decimal_places=2)
155 |             beta = self.nice_number_string(beta, decimal_places=2)
156 | 
157 |             df['parameters'] = '$\\alpha = {0}$\n$\\beta = {1}$'.format(
158 |                 alpha, beta)
159 |             dfs.append(df)
160 |         data = pd.concat(dfs)
161 | 
162 |         if 'ax' not in kwargs:
163 |             fig, ax = plt.subplots(figsize=(len(self.alphas)*0.625, 4))
164 |         else:
165 |             ax = kwargs.pop('ax')
166 |         ax = violinplot(x='parameters', y=self.ylabel, data=data,
167 |                         ax=ax, **kwargs)
168 |         sns.despine(ax=ax)
169 |         return ax
170 | 
--------------------------------------------------------------------------------
/anchor/simulate.py:
--------------------------------------------------------------------------------
1 | 
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | import seaborn as sns
6 | 
7 | import six
8 | 
9 | from .visualize import violinplot, MODALITY_ORDER, MODALITY_TO_COLOR, barplot
10 | 
11 | 
12 | def add_noise(data, iteration_per_noise=100,
13 |               noise_percentages=np.arange(0, 101, step=10), plot=True,
14 |               violinplot_kws=None, figure_prefix='anchor_simulation'):
15 | 
16 |     data_dfs = []
17 | 
18 |     violinplot_kws = {} if violinplot_kws is None else violinplot_kws
19 | 
20 |     width = len(data.columns) * 0.75
21 |     alpha = max(0.05, 1. 
/ iteration_per_noise)
22 | 
23 |     for noise_percentage in noise_percentages:
24 |         if plot:
25 |             fig, ax = plt.subplots(figsize=(width, 3))
26 |         for iteration in range(iteration_per_noise):
27 |             if iteration > 0 and noise_percentage == 0:
28 |                 continue
29 |             noisy_data = data.copy()
30 |             # Replace noise_percentage% of the rows with uniform noise
31 |             n_noisy = int(noisy_data.shape[0] * noise_percentage / 100)
32 |             shape = (n_noisy, noisy_data.shape[1])
33 |             size = np.product(shape)
34 |             noise_ind = np.random.choice(noisy_data.index, size=n_noisy,
35 |                                          replace=False)
36 |             noisy_data.loc[noise_ind] = np.random.uniform(
37 |                 low=0., high=1., size=size).reshape(shape)
38 | 
39 |             renamer = dict(
40 |                 (col, '{}_noise{}_iter{}'.format(
41 |                     col, noise_percentage, iteration))
42 |                 for col in noisy_data.columns)
43 | 
44 |             renamed = noisy_data.rename(columns=renamer)
45 |             data_dfs.append(renamed)
46 |             if plot:
47 |                 noisy_data_tidy = noisy_data.unstack()
48 |                 noisy_data_tidy = noisy_data_tidy.reset_index()
49 |                 noisy_data_tidy = noisy_data_tidy.rename(
50 |                     columns={'level_0': 'Feature ID',
51 |                              'level_1': 'Sample ID',
52 |                              0: '$\Psi$'})
53 |                 violinplot(x='Feature ID', y='$\Psi$',
54 |                            data=noisy_data_tidy, ax=ax,
55 |                            **violinplot_kws)
56 | 
57 |         if plot:
58 |             if noise_percentage > 0:
59 |                 for c in ax.collections:
60 |                     c.set_alpha(alpha)
61 |             ax.set(ylim=(0, 1), title='{}% Uniform Noise'.format(
62 |                 noise_percentage), yticks=(0, 0.5, 1), ylabel='$\Psi$',
63 |                 xlabel='')
64 |             plt.setp(ax.get_xticklabels(), rotation=90)
65 |             sns.despine()
66 |             fig.tight_layout()
67 |             fig.savefig('{}_noise_percentage_{}.pdf'.format(figure_prefix,
68 |                                                             noise_percentage))
69 | 
70 |     all_noisy_data = pd.concat(data_dfs, axis=1)
71 |     return all_noisy_data
72 | 
73 | 
74 | class ModalityEvaluator(object):
75 | 
76 |     def __init__(self, estimator, data, waypoints, fitted, predicted):
77 |         self.estimator = estimator
78 |         self.data = data
79 |         self.predicted = predicted
80 |         self.fitted = fitted
81 |         self.waypoints = waypoints
82 | 
83 | 
84 | def evaluate_estimator(estimator, data, waypoints=None, figure_prefix=''):
85 |     #
86 |     # estimator.violinplot(n=1e3)
87 |     # fig = plt.gcf()
88 |     # for ax in fig.axes:
89 |     #     ax.set(yticks=[0, 0.5, 1], xlabel='')
90 |     #     # xticklabels =
91 |     #     # ax.set_xticklabels(fontsize=20)
92 |     # fig.tight_layout()
93 |     # sns.despine()
94 |     # fig.savefig('{}_modality_parameterization.pdf'.format(figure_prefix))
95 | 
96 |     fitted = estimator.fit(data)
97 |     predicted = estimator.predict(fitted)
98 |     predicted.name = 'Predicted Modality'
99 | 
100 |     fitted_tidy = fitted.stack().reset_index()
101 |     fitted_tidy = fitted_tidy.rename(
102 |         columns={'level_1': 'Feature ID', 'level_0': "Modality",
103 |                  0: estimator.score_name}, copy=False)
104 | 
105 |     predicted_tidy = predicted.to_frame().reset_index()
106 |     predicted_tidy = predicted_tidy.rename(columns={'index': 'Feature ID'})
107 |     predicted_tidy = predicted_tidy.merge(
108 |         fitted_tidy, left_on=['Feature ID', 'Predicted Modality'],
109 |         right_on=['Feature ID', 'Modality'])
110 | 
111 |     # Make categorical so they are plotted in the correct order
112 |     predicted_tidy['Predicted Modality'] = \
113 |         pd.Categorical(predicted_tidy['Predicted Modality'],
114 |                        categories=MODALITY_ORDER, ordered=True)
115 |     predicted_tidy['Modality'] = \
116 |         pd.Categorical(predicted_tidy['Modality'],
117 |                        categories=MODALITY_ORDER, ordered=True)
118 | 
119 |     grouped = data.groupby(predicted, axis=1)
120 | 
121 |     size = 5
122 | 
123 |     fig, axes = plt.subplots(figsize=(size*0.75, 8), nrows=len(grouped))
124 | 
125 |     for ax, (modality, df) in zip(axes, grouped):
126
| random_ids = np.random.choice(df.columns, replace=False, size=size) 127 | random_df = df[random_ids] 128 | 129 | tidy_random = random_df.stack().reset_index() 130 | tidy_random = tidy_random.rename(columns={'level_0': 'sample_id', 131 | 'level_1': 'event_id', 132 | 0: '$\Psi$'}) 133 | sns.violinplot(x='event_id', y='$\Psi$', data=tidy_random, 134 | color=MODALITY_TO_COLOR[modality], ax=ax, 135 | inner=None, bw=0.2, scale='width') 136 | ax.set(ylim=(0, 1), yticks=(0, 0.5, 1), xticks=[], xlabel='', 137 | title=modality) 138 | sns.despine() 139 | fig.tight_layout() 140 | fig.savefig('{}_random_estimated_modalities.pdf'.format(figure_prefix)) 141 | 142 | g = barplot(predicted_tidy, hue='Modality') 143 | g.savefig('{}_modalities_barplot.pdf'.format(figure_prefix)) 144 | 145 | plot_best_worst_fits(predicted_tidy, data, modality_col='Modality', 146 | score=estimator.score_name) 147 | fig = plt.gcf() 148 | fig.savefig('{}_best_worst_fit_violinplots.pdf'.format(figure_prefix)) 149 | 150 | fitted.to_csv('{}_fitted.csv'.format(figure_prefix)) 151 | predicted.to_csv('{}_predicted.csv'.format(figure_prefix)) 152 | 153 | result = ModalityEvaluator(estimator, data, waypoints, fitted, predicted) 154 | 155 | return result 156 | 157 | 158 | def plot_best_worst_fits(assignments_df, data, modality_col='Modality', 159 | score='$\log_2 K$'): 160 | """Violinplots of the highest and lowest scoring of each modality""" 161 | ncols = 2 162 | nrows = len(assignments_df.groupby(modality_col).groups.keys()) 163 | 164 | fig, axes = plt.subplots(nrows=nrows, ncols=ncols, 165 | figsize=(nrows*4, ncols*6)) 166 | 167 | axes_iter = axes.flat 168 | 169 | fits = 'Highest', 'Lowest' 170 | 171 | for modality, df in assignments_df.groupby(modality_col): 172 | df = df.sort_values(score) 173 | 174 | color = MODALITY_TO_COLOR[modality] 175 | 176 | for fit in fits: 177 | if fit == 'Highest': 178 | ids = df['Feature ID'][-10:] 179 | else: 180 | ids = df['Feature ID'][:10] 181 | fit_psi = data[ids] 182 | tidy_fit_psi = fit_psi.stack().reset_index() 183 | tidy_fit_psi = tidy_fit_psi.rename(columns={'level_0': 'Sample ID', 184 | 'level_1': 185 | 'Feature ID', 186 | 0: '$\Psi$'}) 187 | if tidy_fit_psi.empty: 188 | continue 189 | ax = six.next(axes_iter) 190 | violinplot(x='Feature ID', y='$\Psi$', data=tidy_fit_psi, 191 | color=color, ax=ax) 192 | ax.set(title='{} {} {}'.format(fit, score, modality), xticks=[]) 193 | sns.despine() 194 | fig.tight_layout() 195 | -------------------------------------------------------------------------------- /anchor/infotheory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Information-theoretic calculations 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn import cross_validation 8 | 9 | EPSILON = 100 * np.finfo(float).eps 10 | 11 | 12 | def bin_range_strings(bins, fmt=':g'): 13 | """Given a list of bins, make a list of strings of those bin ranges 14 | 15 | Parameters 16 | ---------- 17 | bins : list_like 18 | List of anything, usually values of bin edges 19 | 20 | Returns 21 | ------- 22 | bin_ranges : list 23 | List of bin ranges 24 | 25 | >>> bin_range_strings((0, 0.5, 1)) 26 | ['0-0.5', '0.5-1'] 27 | """ 28 | return [('{' + fmt + '}-{' + fmt + '}').format(i, j) 29 | for i, j in zip(bins, bins[1:])] 30 | 31 | 32 | def _check_prob_dist(x): 33 | if np.any(x < 0): 34 | raise ValueError('Each column of the input dataframes must be ' 35 | '**non-negative** probability distributions') 36 | try: 37 | if np.any(np.abs(x.sum() - 
np.ones(x.shape[1])) > EPSILON):
38 |             raise ValueError('Each column of the input dataframe must be '
39 |                              'probability distributions that **sum to 1**')
40 |     except IndexError:
41 |         if np.any(np.abs(x.sum() - 1) > EPSILON):
42 |             raise ValueError('Each column of the input dataframe must be '
43 |                              'probability distributions that **sum to 1**')
44 | 
45 | 
46 | def binify(data, bins):
47 |     """Make a histogram of each column with the provided bins
48 | 
49 |     Parameters
50 |     ----------
51 |     data : pandas.DataFrame
52 |         A samples x features dataframe. Each feature (column) will be binned
53 |         into the provided bins
54 |     bins : iterable
55 |         Bins you would like to use for this data. Must include the final bin
56 |         value, e.g. (0, 0.5, 1) for the two bins (0, 0.5) and (0.5, 1).
57 |         nbins = len(bins) - 1
58 | 
59 |     Returns
60 |     -------
61 |     binned : pandas.DataFrame
62 |         An nbins x features DataFrame of each column binned across rows
63 |     """
64 |     if bins is None:
65 |         raise ValueError('Must specify "bins"')
66 |     if isinstance(data, pd.DataFrame):
67 |         binned = data.apply(lambda x: pd.Series(np.histogram(x, bins=bins,
68 |                                                              range=(0, 1))[0]))
69 |     elif isinstance(data, pd.Series):
70 |         binned = pd.Series(np.histogram(data, bins=bins, range=(0, 1))[0])
71 |     else:
72 |         raise ValueError('`data` must be either a 1d vector or 2d matrix')
73 |     binned.index = bin_range_strings(bins)
74 | 
75 |     # Normalize so each column sums to 1
76 |     binned = binned / binned.sum().astype(float)
77 |     return binned
78 | 
79 | 
80 | def kld(p, q):
81 |     """Kullback-Leibler divergence of two probability distributions pandas
82 |     dataframes, p and q
83 | 
84 |     Parameters
85 |     ----------
86 |     p : pandas.DataFrame
87 |         An nbins x features DataFrame, or (nbins,) Series
88 |     q : pandas.DataFrame
89 |         An nbins x features DataFrame, or (nbins,) Series
90 | 
91 |     Returns
92 |     -------
93 |     kld : pandas.Series
94 |         Kullback-Leibler divergence of the common columns between the
95 |         dataframes. E.g. between 1st column in p and 1st column in q, and 2nd
96 |         column in p and 2nd column in q.
97 | 
98 |     Raises
99 |     ------
100 |     ValueError
101 |         If the data provided is not a probability distribution, i.e. it has
102 |         negative values or its columns do not sum to 1, raise ValueError
103 | 
104 |     Notes
105 |     -----
106 |     The input to this function must be probability distributions, not raw
107 |     values. Otherwise, the output makes no sense.
108 |     """
109 |     try:
110 |         _check_prob_dist(p)
111 |         _check_prob_dist(q)
112 |     except ValueError:
113 |         return np.nan
114 |     # If one of them is zero, then the other should be considered to be 0.
115 |     # In this problem formulation, log0 = 0
116 |     p = p.replace(0, np.nan)
117 |     q = q.replace(0, np.nan)
118 | 
119 |     return (np.log2(p / q) * p).sum(axis=0)
120 | 
121 | 
122 | def jsd(p, q):
123 |     """Finds the per-column JSD between dataframes p and q
124 | 
125 |     Jensen-Shannon divergence of two probability distributions pandas
126 |     dataframes, p and q. These distributions are usually created by running
127 |     binify() on the dataframe.
128 | 
129 |     Parameters
130 |     ----------
131 |     p : pandas.DataFrame
132 |         An nbins x features DataFrame.
133 |     q : pandas.DataFrame
134 |         An nbins x features DataFrame.
135 | 
136 |     Returns
137 |     -------
138 |     jsd : pandas.Series
139 |         Jensen-Shannon divergence of each column with the same names between
140 |         p and q
141 | 
142 |     Raises
143 |     ------
144 |     ValueError
145 |         If the data provided is not a probability distribution, i.e. 
it has
146 |         negative values or its columns do not sum to 1, raise ValueError
147 |     """
148 |     try:
149 |         _check_prob_dist(p)
150 |         _check_prob_dist(q)
151 |     except ValueError:
152 |         return np.nan
153 |     weight = 0.5
154 |     m = weight * (p + q)
155 | 
156 |     result = weight * kld(p, m) + (1 - weight) * kld(q, m)
157 |     return result
158 | 
159 | 
160 | def entropy(binned, base=2):
161 |     """Find the entropy of each column of a dataframe
162 | 
163 |     Parameters
164 |     ----------
165 |     binned : pandas.DataFrame
166 |         A nbins x features DataFrame of probability distributions, where each
167 |         column sums to 1
168 |     base : numeric
169 |         The log-base of the entropy. Default is 2, so the resulting entropy
170 |         is in bits.
171 | 
172 |     Returns
173 |     -------
174 |     entropy : pandas.Series
175 |         Entropy values for each column of the dataframe.
176 | 
177 |     Raises
178 |     ------
179 |     ValueError
180 |         If the data provided is not a probability distribution, i.e. it has
181 |         negative values or its columns do not sum to 1, raise ValueError
182 |     """
183 |     try:
184 |         _check_prob_dist(binned)
185 |     except ValueError:
186 |         return np.nan
187 |     return -((np.log(binned) / np.log(base)) * binned).sum(axis=0)
188 | 
189 | 
190 | def binify_and_jsd(df1, df2, bins, pair=None):
191 |     """Binify and calculate jensen-shannon divergence between two dataframes
192 | 
193 |     Parameters
194 |     ----------
195 |     df1, df2 : pandas.DataFrames
196 |         Dataframes to calculate JSD between columns of. Must have overlapping
197 |         column names
198 |     bins : array-like
199 |         Bins to use for transforming df{1,2} into probability distributions
200 |     pair : str, optional
201 |         Name of the pair to save as the name of the series
202 | 
203 |     Returns
204 |     -------
205 |     divergence : pandas.Series
206 |         The Jensen-Shannon divergence between columns of df1, df2
207 |     """
208 |     binned1 = binify(df1, bins=bins).dropna(how='all', axis=1)
209 |     binned2 = binify(df2, bins=bins).dropna(how='all', axis=1)
210 | 
211 |     binned1, binned2 = binned1.align(binned2, axis=1, join='inner')
212 | 
213 |     series = np.sqrt(jsd(binned1, binned2))
214 |     series.name = pair
215 |     return series
216 | 
217 | 
218 | def cross_phenotype_jsd(data, groupby, bins, n_iter=100):
219 |     """Jensen-Shannon divergence of features across phenotypes
220 | 
221 |     Parameters
222 |     ----------
223 |     data : pandas.DataFrame
224 |         A (n_samples, n_features) Dataframe
225 |     groupby : mappable
226 |         A samples to phenotypes mapping
227 |     bins : array-like
228 |         Bins to binify the data on
229 |     n_iter : int
230 |         Number of bootstrap resampling iterations to perform for the
231 |         within-group comparisons
232 | 
233 |     Returns
234 |     -------
235 |     jsd_df : pandas.DataFrame
236 |         A (n_features, n_phenotypes^2) dataframe of the JSD between each
237 |         feature between and within phenotypes
238 |     """
239 |     grouped = data.groupby(groupby)
240 |     jsds = []
241 | 
242 |     seen = set([])
243 | 
244 |     for phenotype1, df1 in grouped:
245 |         for phenotype2, df2 in grouped:
246 |             pair = tuple(sorted([phenotype1, phenotype2]))
247 |             if pair in seen:
248 |                 continue
249 |             seen.add(pair)
250 | 
251 |             if phenotype1 == phenotype2:
252 |                 seriess = []
253 |                 bs = cross_validation.Bootstrap(df1.shape[0], n_iter=n_iter,
254 |                                                 train_size=0.5)
255 |                 for i, (ind1, ind2) in enumerate(bs):
256 |                     df1_subset = df1.iloc[ind1, :]
257 |                     df2_subset = df2.iloc[ind2, :]
258 |                     seriess.append(
259 |                         binify_and_jsd(df1_subset, df2_subset, bins, pair))
260 |                 series = pd.concat(seriess, axis=1, names=None).mean(axis=1)
261 |                 series.name = pair
262 | 
def binify_and_jsd(df1, df2, bins, pair=None):
    """Binify and calculate Jensen-Shannon divergence between two dataframes

    Parameters
    ----------
    df1, df2 : pandas.DataFrames
        Dataframes to calculate JSD between columns of. Must have
        overlapping column names
    bins : array-like
        Bins to use for transforming df{1,2} into probability distributions
    pair : str, optional
        Name of the pair to save as the name of the series

    Returns
    -------
    divergence : pandas.Series
        The Jensen-Shannon distance (square root of the Jensen-Shannon
        divergence) between columns of df1, df2
    """
    binned1 = binify(df1, bins=bins).dropna(how='all', axis=1)
    binned2 = binify(df2, bins=bins).dropna(how='all', axis=1)

    binned1, binned2 = binned1.align(binned2, axis=1, join='inner')

    series = np.sqrt(jsd(binned1, binned2))
    series.name = pair
    return series


def cross_phenotype_jsd(data, groupby, bins, n_iter=100):
    """Jensen-Shannon divergence of features across phenotypes

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_samples, n_features) Dataframe
    groupby : mappable
        A samples to phenotypes mapping
    bins : array-like
        Bins to use for transforming the data into probability distributions
    n_iter : int
        Number of bootstrap resampling iterations to perform for the
        within-group comparisons

    Returns
    -------
    jsd_df : pandas.DataFrame
        A (n_features, n_phenotypes^2) dataframe of the JSD between each
        feature between and within phenotypes
    """
    grouped = data.groupby(groupby)
    jsds = []

    seen = set([])

    for phenotype1, df1 in grouped:
        for phenotype2, df2 in grouped:
            pair = tuple(sorted([phenotype1, phenotype2]))
            if pair in seen:
                continue
            seen.add(pair)

            if phenotype1 == phenotype2:
                seriess = []
                bs = cross_validation.Bootstrap(df1.shape[0], n_iter=n_iter,
                                                train_size=0.5)
                for ind1, ind2 in bs:
                    df1_subset = df1.iloc[ind1, :]
                    df2_subset = df2.iloc[ind2, :]
                    seriess.append(
                        binify_and_jsd(df1_subset, df2_subset, bins))
                series = pd.concat(seriess, axis=1).mean(axis=1)
                series.name = pair
                jsds.append(series)
            else:
                series = binify_and_jsd(df1, df2, bins, pair)
                jsds.append(series)
    return pd.concat(jsds, axis=1)


def jsd_df_to_2d(jsd_df):
    """Transform a tall JSD dataframe to a square matrix of mean JSDs

    Parameters
    ----------
    jsd_df : pandas.DataFrame
        A (n_features, n_phenotypes^2) dataframe of the JSD between each
        feature between and within phenotypes

    Returns
    -------
    jsd_2d : pandas.DataFrame
        A (n_phenotypes, n_phenotypes) symmetric dataframe of the mean JSD
        between and within phenotypes
    """
    jsd_2d = jsd_df.mean().reset_index()
    jsd_2d = jsd_2d.rename(
        columns={'level_0': 'phenotype1', 'level_1': 'phenotype2', 0: 'jsd'})
    jsd_2d = jsd_2d.pivot(index='phenotype1', columns='phenotype2',
                          values='jsd')
    # Mirror the upper triangle into the (previously all-NA) lower triangle
    # so the returned matrix is symmetric
    return jsd_2d.fillna(0) + np.tril(jsd_2d.T, -1)
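# A hypothetical end-to-end sketch (sample data and phenotype labels are
# invented): compare every feature's distribution between and within two
# phenotypes, then collapse the result to a symmetric phenotype x phenotype
# matrix of mean Jensen-Shannon distances.
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> psi = pd.DataFrame(np.random.uniform(0, 1, size=(20, 4)))
# >>> phenotypes = pd.Series(['iPSC'] * 10 + ['NPC'] * 10, index=psi.index)
# >>> jsd_df = cross_phenotype_jsd(psi, groupby=phenotypes,
# ...                              bins=np.linspace(0, 1, 11))
# >>> jsd_2d = jsd_df_to_2d(jsd_df)  # (2, 2) symmetric matrix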
--------------------------------------------------------------------------------
/anchor/visualize.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""See log bayes factors which led to modality categorization"""
import locale

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from .names import NEAR_ZERO, NEAR_HALF, NEAR_ONE, BIMODAL, \
    NULL_MODEL


darkblue, green, red, purple, yellow, lightblue = sns.color_palette('deep')
MODALITY_ORDER = [NEAR_ZERO, BIMODAL, NEAR_ONE, NEAR_HALF, NULL_MODEL]

MODALITY_TO_COLOR = {NEAR_ZERO: lightblue, NEAR_HALF: yellow, NEAR_ONE: red,
                     BIMODAL: purple, NULL_MODEL: 'lightgrey'}
MODALITY_PALETTE = [MODALITY_TO_COLOR[m] for m in MODALITY_ORDER]

MODALITY_TO_CMAP = {
    NEAR_ZERO: sns.light_palette(MODALITY_TO_COLOR[NEAR_ZERO], as_cmap=True),
    NEAR_HALF: sns.light_palette(MODALITY_TO_COLOR[NEAR_HALF], as_cmap=True),
    NEAR_ONE: sns.light_palette(MODALITY_TO_COLOR[NEAR_ONE], as_cmap=True),
    BIMODAL: sns.light_palette(MODALITY_TO_COLOR[BIMODAL], as_cmap=True),
    NULL_MODEL: mpl.cm.Greys}

MODALITY_FACTORPLOT_KWS = dict(hue_order=MODALITY_ORDER,
                               palette=MODALITY_PALETTE)


def violinplot(x=None, y=None, data=None, bw=0.2, scale='width',
               inner=None, ax=None, **kwargs):
    """Wrapper around seaborn's violinplot specifically for [0, 1] data

    What's different:
    - bw=0.2: sets the bandwidth to be small and the same between datasets
    - scale='width': sets the width of all violinplots to be the same
    - inner=None: don't plot a boxplot or points inside the violinplot
    """
    if ax is None:
        ax = plt.gca()

    sns.violinplot(x, y, data=data, bw=bw, scale=scale, inner=inner, ax=ax,
                   **kwargs)
    ax.set(ylim=(0, 1), yticks=(0, 0.5, 1))
    return ax
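# A minimal usage sketch (hypothetical data): the wrapper pins the y-axis to
# [0, 1] so PSI-like distributions from different events stay comparable.
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> df = pd.DataFrame({'event': np.repeat(['A', 'B'], 50),
# ...                    'psi': np.random.beta(0.5, 0.5, size=100)})
# >>> ax = violinplot(x='event', y='psi', data=df)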
class _ModelLoglikPlotter(object):
    def __init__(self):
        self.fig = plt.figure(figsize=(5 * 2, 4))
        self.ax_violin = plt.subplot2grid((3, 5), (0, 0), rowspan=3,
                                          colspan=1)
        self.ax_loglik = plt.subplot2grid((3, 5), (0, 1), rowspan=3,
                                          colspan=3)
        self.ax_bayesfactor = plt.subplot2grid((3, 5), (0, 4), rowspan=3,
                                               colspan=1)

    def plot(self, feature, logliks, logsumexps, log2bf_thresh, renamed=''):
        modality = logsumexps.idxmax()

        self.logliks = logliks
        self.logsumexps = logsumexps

        # Name the feature before converting to a dataframe, so the violin
        # plot can find its column
        if feature.name is None:
            feature.name = 'Feature'
        x = feature.to_frame()
        x['sample_id'] = feature.name

        violinplot(x='sample_id', y=feature.name, data=x, ax=self.ax_violin,
                   color=MODALITY_TO_COLOR[modality])

        self.ax_violin.set(xticks=[], ylabel='')

        for name, loglik in logliks.groupby('Modality')[r'$\log$ Likelihood']:
            self.ax_loglik.plot(loglik, 'o-', label=name, alpha=0.75,
                                color=MODALITY_TO_COLOR[name])
        self.ax_loglik.legend(loc='best')
        self.ax_loglik.set(ylabel=r'$\log$ Likelihood',
                           xlabel='Parameterizations',
                           title='Assignment: {}'.format(modality))
        # Invisible xlabel to reserve vertical space for the figure text
        self.ax_loglik.set_xlabel('phantom', color='white')

        for i, (name, height) in enumerate(logsumexps.iteritems()):
            self.ax_bayesfactor.bar(i, height, label=name,
                                    color=MODALITY_TO_COLOR[name])
        xmin, xmax = self.ax_bayesfactor.get_xlim()
        self.ax_bayesfactor.hlines(log2bf_thresh, xmin, xmax,
                                   linestyle='dashed')
        self.ax_bayesfactor.set(ylabel=r'$\log K$', xticks=[])
        if renamed:
            text = '{} ({})'.format(feature.name, renamed)
        else:
            text = feature.name
        self.fig.text(0.5, .025, text, fontsize=10, ha='center',
                      va='bottom')
        sns.despine()
        self.fig.tight_layout()
        return self


class ModalitiesViz(object):
    """Visualize results of modality assignments"""

    modality_order = MODALITY_ORDER
    modality_to_color = MODALITY_TO_COLOR
    modality_palette = MODALITY_PALETTE

    def bar(self, counts, phenotype_to_color=None, ax=None, percentages=True):
        """Draw barplots of the modality counts, grouped by modality

        Parameters
        ----------
        counts : pandas.DataFrame
            A (n_groups, n_modalities) dataframe of the number of events
            assigned to each modality, per group
        phenotype_to_color : dict, optional
            A group name to bar color mapping
        ax : matplotlib.axes.Axes, optional
            Axes to plot on. If not provided, the current axes are used
        percentages : bool, optional
            If True (default), normalize each group's counts to percentages
        """
        if percentages:
            counts = 100 * (counts.T / counts.T.sum()).T

        if ax is None:
            ax = plt.gca()

        full_width = 0.8
        width = full_width / counts.shape[0]
        for i, (group, series) in enumerate(counts.iterrows()):
            left = np.arange(len(self.modality_order)) + i * width
            height = [series[modality] if modality in series else 0
                      for modality in self.modality_order]
            color = phenotype_to_color[group] \
                if phenotype_to_color is not None else None
            ax.bar(left, height, width=width, color=color, label=group,
                   linewidth=.5, edgecolor='k')
        ylabel = 'Percentage of events' if percentages else 'Number of events'
        ax.set_ylabel(ylabel)
        ax.set_xticks(np.arange(len(self.modality_order)) + full_width / 2)
        ax.set_xticklabels(self.modality_order)
        ax.set_xlabel('Splicing modality')
        ax.set_xlim(0, len(self.modality_order))
        ax.legend(loc='best')
        ax.grid(axis='y', linestyle='-', linewidth=0.5)
        sns.despine()

    def event_estimation(self, event, logliks, logsumexps, log2bf_thresh=10,
                         renamed=''):
        """Show the values underlying bayesian modality estimation of an event

        Parameters
        ----------
        event : pandas.Series
            A single event's values across samples
        logliks : pandas.DataFrame
            Log-likelihoods of each model at each parameterization
        logsumexps : pandas.Series
            logsumexp'd log-likelihoods of each modality
        log2bf_thresh : float, optional
            Threshold to draw on the log2 Bayes factor panel
        renamed : str, optional
            A human-readable name to show alongside the event's identifier

        Returns
        -------
        plotter : _ModelLoglikPlotter
            The plotter object, with the rendered figure attached
        """
        plotter = _ModelLoglikPlotter()
        plotter.plot(event, logliks, logsumexps, log2bf_thresh,
                     renamed=renamed)
        return plotter


def annotate_bars(x, group_col, percentage_col, modality_col, count_col,
                  **kwargs):
    data = kwargs.pop('data')
    ax = plt.gca()
    # Five modalities share each group's 0.8-wide slot of bars
    width = 0.8 / 5.
    x_base = -.49 - width / 2.5
    for group, group_df in data.groupby(group_col):
        i = 0
        modality_grouped = group_df.groupby(modality_col)
        for modality in MODALITY_ORDER:
            i += 1
            try:
                modality_df = modality_grouped.get_group(modality)
            except KeyError:
                continue
            x_position = x_base + width * i + width / 2
            try:
                y_position = modality_df[percentage_col].values[0]
                value = modality_df[count_col].values[0]
            except IndexError:
                continue
            formatted = locale.format('%d', value, grouping=True)
            ax.annotate(formatted, (x_position, y_position),
                        textcoords='offset points', xytext=(0, 2),
                        ha='center', va='bottom', fontsize=12)
        x_base += 1


def barplot(modalities_tidy, x=None, y='Percentage of Features', order=None,
            hue='Assigned Modality', **factorplot_kws):
    """Draw the percentage of features per modality, grouped by ``x``"""
    factorplot_kws.setdefault('hue_order', MODALITY_ORDER)
    factorplot_kws.setdefault('palette', MODALITY_PALETTE)
    factorplot_kws.setdefault('size', 3)
    factorplot_kws.setdefault('aspect', 3)
    factorplot_kws.setdefault('linewidth', 1)

    if order is not None and x is None:
        raise ValueError('If specifying "order", "x" must also '
                         'be specified.')
    groupby = [hue]
    groupby_minus_hue = []
    if x is not None:
        groupby = [x] + groupby
        groupby_minus_hue.append(x)
    if 'row' in factorplot_kws:
        groupby = groupby + [factorplot_kws['row']]
        groupby_minus_hue.append(factorplot_kws['row'])
    if 'col' in factorplot_kws:
        groupby = groupby + [factorplot_kws['col']]
        groupby_minus_hue.append(factorplot_kws['col'])

    modality_counts = modalities_tidy.groupby(
        groupby).size().reset_index()
    modality_counts = modality_counts.rename(columns={0: 'Features'})
    if groupby_minus_hue:
        modality_counts[y] = modality_counts.groupby(
            groupby_minus_hue)['Features'].apply(
            lambda c: 100 * c / c.astype(float).sum())
    else:
        modality_counts[y] = 100 * modality_counts['Features'] \
            / modality_counts['Features'].sum()
    if order is not None:
        modality_counts[x] = pd.Categorical(
            modality_counts[x], categories=order,
            ordered=True)
    if x is None:
        x = ''
        modality_counts[x] = x

    g = sns.factorplot(y=y, x=x,
                       hue=hue, kind='bar', data=modality_counts,
                       legend=False, **factorplot_kws)

    # Hacky workaround to add numeric annotations to the plot
    g.map_dataframe(annotate_bars, x, group_col=x,
                    modality_col=hue, count_col='Features',
                    percentage_col=y)
    g.add_legend(label_order=MODALITY_ORDER, title='Modalities')
    for ax in g.axes.flat:
        ax.locator_params('y', nbins=5)
        if ax.is_first_col():
            ax.set(ylabel=y)
    return g
--------------------------------------------------------------------------------
/anchor/bayesian.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.misc import logsumexp

from .names import NEAR_ZERO, NEAR_HALF, NEAR_ONE, BIMODAL, NULL_MODEL
from .model import ModalityModel
from .visualize import MODALITY_TO_CMAP, _ModelLoglikPlotter, MODALITY_ORDER

CHANGING_PARAMETERS = np.arange(2, 21, step=1)


TWO_PARAMETER_MODELS = {
    BIMODAL: {'alphas': 1. / (CHANGING_PARAMETERS + 10),
              'betas': 1. / (CHANGING_PARAMETERS + 10)},
    NEAR_HALF: {'alphas': CHANGING_PARAMETERS,
                'betas': CHANGING_PARAMETERS}}
ONE_PARAMETER_MODELS = {
    NEAR_ZERO: {'alphas': 1, 'betas': CHANGING_PARAMETERS},
    NEAR_ONE: {'alphas': CHANGING_PARAMETERS, 'betas': 1}
}
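# Illustrative note (assuming, as the parameter names suggest, that
# ModalityModel wraps Beta(alpha, beta) distributions): each modality is a
# family of Beta distributions over values in [0, 1]. Beta(1, b) with b >= 2
# piles density near 0 (excluded), Beta(a, 1) near 1 (included), Beta(a, a)
# with a >= 2 near 0.5 (middle), and Beta(a, a) with a < 1 at both extremes
# (bimodal).
#
# >>> from scipy import stats
# >>> stats.beta(1, 20).mean()               # NEAR_ZERO-style, ~0.048
# >>> stats.beta(20, 1).mean()               # NEAR_ONE-style, ~0.952
# >>> stats.beta(1. / 12, 1. / 12).pdf(0.5)  # BIMODAL-style, trough at 0.5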
class BayesianModalities(object):
    """Use Bayesian methods to estimate modalities of splicing events"""

    score_name = r'$\log_2 K$'

    def __init__(self, one_parameter_models=ONE_PARAMETER_MODELS,
                 two_parameter_models=TWO_PARAMETER_MODELS,
                 logbf_thresh=10):
        """Initialize an object with models to estimate splicing modality

        Parameters
        ----------
        one_parameter_models : dict
            A modality name to parameter mapping for the models where only
            one Beta parameter changes
        two_parameter_models : dict
            A modality name to parameter mapping for the models where both
            Beta parameters change
        logbf_thresh : float
            Minimum threshold at which the bayes factor difference is
            defined to be significant
        """
        self.logbf_thresh = logbf_thresh

        self.one_param_models = {k: ModalityModel(**v)
                                 for k, v in one_parameter_models.items()}
        self.two_param_models = {k: ModalityModel(**v)
                                 for k, v in two_parameter_models.items()}
        self.models = self.one_param_models.copy()
        self.models.update(self.two_param_models)

    def _single_feature_logliks_one_step(self, feature, models):
        """Get log-likelihood of models at each parameterization for the data

        Parameters
        ----------
        feature : pandas.Series
            Percent-based values of a single feature. May contain NAs, but
            only non-NA values are used.
        models : dict
            A modality name to ModalityModel mapping

        Returns
        -------
        logliks : pandas.DataFrame
            Log-likelihoods of each model, at each parameterization
        """
        x_non_na = feature[~feature.isnull()]
        if x_non_na.empty:
            return pd.DataFrame()
        else:
            dfs = []
            for name, model in models.items():
                df = model.single_feature_logliks(feature)
                df['Modality'] = name
                dfs.append(df)
            return pd.concat(dfs, ignore_index=True)

    @staticmethod
    def assert_non_negative(x):
        """Ensure all values are non-negative

        Parameters
        ----------
        x : array_like
            A numpy array

        Raises
        ------
        AssertionError
            If any finite value in ``x`` is less than 0
        """
        assert np.all(x[np.isfinite(x)] >= 0)

    @staticmethod
    def assert_less_than_or_equal_1(x):
        """Ensure all values are less than or equal to 1

        Parameters
        ----------
        x : array_like
            A numpy array

        Raises
        ------
        AssertionError
            If any finite value in ``x`` is greater than 1
        """
        assert np.all(x[np.isfinite(x)] <= 1)

    def fit(self, data):
        """Get the modality assignments of each splicing event in the data

        Parameters
        ----------
        data : pandas.DataFrame
            A (n_samples, n_events) dataframe of splicing events' PSI
            scores. Must be PSI scores which range from 0 to 1

        Returns
        -------
        log2_bayes_factors : pandas.DataFrame
            A (n_modalities, n_events) dataframe of the estimated log2
            bayes factor for each splicing event, for each modality

        Raises
        ------
        AssertionError
            If any value in ``data`` falls outside the interval [0, 1]
        """
        self.assert_less_than_or_equal_1(data.values.flat)
        self.assert_non_negative(data.values.flat)

        if isinstance(data, pd.DataFrame):
            log2_bayes_factors = data.apply(self.single_feature_fit)
        elif isinstance(data, pd.Series):
            log2_bayes_factors = self.single_feature_fit(data)
        else:
            raise TypeError('`data` must be a pandas DataFrame or Series')
        log2_bayes_factors.name = self.score_name
        return log2_bayes_factors
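    # A hypothetical sketch of the fit step (event names invented): columns
    # are splicing events, rows are samples, values are PSI scores in
    # [0, 1].
    #
    # >>> import numpy as np
    # >>> import pandas as pd
    # >>> psi = pd.DataFrame({'event1': np.random.beta(1, 20, size=50),
    # ...                     'event2': np.random.beta(0.08, 0.08, size=50)})
    # >>> bm = BayesianModalities()
    # >>> log2bf = bm.fit(psi)  # (n_modalities, n_events) Bayes factors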
    def predict(self, log2_bayes_factors, reset_index=False):
        """Guess the most likely modality for each event

        For each event that has at least one non-NA value, if no modalities
        have logsumexp'd logliks greater than the log Bayes factor
        threshold, then the event is assigned the null model's
        'uncategorized' modality, because we cannot reject the null
        hypothesis that it did not come from the uniform distribution.

        Parameters
        ----------
        log2_bayes_factors : pandas.DataFrame
            A (4, n_events) dataframe with bayes factors for the Psi~1,
            Psi~0, bimodal, and middle modalities. If an event has no bayes
            factors for any of those modalities, it is ignored
        reset_index : bool
            If True, remove the first level of the index from the dataframe.
            Useful if you are using this function to apply to a grouped
            dataframe where the first level is something other than the
            modality, e.g. the celltype

        Returns
        -------
        modalities : pandas.Series
            A (n_events,) series with the most likely modality for each
            event

        """
        if reset_index:
            x = log2_bayes_factors.reset_index(level=0, drop=True)
        else:
            x = log2_bayes_factors
        if isinstance(x, pd.DataFrame):
            not_na = (x.notnull() > 0).any()
            not_na_columns = not_na[not_na].index
            x.ix[NULL_MODEL, not_na_columns] = self.logbf_thresh
        elif isinstance(x, pd.Series):
            x[NULL_MODEL] = self.logbf_thresh
        return x.idxmax()

    def fit_predict(self, data):
        """Convenience function to assign modalities directly from data"""
        return self.predict(self.fit(data))
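    # Continuing the sketch above (hypothetical data): fit_predict chains
    # fit() and predict(), so an event whose best log2 Bayes factor stays at
    # or below ``logbf_thresh`` falls back to the 'uncategorized' null
    # model.
    #
    # >>> assignments = bm.fit_predict(psi)
    # >>> assignments  # e.g. event1 -> 'excluded', event2 -> 'bimodal'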
    def single_feature_logliks(self, feature):
        """Calculate log-likelihoods of each modality's parameterization

        Used for plotting the estimates of a single feature

        Parameters
        ----------
        feature : pandas.Series
            A single feature's values. All values must range from 0 to 1.

        Returns
        -------
        logliks : pandas.DataFrame
            The log-likelihood of the data, for each model, for each
            parameterization

        Raises
        ------
        AssertionError
            If any value in ``feature`` falls outside the interval [0, 1]
        """
        self.assert_less_than_or_equal_1(feature.values)
        self.assert_non_negative(feature.values)

        logliks = self._single_feature_logliks_one_step(
            feature, self.one_param_models)

        logsumexps = self.logliks_to_logsumexp(logliks)

        # If none of the one-parameter models passed, try the two-parameter
        # models
        if (logsumexps <= self.logbf_thresh).all():
            logliks_two_params = self._single_feature_logliks_one_step(
                feature, self.two_param_models)
            logliks = pd.concat([logliks, logliks_two_params])
        return logliks

    @staticmethod
    def logliks_to_logsumexp(logliks):
        return logliks.groupby('Modality')[r'$\log$ Likelihood'].apply(
            logsumexp)

    def single_feature_fit(self, feature):
        """Get the log2 bayes factor of the fit for each modality"""
        if np.isfinite(feature).sum() == 0:
            series = pd.Series(index=MODALITY_ORDER)
        else:
            logbf_one_param = pd.Series(
                {k: v.logsumexp_logliks(feature) for
                 k, v in self.one_param_models.items()})

            # Check if none of the one-parameter models fit
            if (logbf_one_param <= self.logbf_thresh).all():
                logbf_two_param = pd.Series(
                    {k: v.logsumexp_logliks(feature)
                     for k, v in self.two_param_models.items()})
                series = pd.concat([logbf_one_param, logbf_two_param])
                series[NULL_MODEL] = self.logbf_thresh
            else:
                series = logbf_one_param
        series.index.name = 'Modality'
        series.name = self.score_name
        return series

    def plot_single_feature_calculation(self, feature, renamed=''):
        """Plot the log-likelihoods and log2 Bayes factors underlying a
        single feature's modality assignment"""
        if np.isfinite(feature).sum() == 0:
            raise ValueError('The feature has no finite values')
        logliks = self.single_feature_logliks(feature)
        logsumexps = self.logliks_to_logsumexp(logliks)
        logsumexps[NULL_MODEL] = self.logbf_thresh

        plotter = _ModelLoglikPlotter()
        return plotter.plot(feature, logliks, logsumexps, self.logbf_thresh,
                            renamed=renamed)

    def violinplot(self, n=1000, figsize=None, **kwargs):
        r"""Visualize all modality family members with parameters

        Use violinplots to visualize distributions of modality family
        members

        Parameters
        ----------
        n : int
            Number of random variables to generate
        figsize : tuple, optional
            (width, height) of the figure. If not provided, the size is
            chosen from the number of models and parameterizations
        kwargs : dict or keywords
            Any keyword arguments to seaborn.violinplot

        Returns
        -------
        fig : matplotlib.Figure object
            Figure object with violins plotted
        """
        # Define nrows unconditionally so a user-provided figsize also works
        nrows = len(self.models)
        if figsize is None:
            width = max(len(m.rvs)
                        for name, m in self.models.items()) * 0.625
            height = nrows * 2.5
            figsize = width, height
        fig, axes = plt.subplots(nrows=nrows, figsize=figsize)

        for ax, model_name in zip(axes, MODALITY_ORDER):
            try:
                model = self.models[model_name]
            except KeyError:
                continue
            cmap = MODALITY_TO_CMAP[model_name]
            palette = cmap(np.linspace(0, 1, len(model.rvs)))
            model.violinplot(n=n, ax=ax, palette=palette, **kwargs)
            ax.set(title=model_name, xlabel='')
        fig.tight_layout()
        return fig
--------------------------------------------------------------------------------
/logo/v1/logo.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YeoLab/anchor/HEAD/logo/v1/logo.svg
--------------------------------------------------------------------------------