├── doc ├── source │ ├── _static │ │ └── dummy │ ├── sort.rst │ ├── util.rst │ ├── balances.rst │ ├── plot.rst │ ├── cluster.rst │ ├── regression.rst │ └── index.rst ├── Makefile └── README.md ├── gneiss ├── cluster │ ├── tests │ │ ├── __init__.py │ │ ├── data │ │ │ ├── test_metadata.txt │ │ │ ├── tree.qza │ │ │ ├── polytomy.qza │ │ │ ├── weighted.biom.qza │ │ │ ├── test_gradient.biom.qza │ │ │ └── test_composition.biom.qza │ │ └── test_pba.py │ ├── __init__.py │ └── _pba.py ├── regression │ ├── tests │ │ ├── __init__.py │ │ ├── data │ │ │ ├── not-regression.pickle │ │ │ ├── lme.pickle │ │ │ ├── ols.pickle │ │ │ ├── lme_tree.qza │ │ │ ├── ols_tree.qza │ │ │ ├── test_tree.qza │ │ │ ├── lme_balances.qza │ │ │ ├── ols_balances.qza │ │ │ ├── test_lme_composition.qza │ │ │ ├── test_ols_composition.qza │ │ │ ├── lovo.csv │ │ │ ├── lovo2.csv │ │ │ ├── exp_ols_results2.txt │ │ │ ├── exp_lme_results2.txt │ │ │ ├── kfold2.csv │ │ │ ├── exp_lme_results.txt │ │ │ ├── exp_ols_results.txt │ │ │ ├── kfold.csv │ │ │ ├── loo.csv │ │ │ ├── loo2.csv │ │ │ ├── coefficients.csv │ │ │ └── pvalues.csv │ │ ├── test_mixedlm.py │ │ ├── test_model.py │ │ └── test_ols.py │ ├── __init__.py │ └── _model.py ├── __init__.py ├── composition │ ├── __init__.py │ ├── tests │ │ ├── test_variance.py │ │ └── test_composition.py │ ├── _variance.py │ └── _composition.py ├── plot │ ├── __init__.py │ ├── tests │ │ ├── data │ │ │ └── example.nwk │ │ ├── test_radial.py │ │ ├── test_regression_plot.py │ │ ├── test_dendrogram.py │ │ ├── test_heatmap.py │ │ └── test_decompose.py │ └── _radial.py ├── _model.py ├── tests │ ├── test_model.py │ ├── data │ │ └── large_tree.nwk │ ├── test_balances.py │ └── test_sort.py ├── sort.py └── balances.py ├── ci ├── conda_requirements.txt ├── pip_requirements.txt └── environment.yml ├── ipynb ├── images │ ├── Slide1.jpg │ ├── Slide2.jpg │ ├── Slide3.jpg │ └── Slide4.jpg ├── 88soils │ └── 238_otu_table.biom └── cfstudy │ ├── cfstudy_taxonomy.qza │ ├── cfstudy_common.biom.qza │ └── cfstudy-qiime2-tutorial.ipynb ├── MANIFEST.in ├── .coveragerc ├── Makefile ├── .gitignore ├── .travis.yml ├── .github └── workflows │ └── master.yml ├── COPYING.txt ├── README.md ├── setup.py └── CHANGELOG.md /doc/source/_static/dummy: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gneiss/cluster/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gneiss/regression/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/sort.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: gneiss.sort -------------------------------------------------------------------------------- /doc/source/util.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: gneiss.util 2 | -------------------------------------------------------------------------------- /doc/source/balances.rst: -------------------------------------------------------------------------------- 1 | .. 
automodule:: gneiss.balances 2 | -------------------------------------------------------------------------------- /doc/source/plot.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: gneiss.plot 2 | 3 | 4 | -------------------------------------------------------------------------------- /ci/conda_requirements.txt: -------------------------------------------------------------------------------- 1 | pip 2 | biom-format 3 | bokeh=1.1.0 4 | -------------------------------------------------------------------------------- /doc/source/cluster.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: gneiss.cluster 2 | 3 | 4 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/not-regression.pickle: -------------------------------------------------------------------------------- 1 | asdfasdfasdff 2 | -------------------------------------------------------------------------------- /doc/source/regression.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: gneiss.regression 2 | 3 | 4 | -------------------------------------------------------------------------------- /ci/pip_requirements.txt: -------------------------------------------------------------------------------- 1 | coveralls 2 | sphinx 3 | pycodestyle 4 | flake8 5 | -------------------------------------------------------------------------------- /ipynb/images/Slide1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/images/Slide1.jpg -------------------------------------------------------------------------------- /ipynb/images/Slide2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/images/Slide2.jpg -------------------------------------------------------------------------------- /ipynb/images/Slide3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/images/Slide3.jpg -------------------------------------------------------------------------------- /ipynb/images/Slide4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/images/Slide4.jpg -------------------------------------------------------------------------------- /gneiss/cluster/tests/data/test_metadata.txt: -------------------------------------------------------------------------------- 1 | x y 2 | s1 1 a 3 | s2 2 a 4 | s3 3 a 5 | s4 4 a 6 | s5 5 a 7 | -------------------------------------------------------------------------------- /gneiss/cluster/tests/data/tree.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/cluster/tests/data/tree.qza -------------------------------------------------------------------------------- /ipynb/88soils/238_otu_table.biom: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/88soils/238_otu_table.biom -------------------------------------------------------------------------------- /ipynb/cfstudy/cfstudy_taxonomy.qza: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/cfstudy/cfstudy_taxonomy.qza -------------------------------------------------------------------------------- /gneiss/cluster/tests/data/polytomy.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/cluster/tests/data/polytomy.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/lme.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/lme.pickle -------------------------------------------------------------------------------- /gneiss/regression/tests/data/ols.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/ols.pickle -------------------------------------------------------------------------------- /ipynb/cfstudy/cfstudy_common.biom.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/ipynb/cfstudy/cfstudy_common.biom.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/lme_tree.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/lme_tree.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/ols_tree.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/ols_tree.qza -------------------------------------------------------------------------------- /gneiss/cluster/tests/data/weighted.biom.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/cluster/tests/data/weighted.biom.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/test_tree.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/test_tree.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/lme_balances.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/lme_balances.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/ols_balances.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/ols_balances.qza -------------------------------------------------------------------------------- /gneiss/cluster/tests/data/test_gradient.biom.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/cluster/tests/data/test_gradient.biom.qza 
-------------------------------------------------------------------------------- /gneiss/cluster/tests/data/test_composition.biom.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/cluster/tests/data/test_composition.biom.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/test_lme_composition.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/test_lme_composition.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/test_ols_composition.qza: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocore/gneiss/HEAD/gneiss/regression/tests/data/test_ols_composition.qza -------------------------------------------------------------------------------- /gneiss/regression/tests/data/lovo.csv: -------------------------------------------------------------------------------- 1 | ,mse,Rsquared,R2diff 2 | Intercept,4310.47487689949,0.21981627865598752,0.0 3 | x1,4310.4748768994905,0.21981627865598752,0.0 4 | x2,4310.4748768994905,0.21981627865598752,0.0 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include .coveragerc 2 | include CHANGELOG.md 3 | include COPYING.txt 4 | include Makefile 5 | include README.md 6 | 7 | graft gneiss 8 | 9 | global-exclude *.pyc 10 | global-exclude *.pyo 11 | global-exclude .git 12 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/lovo2.csv: -------------------------------------------------------------------------------- 1 | ,mse,Rsquared,R2diff 2 | Intercept,4309.602746314058,0.9949205109438772,5.546379782983557e-05 3 | x1,4305.631922549035,0.994925191132991,5.078360871602072e-05 4 | x2,6910.497401174743,0.9918549810764683,0.0031209936652387693 5 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/exp_ols_results2.txt: -------------------------------------------------------------------------------- 1 | Simplicial Least Squares Results 2 | ============================ 3 | No. Observations 5.0000 4 | Model: OLS 5 | Rsquared: 0.4405 6 | ---------------------------- 7 | c Intercept real 8 | ---------------------------- 9 | Y1 slope 1.60E+00 6.00E-01 10 | Y1 pvalue 1.28E-01 8.05E-02 11 | ============================ 12 | -------------------------------------------------------------------------------- /gneiss/__init__.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | __version__ = "0.4.6" 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # this file is based on the examples provided on scikit-learn's .coveragerc 2 | 3 | [run] 4 | omit = 5 | */tests* 6 | */__init__.py 7 | */gneiss/layouts.py 8 | source = gneiss 9 | branch = True 10 | include = */gneiss/* 11 | 12 | [report] 13 | exclude_lines = 14 | pragma: no cover 15 | raise NotImplementedError 16 | if __name__ == .__main__.: 17 | omit = 18 | */tests* 19 | */__init__.py 20 | */gneiss/layouts.py -------------------------------------------------------------------------------- /gneiss/regression/tests/data/exp_lme_results2.txt: -------------------------------------------------------------------------------- 1 | Simplicial Mixed Linear Model Results 2 | ==================================================== 3 | No. Observations 1600.0000 Model: Simplicial MixedLM 4 | ----------------------------------------------------- 5 | Intercept groups RE x1 x2 6 | ----------------------------------------------------- 7 | Y1 slope 4.21E+00 9.36E-02 1.02E+00 9.25E-01 8 | Y1 pvalue 4.83E-236 4.42E-05 3.97E-35 3.57E-30 9 | ==================================================== 10 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. gneiss documentation master file, created by 2 | sphinx-quickstart on Sat Nov 26 16:35:10 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to gneiss's documentation! 7 | ================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | balances 15 | regression 16 | cluster 17 | plot 18 | sort 19 | util 20 | 21 | 22 | 23 | Indices and tables 24 | ================== 25 | 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | ifeq ($(WITH_COVERAGE), TRUE) 4 | TEST_COMMAND = COVERAGE_FILE=.coverage coverage run --rcfile .coveragerc setup.py nosetests --with-doctest 5 | else 6 | TEST_COMMAND = nosetests --with-doctest 7 | endif 8 | 9 | help: 10 | @echo 'Use "make test" to run all the unit tests and docstring tests.' 11 | @echo 'Use "make pep8" to validate PEP8 compliance.' 12 | @echo 'Use "make html" to create html documentation with sphinx' 13 | @echo 'Use "make all" to run all the targets listed above.' 
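# Example: set the coverage toggle defined above (the same switch .travis.yml
# uses) to run the suite under coverage with the settings in .coveragerc;
# a plain `make test` runs nosetests without coverage:
#   WITH_COVERAGE=TRUE make test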
14 | test: 15 | $(TEST_COMMAND) 16 | pep8: 17 | flake8 gneiss setup.py --ignore E303,E731,E722 18 | 19 | all: pep8 test 20 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/kfold2.csv: -------------------------------------------------------------------------------- 1 | ,model_mse,Rsquared,pred_mse 2 | fold_0,3584.5769364882744,0.9916720125376085,663.109474872684 3 | fold_1,3610.186079514632,0.9928445881359773,428.0519957464969 4 | fold_2,3768.107015449521,0.9930668932082859,197.3834417792511 5 | fold_3,3680.381544872904,0.9936245114219088,279.11469601532144 6 | fold_4,3021.72858660822,0.9948952737882077,960.8363758112072 7 | fold_5,3533.664042171059,0.9939605493886041,445.10100399311614 8 | fold_6,3390.1041044617687,0.993817702859089,581.6366207655954 9 | fold_7,3409.878710170101,0.9930604339509851,588.5085391323519 10 | fold_8,3369.763753775731,0.9915125403479148,709.6491871580773 11 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/exp_lme_results.txt: -------------------------------------------------------------------------------- 1 | Simplicial Mixed Linear Model Results 2 | ==================================================== 3 | No. Observations 1600.0000 Model: Simplicial MixedLM 4 | ----------------------------------------------------- 5 | Intercept groups RE x1 x2 6 | ----------------------------------------------------- 7 | Y1 slope 4.21E+00 9.36E-02 1.02E+00 9.25E-01 8 | Y1 pvalue 4.83E-236 4.42E-05 3.97E-35 3.57E-30 9 | Y2 slope 2.12E-01 9.36E-02 1.02E+00 9.25E-01 10 | Y2 pvalue 9.94E-02 4.42E-05 3.97E-35 3.57E-30 11 | ==================================================== 12 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/exp_ols_results.txt: -------------------------------------------------------------------------------- 1 | Simplicial Least Squares Results 2 | ================================== 3 | No. 
Observations 5.0000 4 | Model: OLS 5 | Rsquared: 0.4405 6 | ---------------------------------- 7 | mse Rsquared R2diff 8 | ---------------------------------- 9 | Intercept 2.1409 0.8916 -0.4511 10 | real 2.1000 0.0000 0.4405 11 | ---------------------------------- 12 | model_mse Rsquared pred_mse 13 | ---------------------------------- 14 | fold_0 0.0000 1.0000 12.5000 15 | fold_1 0.0000 1.0000 14.5000 16 | ================================== 17 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/kfold.csv: -------------------------------------------------------------------------------- 1 | ,model_mse,Rsquared,pred_mse 2 | fold_0,3649.961032819785,0.13217034860627497,483.0402110305879 3 | fold_1,3691.4006801996566,0.2095347670376564,403.8071396736002 4 | fold_2,3830.37833521066,0.15015827888995004,206.67020202108148 5 | fold_3,3729.7569631036204,0.1728096904238252,302.4686128376491 6 | fold_4,3135.1878975392115,0.21496560314322954,897.5468948642697 7 | fold_5,3587.243280769308,0.20003652741163214,453.8238419838613 8 | fold_6,3445.455291051452,0.17658737964869753,599.173109675699 9 | fold_7,3471.7837469746355,0.16790161609540577,604.6536872291142 10 | fold_8,3485.6635258096953,0.12600459767177707,552.1337252734353 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary files 2 | *~ 3 | \#*# 4 | 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Packages 11 | *.egg 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | parts 17 | bin 18 | var 19 | sdist 20 | develop-eggs 21 | .installed.cfg 22 | lib 23 | lib64 24 | __pycache__ 25 | 26 | # Installer logs 27 | pip-log.txt 28 | 29 | # Unit test / coverage reports 30 | .coverage 31 | .tox 32 | nosetests.xml 33 | 34 | # Translations 35 | *.mo 36 | 37 | # Mr Developer 38 | .mr.developer.cfg 39 | .project 40 | .pydevproject 41 | 42 | # vi 43 | .*.swp 44 | 45 | # Sphinx builds 46 | doc/source/generated 47 | 48 | # OSX files 49 | .DS_Store 50 | 51 | # IPythnon checkpoints 52 | .ipynb_checkpoints -------------------------------------------------------------------------------- /gneiss/composition/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Composition functions (:mod:`gneiss.composition`) 3 | =============================================== 4 | 5 | .. currentmodule:: gneiss.composition 6 | 7 | This module contains compositional functions 8 | 9 | Functions 10 | --------- 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | 15 | variation_matrix 16 | 17 | """ 18 | # ---------------------------------------------------------------------------- 19 | # Copyright (c) 2016--, gneiss development team. 20 | # 21 | # Distributed under the terms of the Modified BSD License. 22 | # 23 | # The full license is in the file COPYING.txt, distributed with this software. 24 | # ---------------------------------------------------------------------------- 25 | from ._composition import ilr_transform 26 | from ._variance import variation_matrix 27 | 28 | 29 | __all__ = ["ilr_transform", "variation_matrix"] 30 | -------------------------------------------------------------------------------- /gneiss/plot/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plotting functions (:mod:`gneiss.plot`) 3 | =============================================== 4 | 5 | .. 
currentmodule:: gneiss.plot 6 | 7 | This module contains plotting functionality 8 | 9 | Functions 10 | --------- 11 | 12 | .. autosummary:: 13 | :toctree: generated/ 14 | 15 | heatmap 16 | radialplot 17 | balance_boxplot 18 | balance_barplots 19 | """ 20 | # ---------------------------------------------------------------------------- 21 | # Copyright (c) 2016--, gneiss development team. 22 | # 23 | # Distributed under the terms of the Modified BSD License. 24 | # 25 | # The full license is in the file COPYING.txt, distributed with this software. 26 | # ---------------------------------------------------------------------------- 27 | 28 | from ._heatmap import heatmap 29 | from ._radial import radialplot 30 | from ._decompose import balance_boxplot, balance_barplots, proportion_plot 31 | 32 | 33 | __all__ = ["heatmap", "radialplot", "balance_boxplot", 34 | "balance_barplots", "proportion_plot"] 35 | -------------------------------------------------------------------------------- /gneiss/regression/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Regression functions (:mod:`gneiss.regression`) 3 | =============================================== 4 | 5 | .. currentmodule:: gneiss.regression 6 | 7 | This module contains functions that can convert proportions 8 | to balances for regression analysis 9 | 10 | Functions 11 | --------- 12 | 13 | .. autosummary:: 14 | :toctree: generated/ 15 | 16 | ols 17 | mixedlm 18 | 19 | Classes 20 | ------- 21 | .. autosummary:: 22 | :toctree: generated/ 23 | 24 | OLSModel 25 | LMEModel 26 | 27 | """ 28 | # ---------------------------------------------------------------------------- 29 | # Copyright (c) 2016--, gneiss development team. 30 | # 31 | # Distributed under the terms of the Modified BSD License. 32 | # 33 | # The full license is in the file COPYING.txt, distributed with this software. 34 | # ---------------------------------------------------------------------------- 35 | from ._ols import ols, OLSModel 36 | from ._mixedlm import mixedlm, LMEModel 37 | 38 | 39 | __all__ = ["ols", "OLSModel", "mixedlm", "LMEModel"] 40 | -------------------------------------------------------------------------------- /gneiss/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Clustering functions (:mod:`gneiss.cluster`) 3 | ============================================ 4 | 5 | .. currentmodule:: gneiss.cluster 6 | 7 | This module contains functions to build hierarchical clusterings. 8 | 9 | 10 | Functions 11 | --------- 12 | 13 | .. autosummary:: 14 | :toctree: generated/ 15 | 16 | correlation_linkage 17 | gradient_linkage 18 | rank_linkage 19 | random_linkage 20 | 21 | """ 22 | # ---------------------------------------------------------------------------- 23 | # Copyright (c) 2016--, gneiss development team. 24 | # 25 | # Distributed under the terms of the Modified BSD License. 26 | # 27 | # The full license is in the file COPYING.txt, distributed with this software. 
28 | # ---------------------------------------------------------------------------- 29 | from ._pba import (correlation_linkage, gradient_linkage, 30 | rank_linkage, random_linkage) 31 | 32 | 33 | __all__ = ['correlation_linkage', 'gradient_linkage', 34 | 'rank_linkage', 'random_linkage'] 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Travis yml file inspired by scikit-bio 2 | # Check on http://lint.travis-ci.org/ after modifying it! 3 | sudo: false 4 | language: python 5 | env: 6 | - PYVERSION=3.6 USE_CYTHON=TRUE MAKE_DOC=TRUE 7 | before_install: 8 | - export MPLBACKEND='Agg' 9 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 10 | - chmod +x miniconda.sh 11 | - ./miniconda.sh -b 12 | - export PATH=/home/travis/miniconda3/bin:$PATH 13 | # Update conda itself 14 | - conda update --yes conda 15 | # Useful for debugging any issues with conda 16 | - conda info -a 17 | install: 18 | - conda create --yes -n test_env python=$PYVERSION 19 | - conda install --yes -n test_env --file ci/conda_requirements.txt -c biocore 20 | - conda install --yes -n test_env cython 21 | - source activate test_env 22 | - pip install -r ci/pip_requirements.txt 23 | - pip install -e . 24 | script: 25 | - WITH_COVERAGE=TRUE make all 26 | - if [ ${MAKE_DOC} ]; then make -C doc clean html; fi 27 | after_success: 28 | - coveralls 29 | notifications: 30 | webhooks: 31 | on_success: change 32 | on_failure: always 33 | -------------------------------------------------------------------------------- /.github/workflows/master.yml: -------------------------------------------------------------------------------- 1 | # much of this is taken from the Empress main.yml file 2 | name: gneiss CI 3 | 4 | on: 5 | pull_request: 6 | branches: 7 | - master 8 | push: 9 | branches: 10 | - master 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | with: 23 | persist-credentials: false 24 | fetch-depth: 0 25 | 26 | - uses: conda-incubator/setup-miniconda@v2 27 | with: 28 | activate-environment: gneiss 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Test conda installation 32 | shell: bash -l {0} 33 | run: conda info 34 | 35 | - name: Install conda packages 36 | shell: bash -l {0} 37 | run: conda install -c conda-forge statsmodels scikit-bio biom-format matplotlib flake8 38 | 39 | - name: Install gneiss 40 | shell: bash -l {0} 41 | run: pip install -e .[dev] 42 | 43 | - name: Run tests 44 | shell: bash -l {0} 45 | run: make all 46 | -------------------------------------------------------------------------------- /gneiss/plot/tests/data/example.nwk: -------------------------------------------------------------------------------- 1 | 
((((y15:0.200853,(y31:0.42924,(y47:0.914445,y48:0.837693)y32:0.266535)y16:0.327741)y7:0.743731,((y33:0.0653228,y34:0.0404005)y17:0.0864676,((y49:0.26801,(y67:0.340285,(y85:0.135346,y86:0.55117)y68:0.291215)y50:0.332233)y35:0.370523,((y69:0.38023,y70:0.991233)y51:0.647156,(y71:0.615186,y72:0.781904)y52:0.168594)y36:0.732766)y18:0.663758)y8:0.404288)y3:0.591153,((y19:0.565967,y20:0.952246)y9:0.539617,(y21:0.459132,y22:0.269279)y10:0.86029)y4:0.102227)y1:0.569204,(((y23:0.38369,y24:0.856949)y11:0.939149,(y25:0.518678,(y37:0.569999,(y53:0.414425,(y73:0.458147,y74:0.027975)y54:0.00158475)y38:0.199839)y26:0.561358)y12:0.381204)y5:0.472245,(((y39:0.861009,(y55:0.0324591,(y75:0.01456,y76:0.755587)y56:0.94357)y40:0.798439)y27:0.527629,((y57:0.344423,y58:0.0695154)y41:0.230867,(y59:0.656657,(y77:0.473771,y78:0.0236346)y60:0.648203)y42:0.218781)y28:0.763701)y13:0.432767,((y43:0.258421,(y61:0.81704,y62:0.0208181)y44:0.253458)y29:0.41618,((y63:0.445669,(y79:0.223196,(y87:0.659824,y88:0.426299)y80:0.648506)y64:0.506309)y45:0.12089,((y81:0.875534,(y89:0.743842,y90:0.416172)y82:0.306387)y65:0.507717,((y91:0.590584,y92:0.21759)y83:0.846197,(y93:0.377969,(y95:0.591409,(y97:0.0172002,y98:0.612128)y96:0.492351)y94:0.346931)y84:0.505284)y66:0.910185)y46:0.332695)y30:0.91627)y14:0.76228)y6:0.379615)y2:0.802265)y0; 2 | -------------------------------------------------------------------------------- /COPYING.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016--, gneiss development team. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the names scikit-bio, skbio, or biocore nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /gneiss/composition/tests/test_variance.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 
5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import numpy as np 9 | import pandas as pd 10 | import unittest 11 | from skbio import DistanceMatrix 12 | from skbio.util import get_data_path 13 | from gneiss.composition._variance import variation_matrix 14 | 15 | 16 | class TestVariationMatrix(unittest.TestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def test_varmat1(self): 21 | X = pd.DataFrame({'x': np.arange(1, 10), 22 | 'y': np.arange(2, 11)}) 23 | res = variation_matrix(X) 24 | exp = DistanceMatrix([[0, 0.032013010420979787 / 2], 25 | [0.032013010420979787 / 2, 0]], ids=['x', 'y']) 26 | self.assertEqual(str(res), str(exp)) 27 | 28 | def test_varmat_larg(self): 29 | np.random.seed(123) 30 | D = 50 31 | N = 100 32 | mean = np.ones(D) * 10 33 | cov = np.eye(D) 34 | n__ = np.random.multivariate_normal(mean, cov, size=N) 35 | X = pd.DataFrame(np.abs(n__), columns=np.arange(D).astype(np.str)) 36 | res = variation_matrix(X) 37 | 38 | exp = DistanceMatrix.read(get_data_path('exp_varmat.txt')) 39 | self.assertEqual(str(res), str(exp)) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /gneiss/composition/tests/test_composition.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | import numpy as np 10 | import pandas as pd 11 | from gneiss.composition._composition import ilr_transform 12 | from gneiss.cluster import gradient_linkage 13 | import pandas.util.testing as pdt 14 | 15 | 16 | class TestILRTransform(unittest.TestCase): 17 | 18 | def test_ilr(self): 19 | np.random.seed(0) 20 | table = pd.DataFrame([[1, 1, 2, 2], 21 | [1, 2, 2, 1], 22 | [2, 2, 1, 1]], 23 | index=[1, 2, 3], 24 | columns=['a', 'b', 'c', 'd']) 25 | table = table.reindex(columns=np.random.permutation(table.columns)) 26 | ph = pd.Series([1, 2, 3], index=table.index) 27 | tree = gradient_linkage(table, ph) 28 | res_balances = ilr_transform(table, tree) 29 | exp_balances = pd.DataFrame( 30 | [[0.693147, -5.551115e-17, 2.775558e-17], 31 | [0.000000, -4.901291e-01, -4.901291e-01], 32 | [-0.693147, 5.551115e-17, -2.775558e-17]], 33 | columns=['y0', 'y1', 'y2'], 34 | index=[1, 2, 3]) 35 | pdt.assert_frame_equal(res_balances, exp_balances) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /gneiss/_model.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import abc 9 | import numpy as np 10 | 11 | 12 | class Model(metaclass=abc.ABCMeta): 13 | 14 | def __init__(self, Y, Xs): 15 | """ 16 | Abstract container for balance models. 17 | 18 | Parameters 19 | ---------- 20 | Y : pd.DataFrame 21 | Response matrix. This is the matrix being predicted. 22 | Also known as the dependent variable in univariate analysis. 23 | Xs : iterable of pd.DataFrame 24 | Design matrices. Also known as the independent variables 25 | in univariate analysis. Note that this allows multiple 26 | design matrices to be supplied to enable multiple data block 27 | analysis. 28 | """ 29 | self.response_matrix = Y 30 | self.design_matrices = Xs 31 | 32 | @abc.abstractmethod 33 | def fit(self, **kwargs): 34 | pass 35 | 36 | @abc.abstractmethod 37 | def summary(self): 38 | """ Print summary results """ 39 | pass 40 | 41 | def percent_explained(self): 42 | """ Proportion of total variance explained by each principal balance.""" 43 | # Using sum of squares error calculation (df=1) 44 | # instead of population variance (df=0). 45 | axis_vars = np.var(self.response_matrix, ddof=1, axis=0) 46 | return axis_vars / axis_vars.sum() 47 | -------------------------------------------------------------------------------- /gneiss/composition/_variance.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import numpy as np 9 | from skbio import DistanceMatrix 10 | from skbio.stats.composition import closure 11 | 12 | 13 | def variation_matrix(X): 14 | r""" Calculate Aitchison variation matrix. 15 | 16 | This calculates the Aitchison variation matrix. Given a compositional 17 | matrix :math:`X`, and columns :math:`i` and :math:`j`, the :math:`ij` entry 18 | in the variation matrix of :math:`X` is given by 19 | 20 | .. math:: 21 | V_{ij} = \frac{1}{2} \operatorname{var}\left(\ln \frac{x_i}{x_j}\right) 22 | 23 | Parameters 24 | ---------- 25 | X : pd.DataFrame 26 | Contingency table with n rows corresponding to samples 27 | and p columns corresponding to features. 28 | 29 | Returns 30 | ------- 31 | skbio.DistanceMatrix 32 | Variation matrix of size p x p (one entry per pair of features). 33 | 34 | References 35 | ---------- 36 | .. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015), 37 | Modeling and Analysis of Compositional Data, Wiley, Chichester, UK 38 | 39 | .. [2] J. J. Egozcue, V. Pawlowsky-Glahn (2004), Groups of Parts and 40 | Their Balances in Compositional Data Analysis, Mathematical Geology 41 | """ 42 | v = np.zeros((X.shape[1], X.shape[1])) 43 | x = closure(X) 44 | for i in range(X.shape[1]): 45 | for j in range(i): 46 | v[i, j] = np.var(np.log(x[:, i]) - np.log(x[:, j])) 47 | # Make the matrix symmetric, since V(ln(x/y)) = V(ln(y/x)). 48 | # Also divide by 2 to ensure unit norm for balances. 
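    # Illustrative check on a hypothetical 2-sample, 2-feature input (not
    # executed): for X = [[1, 2], [2, 8]], ln(x_0/x_1) equals
    # [ln(1/2), ln(1/4)] across the two samples (closure leaves the ratios
    # unchanged), so var(ln(x_0/x_1)) = (ln 2)^2 / 4 and the returned
    # V[0, 1] is (ln 2)^2 / 8 once halved.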
49 | # See Eqn 4 in [2] 50 | return DistanceMatrix((v + v.T) / 2, ids=X.columns) 51 | -------------------------------------------------------------------------------- /gneiss/composition/_composition.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | import skbio 10 | from skbio.stats.composition import ilr 11 | from gneiss.balances import balance_basis 12 | from gneiss.util import match_tips 13 | 14 | 15 | def ilr_transform(table: pd.DataFrame, tree: skbio.TreeNode) -> pd.DataFrame: 16 | """Perform the isometric log-ratio (ilr) transformation on a feature table. 17 | 18 | This creates a new table with balances (groups of features) that 19 | distinguish samples. Zeros must first be removed from the table 20 | (e.g. by adding a pseudocount). This docstring follows the numpydoc format: 21 | https://numpydoc.readthedocs.io/en/latest/ 22 | 23 | Parameters 24 | ---------- 25 | table : pd.DataFrame 26 | Dataframe of the feature table where rows correspond to samples 27 | and columns are features. The values within the table must be 28 | positive and nonzero. 29 | tree : skbio.TreeNode 30 | A tree relating all of the features to balances or 31 | log-contrasts (i.e. a hierarchy). This tree must be bifurcating 32 | (i.e. every internal node has exactly 2 children). The internal nodes of the tree 33 | will be renamed. 34 | 35 | Returns 36 | ------- 37 | balances : pd.DataFrame 38 | Balances calculated from the feature table. Each balance represents 39 | the log-ratio between the two groups of features below the corresponding internal node. 
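
    Notes
    -----
    A minimal usage sketch; the table and tree below are hypothetical,
    with four features related by a bifurcating hierarchy whose named
    internal nodes (``y0``, ``y1``, ``y2``) label the resulting balances::

        import pandas as pd
        from skbio import TreeNode
        from gneiss.composition import ilr_transform

        # two samples, four strictly positive features
        table = pd.DataFrame([[1, 2, 3, 4],
                              [4, 3, 2, 1]],
                             index=['s1', 's2'],
                             columns=['a', 'b', 'c', 'd'])
        tree = TreeNode.read(['((a,b)y1,(c,d)y2)y0;'])
        balances = ilr_transform(table, tree)
        # -> DataFrame indexed by ['s1', 's2'] with columns ['y0', 'y1', 'y2']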
40 | """ 41 | _table, _tree = match_tips(table, tree) 42 | basis, _ = balance_basis(_tree) 43 | balances = ilr(_table.values, basis) 44 | in_nodes = [n.name for n in _tree.levelorder() if not n.is_tip()] 45 | return pd.DataFrame(balances, 46 | columns=in_nodes, 47 | index=table.index) 48 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/loo.csv: -------------------------------------------------------------------------------- 1 | ,model_mse,pred_mse 2 | 0,4260.496054701282,54.18622304652599 3 | 1,4249.729540651938,65.5315803644453 4 | 2,4308.381781538387,2.2473041294562393 5 | 3,4171.3649026828525,148.68431600602923 6 | 4,4229.0766673909075,86.62702948774607 7 | 5,4116.071957191982,206.04849363338943 8 | 6,4291.356272273773,20.185811261941183 9 | 7,4247.848387235389,65.88182699812373 10 | 8,4280.951200463445,30.952001700594646 11 | 9,4308.791284413965,1.7593744234763966 12 | 10,4296.811286205901,14.235755262856841 13 | 11,4266.038114539728,46.16820874785575 14 | 12,4294.513406076312,16.54048715628344 15 | 13,4198.80916007727,115.44075421081786 16 | 14,4299.933519918486,10.873972613900444 17 | 15,4308.962558527042,1.5569522496643602 18 | 16,4146.048139786893,168.97881959977258 19 | 17,4271.59957874892,39.888553605911454 20 | 18,4283.214605760981,27.93225479528788 21 | 19,4240.267113131621,71.85358170168348 22 | 20,3865.3123050881723,455.1507540244841 23 | 21,4298.985490333344,11.737955025509407 24 | 22,4212.7032680853445,99.82809146668683 25 | 23,4195.950083899159,116.88780700999078 26 | 24,4072.6177711303317,242.71727889272387 27 | 25,4159.708341918885,153.8471722349595 28 | 26,4270.2705100334715,41.033911977299155 29 | 27,4287.618745208718,23.3368769592242 30 | 28,4189.413285610791,123.68071225408006 31 | 29,4201.68539487846,111.23040863550231 32 | 30,4225.484906018272,86.9823149007921 33 | 31,4271.267305724941,40.17406365429842 34 | 32,4271.840582007024,39.64126877885141 35 | 33,3977.404975585566,342.2907962328493 36 | 34,4223.808909462919,89.22378741677774 37 | 35,4162.58769041999,152.55352974220145 38 | 36,4162.168549226462,153.32005926284594 39 | 37,4194.701126330643,119.97354476493928 40 | 38,4211.112602919985,103.23385330388982 41 | 39,4283.057863114158,28.56510466677602 42 | 40,4251.739438646766,61.37924034589412 43 | 41,4211.337569965258,103.93346843059663 44 | 42,4213.677952181935,101.82844803208135 45 | 43,4004.6251255020197,322.9223825214111 46 | 44,4309.593365853613,0.9343173620906449 47 | 45,4307.59891842044,3.0607029500315304 48 | 46,4265.732051685689,47.82228161965757 49 | 47,4136.159353565893,187.15821681904137 50 | 48,4238.3420772679465,77.81628431034508 51 | 49,4302.189190917612,8.983205465045119 52 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/loo2.csv: -------------------------------------------------------------------------------- 1 | ,model_mse,pred_mse 2 | 0,4210.254252226571,62.7244814276346 3 | 1,4211.637544078904,59.28731330907396 4 | 2,4261.16964292544,1.5632108577163226 5 | 3,4120.227706392463,158.4176551220517 6 | 4,4194.81383810487,74.12893770610398 7 | 5,4062.59080988408,215.8389185989188 8 | 6,4249.074649016179,14.3801085059596 9 | 7,4201.031230679247,65.0913132126485 10 | 8,4230.052330598066,34.14975753720326 11 | 9,4260.674247388393,1.956655442219737 12 | 10,4248.808610693286,14.312123274205462 13 | 11,4217.133108158621,47.20095190398427 14 | 12,4247.62638198413,15.485331316173468 15 | 13,4158.5575171719565,107.87520269124951 16 
| 14,4248.736427270089,14.326680157044192 17 | 15,4262.180211177333,0.37917285947764945 18 | 16,4102.386908392766,166.41818121291553 19 | 17,4219.178090822697,45.11498285380986 20 | 18,4242.171042180773,21.222511951309993 21 | 19,4195.86950494634,69.54045197741316 22 | 20,3817.4178283650426,464.80939115822355 23 | 21,4243.375657832784,20.038443946057576 24 | 22,4139.460083408845,128.7706636543021 25 | 23,4167.699582287961,99.28472513339455 26 | 24,3993.4915035994995,281.73012807025697 27 | 25,4129.178914240647,139.64996281723694 28 | 26,4232.6815844335015,31.26152893262742 29 | 27,4248.3445753384085,14.856826916275455 30 | 28,4130.719920325524,137.79891120579325 31 | 29,4139.821025182945,128.15082264058097 32 | 30,4188.892773390983,76.81691339553873 33 | 31,4210.650307206859,54.0553002064334 34 | 32,4224.565303799641,39.51067743285984 35 | 33,3950.9211117494106,323.80385337396444 36 | 34,4165.780677853521,100.45174573187475 37 | 35,4101.517281439408,167.06442738984504 38 | 36,4104.628998644643,163.81966431619293 39 | 37,4145.145229741029,121.85610388044178 40 | 38,4165.6365980300525,100.72568307909934 41 | 39,4235.22250796671,28.467179798193584 42 | 40,4201.176969556616,64.170451061669 43 | 41,4163.859850749393,103.71697665477915 44 | 42,4158.274885579024,110.33395650500965 45 | 43,3970.6676233926455,311.57982208008514 46 | 44,4259.023597799898,3.801644051465528 47 | 45,4261.765412215231,0.8537315788551669 48 | 46,4196.769182277087,73.21731835996582 49 | 47,4119.239750993762,162.82391090276036 50 | 48,4153.57187529307,126.91052991408179 51 | 49,4249.5788963444775,15.553691413241404 52 | -------------------------------------------------------------------------------- /gneiss/tests/test_model.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | import statsmodels.formula.api as smf 10 | from skbio import TreeNode 11 | from gneiss._model import Model 12 | import unittest 13 | import os 14 | import pandas.util.testing as pdt 15 | 16 | 17 | # create some mock classes for testing 18 | class submock_ok(Model): 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | 22 | def summary(self): 23 | print("OK!") 24 | 25 | def fit(self, **kwargs): 26 | pass 27 | 28 | 29 | class submock_bad(Model): 30 | def __init__(self, **kwargs): 31 | super(Model, self, **kwargs) 32 | 33 | 34 | class TestModel(unittest.TestCase): 35 | def setUp(self): 36 | self.pickle_fname = "test.pickle" 37 | self.data = pd.DataFrame([[1, 1, 1], 38 | [3, 2, 3], 39 | [4, 3, 2], 40 | [5, 4, 4], 41 | [2, 5, 3], 42 | [3, 6, 5], 43 | [4, 7, 4]], 44 | index=['s1', 's2', 's3', 's4', 45 | 's5', 's6', 's7'], 46 | columns=['Y1', 'Y2', 'X']) 47 | 48 | self.model1 = smf.ols(formula="Y1 ~ X", data=self.data) 49 | self.model2 = smf.ols(formula="Y2 ~ X", data=self.data) 50 | 51 | self.basis = pd.DataFrame([[0.80442968, 0.19557032]], 52 | index=['a'], 53 | columns=['x', 'y']) 54 | self.tree = TreeNode.read(['(x, y)a;']) 55 | self.balances = pd.DataFrame({'a': [-1, 0, 1]}) 56 | self.metadata = pd.DataFrame( 57 | [[1], [3], [2]], 58 | columns=['X']) 59 | 60 | def tearDown(self): 61 | if os.path.exists(self.pickle_fname): 62 | os.remove(self.pickle_fname) 63 | 64 | def test_init(self): 65 | res = submock_ok(Y=self.balances, Xs=self.metadata) 66 | 67 | # check balances 68 | pdt.assert_frame_equal(self.balances, res.response_matrix) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /gneiss/tests/data/large_tree.nwk: -------------------------------------------------------------------------------- 1 | 
(((((((1122517:0.06882,((252012:0.00548,588042:0.03795)45:0.00078,1121144:0.00648)34:0.0202)22:0.02416,(3330572:0.10987,279138:0.06878)23:0.01632)13:0.05558,(((75371:0.0859,214611:0.07975)35:0.00449,(143135:0.09553,356045:0.1125)36:0.00595)24:0.01277,160908:0.07333)14:0.105)7:0.07707,(((112795:0.09046,357011:0.13893)25:0.07603,4447334:0.25156)15:0.00593,((732929:0.14242,831289:0.16867)26:0.00658,(((1638797:0.02978,(((792450:0.04478,4681:0.02314)80:0.00634,((83531:0.02222,213177:0.01197)93:0.00769,(215692:0.01593,319907:0.05481)94:0.01631)81:0.00808)69:0.00336,208293:0.01894)57:0.01132)46:0.05063,((1806981:0.01642,(523224:0.03288,((148783:0.01871,146397:0.02416)95:0.00014,(148890:0.01623,146676:0.01235)96:0.0011)82:0.00547)70:0.01097)58:0.00721,(222209:0.0811,216805:0.02552)59:0.05218)47:0.01742)37:0.02315,((1137157:0.00854,1139779:0.01691)48:0.00141,(4362556:0.02248,4416927:0.00937)49:0.01111)38:0.07136)27:0.07544)16:0.01742)8:0.0766)4:0.02393,136959:0.21412)2:0.01293,((((1094976:0.27174,(2601820:0.12574,1124701:0.03443)28:0.21253)17:0.00958,(171768:0.31195,(211848:0.24113,845780:0.16002)29:0.0206)18:0.02439)9:0.25276,(3431064:0.06261,4423681:0.00907)10:0.39256)5:0.04968,((3749019:0.23613,(((1876538:0.09481,((((4468200:0.02519,((1146003:0.01465,1142972:0.00893)104:0.01089,1122202:0.01897)97:0.00342)83:0.01099,(216549:0.00841,(607006:0.02324,663880:0.02481)98:0.01213)84:0.01983)71:0.00654,(((4440638:0.00637,((697997:0.01101,4346060:0.02106)109:0.00484,804187:0.01941)105:0.00823)99:0.0033,1108390:0.02694)85:0.01253,3639039:0.0138)72:0.01591)60:0.03741,4440611:0.02193)50:0.045)39:0.05434,572134:0.13989)30:0.05905,((((((((850823:0.01423,1123984:0.01446)106:0.00333,(2578357:0.00999,(1024089:0.0153,(256536:0.00389,746927:0.0085)111:0.02256)110:0.01297)107:0.00745)100:0.00982,242467:0.00942)86:0.00283,224043:0.00904)73:0.06305,(209803:0.05809,(1147699:0.00958,78839:0.01539)87:0.01255)74:0.02145)61:0.05437,(203969:0.01049,113212:0.01718)62:0.07463)51:0.09163,((((766178:0.00642,156065:0.02144)88:0.00252,3616127:0.01843)75:0.01515,205391:0.08328)63:0.01478,(843189:0.05067,2867534:0.04027)64:0.05349)52:0.12384)40:0.02845,(((512006:0.01707,(1130478:0.01127,((742260:0.0579,4440396:0.03928)101:0.01357,2285453:0.01427)89:0.00474)76:0.05994)65:0.01083,(156611:0.04171,151283:0.03279)66:0.01435)53:0.01822,(((((4409771:0.0131,4450823:0.01556)102:0.01771,4367783:0.0177)90:0.0423,(1127423:0.01089,((1104509:0.00653,4424782:0.0305)108:0.00209,154494:0.0309)103:0.02042)91:0.01431)77:0.00614,(211129:0.08219,(4478794:0.01191,1129210:0.01509)92:0.04495)78:0.00774)67:0.01808,((154519:0.0024,4341561:0.04065)79:0.01119,4351648:0.01519)68:0.03589)54:0.01789)41:0.03913)31:0.03029)19:0.12182)11:0.01264,((((255018:0.01859,4466061:0.01239)42:0.0169,825937:0.0207)32:0.02137,1123837:0.05322)20:0.2196,((((364805:0.00924,4470139:0.00877)55:0.0024,154567:0.05202)43:0.0347,(223583:0.14375,(834883:0.00959,592291:0.00967)56:0.09018)44:0.02203)33:0.04164,238800:0.0929)21:0.01988)12:0.18772)6:0.01563)3:0.02085)1:0.0212,4322321:0.2845)0:0.25729; 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gneiss 2 | 3 | [![Build Status](https://travis-ci.org/biocore/gneiss.png?branch=master)](https://travis-ci.org/biocore/gneiss) 4 | [![Coverage Status](https://coveralls.io/repos/biocore/gneiss/badge.svg)](https://coveralls.io/r/biocore/gneiss) 5 | 
[![Gitter](https://badges.gitter.im/biocore/gneiss.svg)](https://gitter.im/biocore/gneiss?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) 6 | 7 | Canonically pronounced *nice* 8 | 9 | 10 | gneiss is a compositional data analysis and visualization toolbox designed for analyzing high-dimensional proportions. See [here](https://biocore.github.io/gneiss/) for API documentation. 11 | 12 | Note that gneiss is not compatible with Python 2; it requires Python 3.4 or later. 13 | gneiss is currently in alpha. We are actively developing it, and __backward-incompatible interface changes may arise__. 14 | 15 | # Installation 16 | 17 | To install this package, it is recommended to use conda. First make sure that the appropriate channels are configured. 18 | 19 | ``` 20 | conda config --add channels https://conda.anaconda.org/bioconda 21 | conda config --add channels https://conda.anaconda.org/biocore 22 | conda config --add channels https://conda.anaconda.org/qiime2 23 | conda config --add channels https://conda.anaconda.org/qiime2/label/r2017.6 24 | ``` 25 | 26 | Then gneiss can be installed in a conda environment as follows: 27 | ``` 28 | conda create -n gneiss_env gneiss 29 | ``` 30 | To install the most up-to-date version of gneiss, run the following command: 31 | 32 | ``` 33 | pip install git+https://github.com/biocore/gneiss.git 34 | ``` 35 | 36 | # Tutorials 37 | 38 | * [What are balances](https://github.com/biocore/gneiss/blob/master/ipynb/balance_trees.ipynb) 39 | 40 | # QIIME 2 tutorials 41 | 42 | * [Linear regression on balances in the 88 soils](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/qiime2/88soils-qiime2-tutorial.html) 43 | * [Linear mixed effects models on balances in a CF study](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/qiime2/cfstudy-qiime2-tutorial.html) 44 | * [Linear regression on balances in the Chronic Fatigue Syndrome study](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/qiime2/cfs-qiime2-tutorial.html) 45 | 46 | # Python tutorials 47 | 48 | * [Linear regression on balances in the 88 soils](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/python/88soils-python-tutorial.html) 49 | * [Linear mixed effects models on balances in a CF study](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/python/cfstudy-python-tutorial.html) 50 | * [Linear regression on balances in the Chronic Fatigue Syndrome study](https://biocore.github.io/gneiss/docs/v0.4.0/tutorials/python/cfs-python-tutorial.html) 51 | 52 | 53 | If you use this software package in your own publications, please cite it as: 54 | ``` 55 | Morton JT, Sanders J, Quinn RA, McDonald D, Gonzalez A, Vázquez-Baeza Y, 56 | Navas-Molina JA, Song SJ, Metcalf JL, Hyde ER, Lladser M, Dorrestein PC, 57 | Knight R. 2017. Balance trees reveal microbial niche differentiation. 58 | mSystems 2:e00162-16. https://doi.org/10.1128/mSystems.00162-16. 59 | ``` 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---------------------------------------------------------------------------- 4 | # Copyright (c) 2016--, gneiss development team. 5 | # 6 | # Distributed under the terms of the Modified BSD License. 7 | # 8 | # The full license is in the file COPYING.txt, distributed with this software. 
9 | # ---------------------------------------------------------------------------- 10 | 11 | import re 12 | import ast 13 | import os 14 | 15 | from setuptools import find_packages, setup 16 | from setuptools.command.build_ext import build_ext as _build_ext 17 | 18 | 19 | class build_ext(_build_ext): 20 | def finalize_options(self): 21 | _build_ext.finalize_options(self) 22 | # Prevent numpy from thinking it is still in its setup process: 23 | __builtins__.__NUMPY_SETUP__ = False 24 | import numpy 25 | self.include_dirs.append(numpy.get_include()) 26 | 27 | 28 | # Dealing with Cython 29 | USE_CYTHON = os.environ.get('USE_CYTHON', False) 30 | ext = '.pyx' if USE_CYTHON else '.c' 31 | 32 | extensions = [ 33 | ] 34 | 35 | if USE_CYTHON: 36 | from Cython.Build import cythonize 37 | extensions = cythonize(extensions) 38 | 39 | classes = """ 40 | Development Status :: 4 - Beta 41 | License :: OSI Approved :: BSD License 42 | Topic :: Software Development :: Libraries 43 | Topic :: Scientific/Engineering 44 | Topic :: Scientific/Engineering :: Bio-Informatics 45 | Programming Language :: Python :: 3 46 | Programming Language :: Python :: 3 :: Only 47 | Operating System :: Unix 48 | Operating System :: POSIX 49 | Operating System :: MacOS :: MacOS X 50 | """ 51 | classifiers = [s.strip() for s in classes.split('\n') if s] 52 | 53 | description = ('Compositional data analysis tools and visualizations') 54 | 55 | with open('README.md') as f: 56 | long_description = f.read() 57 | 58 | 59 | # version parsing from __init__ pulled from Flask's setup.py 60 | # https://github.com/mitsuhiko/flask/blob/master/setup.py 61 | _version_re = re.compile(r'__version__\s+=\s+(.*)') 62 | 63 | with open('gneiss/__init__.py', 'rb') as f: 64 | hit = _version_re.search(f.read().decode('utf-8')).group(1) 65 | version = str(ast.literal_eval(hit)) 66 | 67 | setup(name='gneiss', 68 | version=version, 69 | license='BSD', 70 | description=description, 71 | long_description=long_description, 72 | long_description_content_type='text/markdown', 73 | author="gneiss development team", 74 | author_email="jamietmorton@gmail.com", 75 | maintainer="gneiss development team", 76 | maintainer_email="jamietmorton@gmail.com", 77 | packages=find_packages(), 78 | setup_requires=['numpy >= 1.15.3'], 79 | ext_modules=extensions, 80 | cmdclass={'build_ext': build_ext}, 81 | install_requires=[ 82 | 'IPython >= 3.2.0', 83 | 'matplotlib >= 1.4.3', 84 | 'numpy >= 1.15.3', 85 | 'pandas >= 0.18.0', 86 | 'scipy >= 0.15.1', 87 | 'nose >= 1.3.7', 88 | 'scikit-bio >= 0.5.5', 89 | 'statsmodels>=0.8.0', 90 | 'biom-format', 91 | 'seaborn', 92 | 'bokeh==1.1.0' 93 | ], 94 | classifiers=classifiers, 95 | package_data={}) 96 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # gneiss changelog 2 | 3 | ## Version 0.4.5 4 | * Pandas dependencies have been updated [#282](https://github.com/biocore/gneiss/pull/282) 5 | 6 | ## Version 0.4.4 7 | * Numpy and scikit-bio dependencies have been updated [#278](https://github.com/biocore/gneiss/pull/278) 8 | 9 | ## Version 0.4.3 10 | * Enabling direct download of fdr corrected pvalues 11 | * Adding in sparse version of the ilr transform utilizing COO-formated sparse matrices [#250](https://github.com/biocore/gneiss/pull/250) 12 | * Adding in sparse utilities for matching biom tables [#253](https://github.com/biocore/gneiss/pull/253) 13 | 14 | 15 | ## Version 0.4.2 16 | * Added 
`proportion_plot` to plot the mean proportions within a single balance [#234](https://github.com/biocore/gneiss/pull/234) 17 | 18 | ## Version 0.4.1 19 | * Added colorbar for heatmap 20 | * Decoupled qiime2 from gneiss. All qiime2 commands have now been ported to [q2-gneiss](https://github.com/qiime2/q2-gneiss) 21 | 22 | ## Version 0.4.0 23 | * Accelerated the ordinary least squares regression 24 | * Improved summary statistics and cross validation in ordinary least squares regression 25 | * Improved summary visualizations for OLS and MixedLM 26 | 27 | ## Version 0.3.2 28 | * Added `balance_boxplot` and `balance_barplot` to make interpreting balance partitions easier. 29 | * Added `balance_summary` to summarize a given balance using the q2 CLI. 30 | * Added `assign_ids` command to allow ids to be added manually. 31 | 32 | ## Version 0.3.0 33 | * Added q2 support for linear regression and linear mixed effects models [#98](https://github.com/biocore/gneiss/pull/98) 34 | * Added q2 support for hierarchical clustering [#116](https://github.com/biocore/gneiss/pull/116) 35 | * Added interactive heatmaps with highlights using matplotlib [#114](https://github.com/biocore/gneiss/pull/114) 36 | * Added tree visualizations for unrooted trees with bokeh [#112](https://github.com/biocore/gneiss/pull/112) 37 | * Added support for cross validation in ordinary least squares [#101](https://github.com/biocore/gneiss/pull/101) 38 | 39 | ## Version 0.2.1 40 | * Added heatmap dendrogram plotting functionality [#87](https://github.com/biocore/gneiss/issues/87) 41 | * Added a principal balance analysis heuristic using proportionality and Ward's clustering algorithm [#83](https://github.com/biocore/gneiss/issues/83) 42 | 43 | ## Version 0.2.0 44 | 45 | ### Features 46 | * Added filehandle support for write and read IO in the RegressionResults object [#77](https://github.com/biocore/gneiss/issues/77) 47 | 48 | 49 | ## Version 0.1.3 50 | 51 | ### Features 52 | * Added write and read IO for the RegressionResults object [#72](https://github.com/biocore/gneiss/issues/72) 53 | 54 | ## Version 0.1.2 55 | 56 | ### Features 57 | * Added `ladderize` and `gradient_sort` [#29](https://github.com/biocore/gneiss/issues/29) 58 | 59 | ### Bug fixes 60 | 61 | 62 | ## Version 0.0.2 63 | 64 | ### Features 65 | * Added statsmodels inference [#22](https://github.com/biocore/gneiss/pull/22) 66 | * Added support for ordinary least squares regression [#33](https://github.com/biocore/gneiss/pull/33) 67 | * Added support for linear mixed effects models [#38](https://github.com/biocore/gneiss/pull/38) 68 | * Added RegressionResults object to summarize statistics from statistical analyses 69 | * Added a niche sorting algorithm, `gneiss.sort.niche_sort`, that can generate a band table given a gradient [#16](https://github.com/biocore/gneiss/pull/16) 70 | * Added utility functions for handling feature tables, metadata, and trees. [#12](https://github.com/biocore/gneiss/pull/12) 71 | * Added GPL license. 72 | 73 | ### Bug fixes 74 | -------------------------------------------------------------------------------- /gneiss/cluster/tests/test_pba.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import numpy as np 9 | import pandas as pd 10 | import unittest 11 | from gneiss.cluster._pba import (correlation_linkage, gradient_linkage, 12 | rank_linkage, random_linkage) 13 | from skbio import TreeNode 14 | 15 | 16 | class TestPBA(unittest.TestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def test_correlation_linkage_1(self): 21 | table = pd.DataFrame( 22 | [[1, 1, 0, 0, 0], 23 | [0, 1, 1, 0, 0], 24 | [0, 0, 1, 1, 0], 25 | [0, 0, 0, 1, 1]], 26 | columns=['s1', 's2', 's3', 's4', 's5'], 27 | index=['o1', 'o2', 'o3', 'o4']).T 28 | exp_str = ('((o1:0.574990173931,o2:0.574990173931)y1:0.773481312844,' 29 | '(o3:0.574990173931,o4:0.574990173931)y2:0.773481312844)' 30 | 'y0;\n') 31 | exp_tree = TreeNode.read([exp_str]) 32 | res_tree = correlation_linkage(table + 0.1) 33 | # only check for tree topology since checking for floating point 34 | # numbers on the branches is still tricky. 35 | self.assertEqual(exp_tree.ascii_art(), res_tree.ascii_art()) 36 | 37 | def test_correlation_linkage_2(self): 38 | t = pd.DataFrame([[1, 1, 2, 3, 1, 4], 39 | [2, 2, 0.1, 4, 1, .1], 40 | [3, 3.1, 2, 3, 2, 2], 41 | [4.1, 4, 0.2, 1, 1, 2.5]], 42 | index=['S1', 'S2', 'S3', 'S4'], 43 | columns=['F1', 'F2', 'F3', 'F4', 'F5', 'F6']) 44 | exp_str = ('((F4:0.228723591874,(F5:0.074748541601,' 45 | '(F1:0.00010428164962,F2:0.00010428164962)' 46 | 'y4:0.0746442599513)y3:0.153975050273)' 47 | 'y1:0.70266138894,(F3:0.266841737789,F6:0.266841737789)' 48 | 'y2:0.664543243026)y0;\n') 49 | exp_tree = TreeNode.read([exp_str]) 50 | res_tree = correlation_linkage(t) 51 | self.assertEqual(exp_tree.ascii_art(), res_tree.ascii_art()) 52 | 53 | 54 | class TestUPGMA(unittest.TestCase): 55 | def setUp(self): 56 | pass 57 | 58 | def test_gradient_linkage(self): 59 | table = pd.DataFrame( 60 | [[1, 1, 0, 0, 0], 61 | [0, 1, 1, 0, 0], 62 | [0, 0, 1, 1, 0], 63 | [0, 0, 0, 1, 1]], 64 | columns=['s1', 's2', 's3', 's4', 's5'], 65 | index=['o1', 'o2', 'o3', 'o4']).T 66 | gradient = pd.Series( 67 | [1, 2, 3, 4, 5], 68 | index=['s1', 's2', 's3', 's4', 's5']) 69 | res_tree = gradient_linkage(table, gradient) 70 | exp_str = '((o1:0.5,o2:0.5)y1:0.5,(o3:0.5,o4:0.5)y2:0.5)y0;\n' 71 | self.assertEqual(exp_str, str(res_tree)) 72 | 73 | 74 | class TestRandomLinkage(unittest.TestCase): 75 | 76 | def test_random_tree(self): 77 | np.random.seed(0) 78 | t = random_linkage(10) 79 | exp_str = ( 80 | '((7:0.0359448798595,8:0.0359448798595)y1:0.15902486847,' 81 | '((9:0.0235897432375,(4:0.00696620596189,6:0.00696620596189)' 82 | 'y5:0.0166235372756)y3:0.0747173561014,(1:0.0648004111784,' 83 | '((0:0.00196516046521,3:0.00196516046521)y7:0.0367750400883,' 84 | '(2:0.0215653684975,5:0.0215653684975)y8:0.017174832056)' 85 | 'y6:0.0260602106249)y4:0.0335066881605)y2:0.0966626489905)y0;\n') 86 | exp_tree = TreeNode.read([exp_str]) 87 | self.assertEqual(t.ascii_art(), exp_tree.ascii_art()) 88 | 89 | 90 | class TestRankLinkage(unittest.TestCase): 91 | 92 | def test_rank_linkage(self): 93 | ranks = pd.Series([1, 2, 4, 5], 94 | index=['o1', 'o2', 'o3', 'o4']) 95 | t = rank_linkage(ranks) 96 | exp = '((o1:0.5,o2:0.5)y1:1.0,(o3:0.5,o4:0.5)y2:1.0)y0;\n' 97 | self.assertEqual(str(t), exp) 98 | 99 | 100 | if __name__ == '__main__': 101 | unittest.main() 102 | -------------------------------------------------------------------------------- /gneiss/plot/tests/test_radial.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import 
pandas as pd 3 | import numpy as np 4 | from scipy.cluster.hierarchy import ward 5 | 6 | from skbio import TreeNode, DistanceMatrix 7 | from gneiss.plot._radial import radialplot 8 | from gneiss.plot._dendrogram import UnrootedDendrogram 9 | import numpy.testing as npt 10 | 11 | 12 | class TestRadial(unittest.TestCase): 13 | def setUp(self): 14 | 15 | self.coords = pd.DataFrame( 16 | [['487.5', '347.769', 'NaN', 'NaN', 'True'], 17 | ['12.5', '483.28', 'NaN', 'NaN', 'True'], 18 | ['324.897', '16.7199', 'NaN', 'NaN', 'True'], 19 | ['338.261', '271.728', '0', '2', 'False'], 20 | ['193.169', '365.952', '1', 'y3', 'False']], 21 | columns=['x', 'y', 'child0', 'child1', 'is_tip'], 22 | index=['0', '1', '2', 'y3', 'y4']) 23 | 24 | @unittest.skip('Visualizations are deprecated') 25 | def test_basic_plot(self): 26 | self.maxDiff = None 27 | exp_edges = {'dest_node': ['0', '1', '2', 'y3'], 28 | 'edge_color': ['#00FF00', '#00FF00', 29 | '#00FF00', '#FF0000'], 30 | 'edge_width': [2, 2, 2, 2], 31 | 'src_node': ['y3', 'y4', 'y3', 'y4'], 32 | 'x0': [338.2612593838583, 33 | 193.1688862557773, 34 | 338.2612593838583, 35 | 193.1688862557773], 36 | 'x1': [487.5, 12.499999999999972, 37 | 324.89684138234867, 338.2612593838583], 38 | 'y0': [271.7282256126416, 39 | 365.95231443706376, 40 | 271.7282256126416, 41 | 365.95231443706376], 42 | 'y1': [347.7691620070637, 43 | 483.2800610261029, 44 | 16.719938973897143, 45 | 271.7282256126416]} 46 | 47 | exp_nodes = {'child0': [np.nan, np.nan, np.nan, '0', '1'], 48 | 'child1': [np.nan, np.nan, np.nan, '2', 'y3'], 49 | 'color': ['#1C9099', '#1C9099', '#1C9099', 50 | '#FF999F', '#FF999F'], 51 | 'hover_var': [None, None, None, None, None], 52 | 'is_tip': [True, True, True, False, False], 53 | 'node_size': [10, 10, 10, 10, 10], 54 | 'x': [12.499999999999972, 55 | 487.5, 56 | 324.89684138234867, 57 | 338.26125938385832, 58 | 193.16888625577729], 59 | 'y': [483.28006102610289, 60 | 347.7691620070637, 61 | 16.719938973897143, 62 | 271.72822561264161, 63 | 365.95231443706376]} 64 | np.random.seed(0) 65 | num_otus = 3 # otus 66 | x = np.random.rand(num_otus) 67 | dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y)) 68 | lm = ward(dm.condensed_form()) 69 | t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str)) 70 | t = UnrootedDendrogram.from_tree(t) 71 | # incorporate colors in tree 72 | for i, n in enumerate(t.postorder(include_self=True)): 73 | if not n.is_tip(): 74 | n.name = "y%d" % i 75 | n.color = '#FF999F' 76 | n.edge_color = '#FF0000' 77 | n.node_size = 10 78 | else: 79 | n.color = '#1C9099' 80 | n.edge_color = '#00FF00' 81 | n.node_size = 10 82 | n.length = np.random.rand() * 3 83 | n.edge_width = 2 84 | p = radialplot(t, node_color='color', edge_color='edge_color', 85 | node_size='node_size', edge_width='edge_width') 86 | 87 | for e in exp_edges.keys(): 88 | if isinstance(exp_edges[e], float): 89 | npt.assert_allclose( 90 | p.renderers[0].data_source.data[e], 91 | np.array(exp_edges[e]) 92 | ) 93 | else: 94 | self.assertListEqual( 95 | list(p.renderers[0].data_source.data[e]), 96 | exp_edges[e]) 97 | 98 | for e in exp_nodes.keys(): 99 | self.assertListEqual( 100 | list(p.renderers[1].data_source.data[e]), 101 | exp_nodes[e]) 102 | 103 | self.assertTrue(isinstance(t, TreeNode)) 104 | 105 | 106 | if __name__ == "__main__": 107 | unittest.main() 108 | -------------------------------------------------------------------------------- /ci/environment.yml: -------------------------------------------------------------------------------- 1 | name: 
gneiss 2 | channels: !!python/tuple 3 | - https://conda.anaconda.org/bioconda 4 | - https://conda.anaconda.org/biocore 5 | - https://conda.anaconda.org/qiime2 6 | - defaults 7 | dependencies: 8 | - anaconda::matplotlib=1.5.1=np111py35_0 9 | - anaconda::numexpr=2.6.1=np111py35_1 10 | - anaconda::numpy=1.11.2=py35_0 11 | - anaconda::pandas=0.19.1=np111py35_0 12 | - anaconda::pytables=3.2.2=np111py35_4 13 | - anaconda::scikit-bio=0.5.1=np111py35_0 14 | - anaconda::scipy=0.18.1=np111py35_0 15 | - bioconda::click=6.6=py35_0 16 | - bokeh=0.12.4=py35_0 17 | - bz2file=0.98=py35_0 18 | - cachecontrol=0.11.6=py35_0 19 | - contextlib2=0.4.0=py35_0 20 | - cycler=0.10.0=py35_0 21 | - decorator=4.0.10=py35_0 22 | - entrypoints=0.2.2=py35_0 23 | - flake8=2.5.1=py35_0 24 | - freetype=2.5.5=1 25 | - future=0.15.2=py35_0 26 | - h5py=2.6.0=np111py35_1 27 | - hdf5=1.8.16=0 28 | - ipykernel=4.0.3=py35_0 29 | - ipython=3.2.3=py35_0 30 | - ipython_genutils=0.1.0=py35_0 31 | - ipywidgets=4.1.1=py35_0 32 | - jinja2=2.8=py35_1 33 | - jsonschema=2.5.1=py35_0 34 | - jupyter=1.0.0=py35_3 35 | - jupyter_client=4.3.0=py35_0 36 | - jupyter_console=5.0.0=py35_0 37 | - jupyter_core=4.1.0=py35_0 38 | - libpng=1.6.22=0 39 | - lockfile=0.12.2=py35_0 40 | - markupsafe=0.23=py35_2 41 | - mccabe=0.3.1=py35_0 42 | - mistune=0.7.2=py35_1 43 | - mkl=11.3.3=0 44 | - natsort=4.0.3=py35_0 45 | - nbconvert=4.2.0=py35_0 46 | - nbformat=4.0.1=py35_0 47 | - nose=1.3.7=py35_1 48 | - notebook=4.2.1=py35_0 49 | - openssl=1.0.2h=1 50 | - patsy=0.4.1=py35_0 51 | - pep8=1.7.0=py35_0 52 | - pip=8.1.2=py35_0 53 | - prompt_toolkit=1.0.9=py35_0 54 | - ptyprocess=0.5.1=py35_0 55 | - pyflakes=1.2.3=py35_0 56 | - pygments=2.1.3=py35_0 57 | - pyparsing=2.1.4=py35_0 58 | - pyqt=4.11.4=py35_3 59 | - python=3.5.2=0 60 | - python-dateutil=2.5.3=py35_0 61 | - python.app=1.2=py35_4 62 | - pytz=2016.4=py35_0 63 | - pyyaml=3.12=py35_0 64 | - pyzmq=15.2.0=py35_1 65 | - qiime2::arrow=0.8.0=py35_0 66 | - qiime2::binaryornot=0.3.0=0_ge797740 67 | - qiime2::biom-format=2.1.5=py35_3 68 | - qiime2::cookiecutter=1.4.0=py35_0 69 | - qiime2::ijson=2.3=py35_0 70 | - qiime2::ipymd=0.1.2=py35_0 71 | - qiime2::jinja2-time=0.2.0=py35_0 72 | - qiime2::poyo=0.4.0=py35_0 73 | - qiime2::python-frontmatter=0.2.1=py35_0 74 | - qiime2::tzlocal=1.3=py35_0 75 | - qiime2::whichcraft=0.4.0=py35_0 76 | - qt=4.8.7=3 77 | - qtconsole=4.0.1=py35_0 78 | - readline=6.2=2 79 | - requests=2.10.0=py35_0 80 | - setuptools=23.0.0=py35_0 81 | - sip=4.16.9=py35_0 82 | - six=1.10.0=py35_0 83 | - sqlite=3.13.0=0 84 | - statsmodels=0.8.0=np111py35_0 85 | - terminado=0.6=py35_0 86 | - tk=8.5.18=0 87 | - tornado=4.3=py35_1 88 | - traitlets=4.2.1=py35_0 89 | - wcwidth=0.1.7=py35_0 90 | - wheel=0.29.0=py35_0 91 | - xz=5.2.2=0 92 | - yaml=0.1.6=0 93 | - zlib=1.2.8=3 94 | - pip: 95 | - alabaster==0.7.9 96 | - appdirs==1.4.0 97 | - appnope==0.1.0 98 | - args==0.1.0 99 | - babel==2.3.4 100 | - backports.shutil-get-terminal-size==1.0.0 101 | - canvas==0.0.1 102 | - chest==0.2.3 103 | - clint==0.5.1 104 | - cloudpickle==0.2.1 105 | - colorama==0.3.7 106 | - coverage==4.1 107 | - coveralls==1.1 108 | - cvxopt==1.1.8 109 | - cython==0.23.5 110 | - dask==0.11.0 111 | - datashader==0.4.0 112 | - datashape==0.5.2 113 | - docopt==0.6.2 114 | - docutils==0.12 115 | - emperor==1.0.0b5 116 | - ete3==3.0.0b35 117 | - gneiss (/Users/mortonjt/Dropbox/UCSD/research/gneiss)==0.3.1 118 | - gnureadline==6.3.3 119 | - heapdict==1.0.0 120 | - igraph==0.1.11 121 | - imagesize==0.7.1 122 | - ipyparallel==5.2.0 123 | - 
ipython-genutils==0.1.0 124 | - jgraph==0.2.1 125 | - joblib==0.10.2 126 | - jupyter-client==4.3.0 127 | - jupyter-console==5.0.0 128 | - jupyter-core==4.1.0 129 | - llvmlite==0.14.0 130 | - locket==0.2.0 131 | - matplotlib-venn==0.11.4 132 | - msgpack-python==0.4.8 133 | - multipledispatch==0.4.8 134 | - nbdime==0.1.0 135 | - networkx==1.11 136 | - numba==0.29.0 137 | - odo==0.4.2 138 | - packaging==16.8 139 | - partd==0.3.6 140 | - path.py==8.1.2 141 | - pexpect==4.2.1 142 | - pickleshare==0.7.4 143 | - pillow==3.3.1 144 | - pkginfo==1.4.1 145 | - ply==3.9 146 | - prompt-toolkit==1.0.13 147 | - pulp==1.6.5 148 | - pyemd==0.3.0 149 | - pyomo==5.1.1 150 | - pysam==0.9.0 151 | - python-igraph==0.7.1.post6 152 | - python-ternary==1.0 153 | - pyutilib==5.4.1 154 | - q2-composition==2017.3.0.dev0 155 | - q2-feature-table==2017.3.0.dev0 156 | - q2-taxa==2017.3.0.dev0 157 | - q2-types==2017.3.0.dev0 158 | - q2cli (/Users/mortonjt/Dropbox/UCSD/research/q2/q2cli)==2017.3.0.dev0 159 | - q2templates==2017.3.0.dev0 160 | - qcli==0.1.1 161 | - qiime2 (/Users/mortonjt/Dropbox/UCSD/research/q2/qiime2)==2017.3.0.dev0 162 | - requests-toolbelt==0.7.0 163 | - rpy2==2.7.8 164 | - scikit-learn==0.17.1 165 | - seaborn==0.7.1 166 | - simplegeneric==0.8.1 167 | - snowballstemmer==1.2.1 168 | - sourcetracker==2.0.1.dev0 169 | - sphinx==1.4.9 170 | - tables==3.2.2 171 | - tabview==1.4.2 172 | - toolz==0.8.0 173 | - twine==1.8.1 174 | - xarray==0.8.2 175 | - xlrd==0.9.4 176 | 177 | -------------------------------------------------------------------------------- /gneiss/plot/_radial.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | from gneiss.plot._dendrogram import UnrootedDendrogram 10 | import warnings 11 | try: 12 | from bokeh.models.glyphs import Circle, Segment 13 | from bokeh.models import ColumnDataSource, DataRange1d, Plot 14 | from bokeh.models import (HoverTool, BoxZoomTool, ResetTool, 15 | WheelZoomTool, SaveTool, PanTool) 16 | except: 17 | warnings.warn("Bokeh isn't installed - " 18 | "the interactive visualizations won't work.") 19 | 20 | 21 | def radialplot(tree, node_color='node_color', node_size='node_size', 22 | node_alpha='node_alpha', edge_color='edge_color', 23 | edge_alpha='edge_alpha', edge_width='edge_width', 24 | hover_var='hover_var', figsize=(500, 500), **kwargs): 25 | """ Plots unrooted radial tree. 26 | 27 | Parameters 28 | ---------- 29 | tree : instance of skbio.TreeNode 30 | Input tree for plotting. 31 | node_color : str 32 | Name of variable in `tree` to color nodes. 33 | node_size : str 34 | Name of variable in `tree` that specifies the radius of nodes. 35 | node_alpha : str 36 | Name of variable in `tree` to specify node transparency. 37 | edge_color : str 38 | Name of variable in `tree` to color edges. 39 | edge_alpha : str 40 | Name of variable in `tree` to specify edge transparency. 41 | edge_width : str 42 | Name of variable in `tree` to specify edge width. 43 | hover_var : str 44 | Name of variable in `tree` to display in the hover menu. 45 | figsize : tuple, int 46 | Size of resulting figure. 
default: (500, 500) 47 | **kwargs: dict 48 | Plotting options to pass into bokeh.models.Plot 49 | 50 | Returns 51 | ------- 52 | bokeh.models.Plot 53 | Interactive plotting instance. 54 | 55 | 56 | Notes 57 | ----- 58 | This assumes that the tree is strictly bifurcating. 59 | 60 | See also 61 | -------- 62 | bifurcate 63 | """ 64 | warnings.warn("This visualization are deprecated.", DeprecationWarning) 65 | # This entire function was motivated by 66 | # http://chuckpr.github.io/blog/trees2.html 67 | t = UnrootedDendrogram.from_tree(tree.copy()) 68 | 69 | nodes = t.coords(figsize[0], figsize[1]) 70 | 71 | # fill in all of the node attributes 72 | def _retreive(tree, x, default): 73 | return pd.Series({n.name: getattr(n, x, default) 74 | for n in tree.levelorder()}) 75 | 76 | # default node color to light grey 77 | nodes[node_color] = _retreive(t, node_color, default='#D3D3D3') 78 | nodes[node_size] = _retreive(t, node_size, default=1) 79 | nodes[node_alpha] = _retreive(t, node_alpha, default=1) 80 | nodes[hover_var] = _retreive(t, hover_var, default=None) 81 | 82 | edges = nodes[['child0', 'child1']] 83 | edges = edges.dropna(subset=['child0', 'child1']) 84 | edges = edges.unstack() 85 | edges = pd.DataFrame({'src_node': edges.index.get_level_values(1), 86 | 'dest_node': edges.values}) 87 | edges['x0'] = [nodes.loc[n].x for n in edges.src_node] 88 | edges['x1'] = [nodes.loc[n].x for n in edges.dest_node] 89 | edges['y0'] = [nodes.loc[n].y for n in edges.src_node] 90 | edges['y1'] = [nodes.loc[n].y for n in edges.dest_node] 91 | ns = [n.name for n in t.levelorder(include_self=True)] 92 | attrs = pd.DataFrame(index=ns) 93 | 94 | # default edge color to black 95 | attrs[edge_color] = _retreive(t, edge_color, default='#000000') 96 | attrs[edge_width] = _retreive(t, edge_width, default=1) 97 | attrs[edge_alpha] = _retreive(t, edge_alpha, default=1) 98 | 99 | edges = pd.merge(edges, attrs, left_on='dest_node', 100 | right_index=True, how='outer') 101 | edges = edges.dropna(subset=['src_node']) 102 | 103 | node_glyph = Circle(x="x", y="y", 104 | radius=node_size, 105 | fill_color=node_color, 106 | fill_alpha=node_alpha) 107 | 108 | edge_glyph = Segment(x0="x0", y0="y0", 109 | x1="x1", y1="y1", 110 | line_color=edge_color, 111 | line_alpha=edge_alpha, 112 | line_width=edge_width) 113 | 114 | def df2ds(df): 115 | return ColumnDataSource(ColumnDataSource.from_df(df)) 116 | 117 | ydr = DataRange1d(range_padding=0.05) 118 | xdr = DataRange1d(range_padding=0.05) 119 | 120 | plot = Plot(x_range=xdr, y_range=ydr, **kwargs) 121 | plot.add_glyph(df2ds(edges), edge_glyph) 122 | ns = plot.add_glyph(df2ds(nodes), node_glyph) 123 | 124 | tooltip = [ 125 | ("Feature ID", "@index") 126 | ] 127 | if hover_var is not None: 128 | tooltip += [(hover_var, "@" + hover_var)] 129 | 130 | hover = HoverTool(renderers=[ns], tooltips=tooltip) 131 | plot.add_tools(hover, BoxZoomTool(), ResetTool(), 132 | WheelZoomTool(), SaveTool(), PanTool()) 133 | 134 | return plot 135 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/coefficients.csv: -------------------------------------------------------------------------------- 1 | ,Intercept,ph 2 | y0,-2.417821963898191,0.3566904413073933 3 | y1,-0.9052110511210674,0.049641573168426295 4 | y2,-0.9357918880737763,0.10783154573751495 5 | y3,-2.3133000514177455,0.3606910551421914 6 | y4,0.33748362746687366,0.025202276296260796 7 | y5,-1.7196040211996224,0.24445492171611835 8 | y6,0.7183771668620241,-0.09853010667696244 9 | 
y7,-1.2448131020654418,0.18461233905393876 10 | y8,0.0043281368780772956,0.017766196752291036 11 | y9,-8.888635895449163,1.4338978894784278 12 | y10,-0.1380162869555483,-0.027033261659450195 13 | y11,2.0081696710019457,-0.33669643064356075 14 | y12,-0.4458001655134542,0.052614929491326755 15 | y13,-2.5388789829593894,0.36269640192567715 16 | y14,-1.7009227515869187,0.2486995712072929 17 | y15,2.2342942376459476,-0.28137737848819855 18 | y16,0.9952936138338209,-0.1358283261966583 19 | y17,1.8985792038741323,-0.309574156534415 20 | y18,-0.5185397408352778,0.045702588530660976 21 | y19,0.29038527282402016,-0.031194965277184943 22 | y20,0.7927326302466351,-0.10810810411171143 23 | y21,-0.19862086569906898,0.09197269958700291 24 | y22,1.11695795500751,-0.15393879768283344 25 | y23,-1.9375935274988665,0.316771155039698 26 | y24,-1.5197127403457722,0.2466330689419788 27 | y25,-7.222697297003709,1.090728825447765 28 | y26,11.135550530286885,-1.842919462263025 29 | y27,-0.5969641090728304,0.11717486570967786 30 | y28,-0.5083320185752758,0.07391030315795062 31 | y29,-0.7113172797977424,0.12208714562398323 32 | y30,-0.4043710947066187,0.038640993551844874 33 | y31,2.731012073137618,-0.46159032232526753 34 | y32,3.965568487845451,-0.5957211835451143 35 | y33,-1.3843125783110992,0.2063797362105431 36 | y34,0.11186906658990858,0.09974027808792005 37 | y35,0.7458762279144321,-0.15991699299592574 38 | y36,0.5462379729323912,-0.12509925899594823 39 | y37,0.7715428127698168,-0.08374870435548773 40 | y38,-0.006490297265430415,0.006489381824971823 41 | y39,-1.1505876784891131,0.2419237573784568 42 | y40,1.4927158016341542,-0.22453611806483276 43 | y41,0.2362826246016099,-0.02382474060748356 44 | y42,-0.8199793918824666,0.14617589105266396 45 | y43,-1.2479231999016884,0.25900931458640947 46 | y44,1.6489193595737197,-0.2656311740355035 47 | y45,0.37513055994129435,-0.06575775407372958 48 | y46,-0.23235985768487274,0.04345932796883874 49 | y47,0.232413460376192,-0.04363490376686736 50 | y48,-0.9701856408728218,0.17462167769163456 51 | y49,0.08141413719517979,0.004396636449273949 52 | y50,1.8409060639108943,-0.29310915670058585 53 | y51,0.11107094311846646,-0.023971435949612815 54 | y52,-0.3288508216536449,0.04478598174234949 55 | y53,-1.950201188792175,0.27019388715883313 56 | y54,-0.21754527398648923,0.02567564793019846 57 | y55,0.5070344389950753,-0.08435530699541947 58 | y56,0.5469829898615253,-0.015692985885081844 59 | y57,-1.0651920370632453,0.10357010418282334 60 | y58,-0.3618051141258941,0.04083277567819005 61 | y59,0.7359516747239916,-0.1191674300733181 62 | y60,-0.27027205230119355,0.027219092575649903 63 | y61,0.11332657234338792,-0.037628480171609864 64 | y62,-0.054802386332120014,0.015438312126530043 65 | y63,0.17834351919623104,-0.0353004982881047 66 | y64,0.5021027013118613,-0.08149261323028857 67 | y65,-0.7317177817250978,0.11092985149299428 68 | y66,0.47637412333038126,-0.0035924721497009432 69 | y67,0.020606271402806065,0.009997371330490225 70 | y68,1.9221471293322256,-0.26752500098575666 71 | y69,1.17970241212005,-0.1641827048201078 72 | y70,-0.8184772546295412,0.12022121428082484 73 | y71,-0.12330798367923247,0.028963428511499006 74 | y72,-0.7043520903538832,0.12111737639867173 75 | y73,0.16468385036158828,-0.01491809942028498 76 | y74,-0.05458864419854579,0.008102836974669044 77 | y75,-1.3794040734816935,0.18755889639070997 78 | y76,1.2988084300034206,-0.17970824507379088 79 | y77,0.23804873279931538,-0.04967872251402735 80 | y78,1.3443871234443194,-0.1511514994081249 81 | 
y79,1.791956991985029,-0.31027551804655645 82 | y80,-0.2260027577677782,0.0352420930295071 83 | y81,-0.7315837744871326,0.09397099811752183 84 | y82,0.22342083868015558,-0.036252296752822454 85 | y83,0.9890597158215642,-0.12754719940053685 86 | y84,0.020923823752194582,0.006047398987785311 87 | y85,1.2659322933931254,-0.1790956481945273 88 | y86,0.5490840386384254,-0.09191494829380803 89 | y87,-0.854051259233914,0.11497150927490624 90 | y88,1.5442076690205047,-0.22477604150408292 91 | y89,0.05180071552460162,-0.04441923140965983 92 | y90,2.1702012325252618,-0.32509549154637923 93 | y91,-1.1438217763831038,0.18457225499582508 94 | y92,0.20182664476633647,-0.03617016003385909 95 | y93,0.32283178714269156,-0.07556082692785951 96 | y94,0.6624037908148538,-0.11691640268465277 97 | y95,0.12781936469114472,-0.05442398123766153 98 | y96,0.3225935214956588,-0.049489243690185744 99 | y97,0.05504548243442081,-0.0019830812889765153 100 | y98,-0.31634347553783554,0.03592855377461579 101 | y99,-0.20450768537043104,0.01647646243984905 102 | y100,0.5706627524394878,-0.07096218388437267 103 | y101,0.4481857221755603,-0.05229362634021477 104 | y102,1.4610714039897088,-0.1974608317058108 105 | y103,-0.5831225769898846,0.12847535822056447 106 | y104,-0.3778252764706474,0.05540222040774105 107 | y105,-0.10391591807003929,0.05028748957940854 108 | y106,-0.6458814418806339,0.0919046542929978 109 | y107,1.726022770127027,-0.23392347023971047 110 | y108,0.40707386320519157,-0.05445419955171094 111 | y109,1.7269767479591678,-0.22161951275880676 112 | y110,-0.3485150472085344,0.04398702248446438 113 | y111,-0.6674467573758245,0.22982383932854994 114 | y112,-1.5226066949789847,0.21540215886086916 115 | y113,-0.4682652952856936,0.06948694013844546 116 | y114,-0.5963751787826362,0.07423163157790996 117 | y115,0.18941501262895516,-0.003290492356183168 118 | y116,0.3319245569899489,-0.07497860531797448 119 | y117,-0.44905384789598235,0.059247252407698896 120 | -------------------------------------------------------------------------------- /gneiss/regression/tests/data/pvalues.csv: -------------------------------------------------------------------------------- 1 | ,Intercept,ph 2 | y0,2.0231382176562713e-07,1.5986797744717098e-06 3 | y1,0.06011885055018011,0.5200315964601321 4 | y2,0.016362183704805352,0.08459797560939894 5 | y3,2.711345677586785e-05,4.908287303929518e-05 6 | y4,0.37635229111205415,0.6823421422061464 7 | y5,0.0005396934480471485,0.0021766278626422775 8 | y6,0.07985080247891974,0.13639964193115275 9 | y7,5.369709631311585e-05,0.0001910744159765913 10 | y8,0.988225064070116,0.7080350094829028 11 | y9,4.409135649041548e-30,5.177944970965206e-30 12 | y10,0.6303272886698309,0.5600337884848634 13 | y11,3.128836166342658e-07,1.30963118803634e-07 14 | y12,0.1066787667914187,0.23735473121815692 15 | y13,1.295050642130945e-08,3.0887411253115263e-07 16 | y14,2.2367190437659e-05,0.00010972539901087592 17 | y15,1.1370780526586244e-11,3.000983940825702e-08 18 | y16,0.006185201230885478,0.02009040359692411 19 | y17,7.695994405895602e-05,6.775560403179732e-05 20 | y18,0.12150931064477158,0.39638796434776435 21 | y19,0.47269850766776533,0.6330303174570436 22 | y20,0.010308804348404794,0.02962236033751162 23 | y21,0.6217481615522058,0.15975695806591558 24 | y22,0.0025029093920176213,0.009470244972664433 25 | y23,5.771271567361451e-09,4.189942632303552e-09 26 | y24,1.6429321667207632e-05,1.54081456332786e-05 27 | y25,7.958076802929733e-18,2.2899968563086935e-16 28 | y26,1.9455705722308582e-18,5.777506242608944e-19 29 | 
y27,0.012384954189432178,0.002601398943639638 30 | y28,0.07759917873075849,0.11193438405921681 31 | y29,0.004749709484931084,0.0028015471036144045 32 | y30,0.18729661964979205,0.43434670921461993 33 | y31,8.055677022011245e-11,1.696061444790691e-11 34 | y32,3.302116870758148e-13,5.795117625440092e-12 35 | y33,1.3939285026652989e-05,5.424895386889316e-05 36 | y34,0.8709721145588748,0.3715329255621058 37 | y35,0.00021920050748216926,1.8379424591619849e-06 38 | y36,0.16360444034078986,0.04973415032091874 39 | y37,0.01355347015225425,0.09429934604574804 40 | y38,0.9731249027798142,0.8350011803300309 41 | y39,0.00031226279875793295,4.677092902076776e-06 42 | y40,0.0015954833779699596,0.003203703162063911 43 | y41,0.18614915413516903,0.40828378488135897 44 | y42,0.004251445104840961,0.001709061310481806 45 | y43,1.703359098557291e-05,8.552286564044924e-08 46 | y44,1.2426600622226377e-05,1.328985290260878e-05 47 | y45,0.17551299912142332,0.14230620069990005 48 | y46,0.35489045213483716,0.28496905252631144 49 | y47,0.16602735174894406,0.10845519956226542 50 | y48,0.006427527916242203,0.002539622003724941 51 | y49,0.75368004235738,0.9165150791122991 52 | y50,0.005771991467566583,0.006530235615020178 53 | y51,0.6017802273926969,0.48637481410913097 54 | y52,0.26759490722527424,0.3499562720950433 55 | y53,8.924597332512463e-06,0.00011325234507231189 56 | y54,0.31949290114412354,0.466871596076887 57 | y55,0.09013458423408123,0.08136538477870153 58 | y56,0.0816663196354404,0.7553993031914661 59 | y57,0.004433194199059207,0.08250281174037756 60 | y58,0.2290113551466194,0.40014959567649544 61 | y59,0.018282435042915796,0.01812677919937224 62 | y60,0.32371878164778645,0.5381075053380269 63 | y61,0.6958683825610459,0.42280243487177394 64 | y62,0.8206811909928219,0.6930044395685215 65 | y63,0.3361523253936626,0.2396769106368928 66 | y64,0.07411406118822376,0.0730594671582051 67 | y65,0.0008487512443398987,0.0016913290798711417 68 | y66,0.08156930966743642,0.9346993696415715 69 | y67,0.9536637593962555,0.8616063078659033 70 | y68,5.764697472006051e-07,1.1980943688674127e-05 71 | y69,3.469880515887769e-08,1.251969770259889e-06 72 | y70,0.0050466333460564726,0.01056032847691737 73 | y71,0.5824003403194246,0.42486627824141254 74 | y72,0.020027189607415154,0.013568182783744916 75 | y73,0.5313886104027061,0.725662026927302 76 | y74,0.8883364466894954,0.8974424613269687 77 | y75,0.0004461288566022144,0.0028619022760101014 78 | y76,3.957183581647853e-06,6.0947195212273386e-05 79 | y77,0.4332278891006891,0.3123717778815628 80 | y78,0.0016576279361801888,0.026496104629617434 81 | y79,0.0007138976393654403,0.00030933688733006414 82 | y80,0.28891981081632123,0.30633969607425043 83 | y81,0.04627436056711397,0.11190946808988014 84 | y82,0.4988366152669681,0.4973509989614011 85 | y83,5.783715711833836e-08,8.475043164083101e-06 86 | y84,0.9336421911918483,0.8816971497803062 87 | y85,0.0004047193067072164,0.0018101551247001204 88 | y86,0.13752630011858602,0.12436393090433022 89 | y87,0.0007949699924884687,0.004807041210853434 90 | y88,1.681321342801181e-06,1.2957462590935792e-05 91 | y89,0.8441422899161682,0.2985832666575177 92 | y90,4.5606670103847854e-08,3.0324380533746814e-07 93 | y91,0.000679688358488179,0.0006962171212845777 94 | y92,0.5530261279307382,0.5109620405114144 95 | y93,0.07684438987080362,0.011188109318831293 96 | y94,0.012222421854624525,0.0064113611717861 97 | y95,0.6863133156452363,0.2888980046697222 98 | y96,0.3378152841800086,0.3630338526064949 99 | y97,0.8089947540821971,0.957043446828022 100 | 
y98,0.1803712958932434,0.3454962628351098 101 | y99,0.4710487985349683,0.7192012832165596 102 | y100,0.2037978320656817,0.3275210105710409 103 | y101,0.07838268739124357,0.2021477449146662 104 | y102,6.275550928468192e-10,1.0081774647797712e-07 105 | y103,0.03602584131371042,0.004694931406947464 106 | y104,0.2205716136177221,0.26625549453712816 107 | y105,0.648991144530094,0.17514504906365966 108 | y106,0.03191994202863099,0.05829444014664721 109 | y107,9.159580575936758e-07,2.7343819297797546e-05 110 | y108,0.20615265960503892,0.29493372156085573 111 | y109,3.9957320068973616e-08,7.122719359440899e-06 112 | y110,0.1601883874809344,0.2719299929048468 113 | y111,0.057124882232972594,9.02276720234949e-05 114 | y112,8.83353387262592e-05,0.0005319623291459005 115 | y113,0.07855307166119041,0.10596204097226382 116 | y114,0.09814960073925237,0.2015466315199174 117 | y115,0.46208377440008475,0.9369296099174003 118 | y116,0.4032484957728807,0.24383860569835114 119 | y117,0.05493605762180559,0.11604625913417174 120 | -------------------------------------------------------------------------------- /gneiss/regression/_model.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import abc 9 | import pandas as pd 10 | from skbio.stats.composition import ilr_inv 11 | from gneiss._model import Model 12 | from gneiss.balances import balance_basis 13 | 14 | 15 | class RegressionModel(Model): 16 | def __init__(self, *args, **kwargs): 17 | """ 18 | Summary object for storing regression results. 19 | 20 | A `RegressionResults` object stores information about the 21 | individual balances used in the regression, the coefficients, 22 | residuals. This object can be used to perform predictions. 23 | In addition, summary statistics such as the coefficient 24 | of determination for the overall fit can be calculated. 25 | 26 | Parameters 27 | ---------- 28 | submodels : list of statsmodels objects 29 | List of statsmodels result objects. 30 | balances : pd.DataFrame 31 | A table of balances where samples are rows and 32 | balances are columns. These balances were calculated 33 | using `tree`. 34 | """ 35 | self._beta = None 36 | self._resid = None 37 | self._fitted = False 38 | super().__init__(*args, **kwargs) 39 | # there is only one design matrix for regression 40 | self.design_matrix = self.design_matrices 41 | 42 | def coefficients(self, tree=None): 43 | """ Returns coefficients from fit. 44 | 45 | Parameters 46 | ---------- 47 | tree : skbio.TreeNode, optional 48 | The tree used to perform the ilr transformation. If this 49 | is specified, then the prediction will be represented as 50 | proportions. Otherwise, if this is not specified, the prediction 51 | will be represented as balances. (default: None). 52 | 53 | Returns 54 | ------- 55 | pd.DataFrame 56 | A table of coefficients where rows are covariates, 57 | and the columns are balances. If `tree` is specified, then 58 | the columns are proportions. 59 | """ 60 | if not self._fitted: 61 | ValueError(('Model not fitted - coefficients not calculated.' 
62 | 'See `fit()`')) 63 | coef = self._beta 64 | if tree is not None: 65 | basis, _ = balance_basis(tree) 66 | c = ilr_inv(coef.values, basis=basis) 67 | ids = [n.name for n in tree.tips()] 68 | return pd.DataFrame(c, columns=ids, index=coef.index) 69 | else: 70 | return coef 71 | 72 | def residuals(self, tree=None): 73 | """ Returns calculated residuals from fit. 74 | 75 | Parameters 76 | ---------- 77 | X : pd.DataFrame, optional 78 | Input table of covariates. If not specified, then the 79 | fitted values calculated from training the model will be 80 | returned. 81 | tree : skbio.TreeNode, optional 82 | The tree used to perform the ilr transformation. If this 83 | is specified, then the prediction will be represented 84 | as proportions. Otherwise, if this is not specified, 85 | the prediction will be represented as balances. (default: None). 86 | 87 | Returns 88 | ------- 89 | pd.DataFrame 90 | A table of residuals where rows are covariates, 91 | and the columns are balances. If `tree` is specified, then 92 | the columns are proportions. 93 | 94 | References 95 | ---------- 96 | .. [1] Aitchison, J. "A concise guide to compositional data analysis, 97 | CDA work." Girona 24 (2003): 73-81. 98 | """ 99 | if not self._fitted: 100 | ValueError(('Model not fitted - coefficients not calculated.' 101 | 'See `fit()`')) 102 | resid = self._resid 103 | if tree is not None: 104 | basis, _ = balance_basis(tree) 105 | proj_resid = ilr_inv(resid.values, basis=basis) 106 | ids = [n.name for n in tree.tips()] 107 | return pd.DataFrame(proj_resid, 108 | columns=ids, 109 | index=resid.index) 110 | else: 111 | return resid 112 | 113 | @abc.abstractmethod 114 | def predict(self, X=None, tree=None, **kwargs): 115 | """ Performs a prediction based on model. 116 | 117 | Parameters 118 | ---------- 119 | X : pd.DataFrame, optional 120 | Input table of covariates, where columns are covariates, and 121 | rows are samples. If not specified, then the fitted values 122 | calculated from training the model will be returned. 123 | tree : skbio.TreeNode, optional 124 | The tree used to perform the ilr transformation. If this 125 | is specified, then the prediction will be represented 126 | as proportions. Otherwise, if this is not specified, 127 | the prediction will be represented as balances. (default: None). 128 | **kwargs : dict 129 | Other arguments to be passed into the model prediction. 130 | 131 | Returns 132 | ------- 133 | pd.DataFrame 134 | A table of predicted values where rows are covariates, 135 | and the columns are balances. If `tree` is specified, then 136 | the columns are proportions. 137 | 138 | """ 139 | if not self._fitted: 140 | ValueError(('Model not fitted - coefficients not calculated.' 141 | 'See `fit()`')) 142 | if X is None: 143 | X = self.design_matrices 144 | 145 | prediction = X.dot(self._beta) 146 | if tree is not None: 147 | basis, _ = balance_basis(tree) 148 | proj_prediction = ilr_inv(prediction.values, basis=basis) 149 | ids = [n.name for n in tree.tips()] 150 | return pd.DataFrame(proj_prediction, 151 | columns=ids, 152 | index=prediction.index) 153 | else: 154 | return prediction 155 | -------------------------------------------------------------------------------- /gneiss/regression/tests/test_mixedlm.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 
3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import os 9 | import shutil 10 | import numpy as np 11 | import pandas as pd 12 | import pandas.util.testing as pdt 13 | import unittest 14 | from skbio import TreeNode 15 | from gneiss.regression import mixedlm 16 | 17 | 18 | class TestMixedLM(unittest.TestCase): 19 | 20 | def setUp(self): 21 | np.random.seed(6241) 22 | n = 1600 23 | exog = np.random.normal(size=(n, 2)) 24 | groups = np.kron(np.arange(n // 16), np.ones(16)) 25 | 26 | # Build up the random error vector 27 | errors = 0 28 | 29 | # The random effects 30 | exog_re = np.random.normal(size=(n, 2)) 31 | slopes = np.random.normal(size=(n // 16, 2)) 32 | slopes = np.kron(slopes, np.ones((16, 1))) * exog_re 33 | errors += slopes.sum(1) 34 | 35 | # First variance component 36 | errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) 37 | 38 | # Second variance component 39 | errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) 40 | 41 | # iid errors 42 | errors += np.random.normal(size=n) 43 | 44 | endog = exog.sum(1) + errors 45 | 46 | df = pd.DataFrame(index=range(n)) 47 | df["y1"] = endog 48 | df["y2"] = endog + 2 * 2 49 | df["groups"] = groups 50 | df["x1"] = exog[:, 0] 51 | df["x2"] = exog[:, 1] 52 | 53 | self.tree = TreeNode.read(['(c, (b,a)y2)y1;']) 54 | self.table = df[["y1", "y2"]] 55 | self.metadata = df[['x1', 'x2', 'groups']] 56 | 57 | # for testing the plugins 58 | self.results = "results" 59 | if not os.path.exists(self.results): 60 | os.mkdir(self.results) 61 | 62 | def tearDown(self): 63 | shutil.rmtree(self.results) 64 | 65 | 66 | class TestMixedLMFunctions(TestMixedLM): 67 | 68 | def test_mixedlm_balances(self): 69 | 70 | res = mixedlm("x1 + x2", self.table, self.metadata, 71 | groups="groups") 72 | res.fit() 73 | exp_pvalues = pd.DataFrame( 74 | [[0.0994110906314, 4.4193804e-05, 3.972325e-35, 3.568599e-30], 75 | [4.82688604e-236, 4.4193804e-05, 3.972325e-35, 3.568599e-30]], 76 | index=['y1', 'y2'], 77 | columns=['Intercept', 'Group Var', 'x1', 'x2']).T 78 | 79 | res_pvals = res.pvalues.sort_index(axis=0).sort_index(axis=1) 80 | exp_pvals = exp_pvalues.sort_index(axis=0).sort_index(axis=1) 81 | 82 | pdt.assert_frame_equal(res_pvals, exp_pvals, 83 | check_less_precise=True) 84 | 85 | exp_coefficients = pd.DataFrame( 86 | [[0.211451, 0.0935786, 1.022008, 0.924873], 87 | [4.211451, 0.0935786, 1.022008, 0.924873]], 88 | columns=['Intercept', 'Group Var', 'x1', 'x2'], 89 | index=['y1', 'y2']).sort_index().T 90 | res_coef = res.coefficients().sort_index(axis=0).sort_index(axis=1) 91 | exp_coef = exp_coefficients.sort_index(axis=0).sort_index(axis=1) 92 | 93 | pdt.assert_frame_equal(res_coef, exp_coef, 94 | check_less_precise=True) 95 | 96 | def test_mixedlm_balances_vcf(self): 97 | np.random.seed(6241) 98 | n = 1600 99 | exog = np.random.normal(size=(n, 2)) 100 | groups = np.kron(np.arange(n // 16), np.ones(16)) 101 | 102 | # Build up the random error vector 103 | errors = 0 104 | 105 | # The random effects 106 | exog_re = np.random.normal(size=(n, 2)) 107 | slopes = np.random.normal(size=(n // 16, 2)) 108 | slopes = np.kron(slopes, np.ones((16, 1))) * exog_re 109 | errors += slopes.sum(1) 110 | 111 | # First variance component 112 | subgroups1 = np.kron(np.arange(n // 4), np.ones(4)) 113 | errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) 114 | 
115 | # Second variance component 116 | subgroups2 = np.kron(np.arange(n // 2), np.ones(2)) 117 | errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) 118 | 119 | # iid errors 120 | errors += np.random.normal(size=n) 121 | 122 | endog = exog.sum(1) + errors 123 | 124 | df = pd.DataFrame(index=range(n)) 125 | df["y1"] = endog 126 | df["y2"] = endog + 2 * 2 127 | df["groups"] = groups 128 | df["x1"] = exog[:, 0] 129 | df["x2"] = exog[:, 1] 130 | df["z1"] = exog_re[:, 0] 131 | df["z2"] = exog_re[:, 1] 132 | df["v1"] = subgroups1 133 | df["v2"] = subgroups2 134 | 135 | table = df[["y1", "y2"]] 136 | metadata = df[['x1', 'x2', 'z1', 'z2', 'v1', 'v2', 'groups']] 137 | 138 | res = mixedlm("x1 + x2", table, metadata, groups="groups", 139 | re_formula="0+z1+z2") 140 | res.fit() 141 | 142 | exp_pvalues = pd.DataFrame([ 143 | [0.038015, 3.858750e-39, 2.245068e-33, 144 | 2.552217e-05, 0.923418, 6.645741e-34], 145 | [0.000000, 3.858750e-39, 2.245068e-33, 146 | 2.552217e-05, 0.923418, 6.645741e-34]], 147 | columns=['Intercept', 'x1', 'x2', 'z1 Var', 148 | 'z1 x z2 Cov', 'z2 Var'], 149 | index=['y1', 'y2']).T 150 | 151 | exp_coefficients = pd.DataFrame( 152 | [[0.163141, 1.030013, 0.935514, 0.115082, -0.001962, 0.14792], 153 | [4.163141, 1.030013, 0.935514, 0.115082, -0.001962, 0.14792]], 154 | columns=['Intercept', 'x1', 'x2', 'z1 Var', 155 | 'z1 x z2 Cov', 'z2 Var'], 156 | index=['y1', 'y2']).T 157 | 158 | pdt.assert_frame_equal(res.pvalues.sort_index(axis=0), 159 | exp_pvalues.sort_index(axis=0), 160 | check_less_precise=True) 161 | 162 | pdt.assert_frame_equal(res.coefficients().sort_index(axis=0), 163 | exp_coefficients.sort_index(axis=0), 164 | check_less_precise=True) 165 | 166 | def test_percent_explained(self): 167 | model = mixedlm("x1 + x2", self.table, self.metadata, 168 | groups="groups") 169 | 170 | model.fit() 171 | res = model.percent_explained() 172 | exp = pd.Series([0.5, 0.5], index=['y1', 'y2']) 173 | pdt.assert_series_equal(res, exp, check_less_precise=True) 174 | 175 | 176 | if __name__ == '__main__': 177 | unittest.main() 178 | -------------------------------------------------------------------------------- /gneiss/tests/test_balances.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | from __future__ import absolute_import, division, print_function 10 | import unittest 11 | import numpy as np 12 | import numpy.testing as npt 13 | from gneiss.balances import (balance_basis, _count_matrix, 14 | _balance_basis, sparse_balance_basis) 15 | from skbio import TreeNode 16 | from skbio.util import get_data_path 17 | from scipy.sparse import coo_matrix 18 | 19 | 20 | def assert_coo_allclose(res, exp, rtol=1e-7, atol=1e-7): 21 | res_data = np.vstack((res.row, res.col, res.data)).T 22 | exp_data = np.vstack((exp.row, exp.col, exp.data)).T 23 | 24 | # sort by row and col 25 | res_data = res_data[res_data[:, 1].argsort()] 26 | res_data = res_data[res_data[:, 0].argsort()] 27 | exp_data = exp_data[exp_data[:, 1].argsort()] 28 | exp_data = exp_data[exp_data[:, 0].argsort()] 29 | npt.assert_allclose(res_data, exp_data, rtol=rtol, atol=atol) 30 | 31 | 32 | class TestSparseBalances(unittest.TestCase): 33 | 34 | def test_sparse_balance_basis_base_case(self): 35 | tree = u"(a,b);" 36 | t = TreeNode.read([tree]) 37 | 38 | exp_basis = coo_matrix( 39 | np.array([[-np.sqrt(1. / 2), 40 | np.sqrt(1. / 2)]])) 41 | exp_keys = [t.name] 42 | res_basis, res_keys = sparse_balance_basis(t) 43 | 44 | assert_coo_allclose(exp_basis, res_basis) 45 | self.assertListEqual(exp_keys, res_keys) 46 | 47 | def test_sparse_balance_basis_invalid(self): 48 | with self.assertRaises(ValueError): 49 | tree = u"(a,b,c);" 50 | t = TreeNode.read([tree]) 51 | sparse_balance_basis(t) 52 | 53 | def test_sparse_balance_basis_unbalanced(self): 54 | tree = u"((a,b)c, d);" 55 | t = TreeNode.read([tree]) 56 | exp_basis = coo_matrix(np.array( 57 | [[-np.sqrt(1. / 6), -np.sqrt(1. / 6), np.sqrt(2. / 3)], 58 | [-np.sqrt(1. / 2), np.sqrt(1. / 2), 0]] 59 | )) 60 | exp_keys = [t.name, t[0].name] 61 | res_basis, res_keys = sparse_balance_basis(t) 62 | 63 | assert_coo_allclose(exp_basis, res_basis) 64 | self.assertListEqual(exp_keys, res_keys) 65 | 66 | def test_sparse_balance_basis_unbalanced2(self): 67 | tree = u"(d, (a,b)c);" 68 | 69 | t = TreeNode.read([tree]) 70 | 71 | exp_basis = coo_matrix(np.array( 72 | [ 73 | [-np.sqrt(2. / 3), np.sqrt(1. / 6), np.sqrt(1. / 6)], 74 | [0, -np.sqrt(1. / 2), np.sqrt(1. 
/ 2)] 75 | ] 76 | )) 77 | 78 | exp_keys = [t.name, t[1].name] 79 | res_basis, res_keys = sparse_balance_basis(t) 80 | assert_coo_allclose(exp_basis, res_basis, atol=1e-7, rtol=1e-7) 81 | self.assertListEqual(exp_keys, res_keys) 82 | 83 | 84 | class TestBalances(unittest.TestCase): 85 | 86 | def test_count_matrix_base_case(self): 87 | tree = u"(a,b);" 88 | t = TreeNode.read([tree]) 89 | res, _ = _count_matrix(t) 90 | exp = {'k': 0, 'l': 1, 'r': 1, 't': 0, 'tips': 2} 91 | self.assertEqual(res[t], exp) 92 | 93 | exp = {'k': 0, 'l': 0, 'r': 0, 't': 0, 'tips': 1} 94 | self.assertEqual(res[t[0]], exp) 95 | 96 | exp = {'k': 0, 'l': 0, 'r': 0, 't': 0, 'tips': 1} 97 | self.assertEqual(res[t[1]], exp) 98 | 99 | def test_count_matrix_unbalanced(self): 100 | tree = u"((a,b)c, d);" 101 | t = TreeNode.read([tree]) 102 | res, _ = _count_matrix(t) 103 | 104 | exp = {'k': 0, 'l': 2, 'r': 1, 't': 0, 'tips': 3} 105 | self.assertEqual(res[t], exp) 106 | exp = {'k': 1, 'l': 1, 'r': 1, 't': 0, 'tips': 2} 107 | self.assertEqual(res[t[0]], exp) 108 | 109 | exp = {'k': 0, 'l': 0, 'r': 0, 't': 0, 'tips': 1} 110 | self.assertEqual(res[t[1]], exp) 111 | self.assertEqual(res[t[0][0]], exp) 112 | self.assertEqual(res[t[0][1]], exp) 113 | 114 | def test_count_matrix_singleton_error(self): 115 | with self.assertRaises(ValueError): 116 | tree = u"(((a,b)c, d)root);" 117 | t = TreeNode.read([tree]) 118 | _count_matrix(t) 119 | 120 | def test_count_matrix_trifurcating_error(self): 121 | with self.assertRaises(ValueError): 122 | tree = u"((a,b,e)c, d);" 123 | t = TreeNode.read([tree]) 124 | _count_matrix(t) 125 | 126 | def test__balance_basis_base_case(self): 127 | tree = u"(a,b);" 128 | t = TreeNode.read([tree]) 129 | 130 | exp_basis = np.array([[-np.sqrt(1. / 2), np.sqrt(1. / 2)]]) 131 | exp_keys = [t.name] 132 | res_basis, res_keys = _balance_basis(t) 133 | 134 | npt.assert_allclose(exp_basis, res_basis) 135 | self.assertListEqual(exp_keys, res_keys) 136 | 137 | def test__balance_basis_unbalanced(self): 138 | tree = u"((a,b)c, d);" 139 | t = TreeNode.read([tree]) 140 | 141 | exp_basis = np.array( 142 | [[-np.sqrt(1. / 6), -np.sqrt(1. / 6), np.sqrt(2. / 3)], 143 | [-np.sqrt(1. / 2), np.sqrt(1. 
/ 2), 0]] 144 | ) 145 | exp_keys = [t.name, t[0].name] 146 | res_basis, res_keys = _balance_basis(t) 147 | 148 | npt.assert_allclose(exp_basis, res_basis) 149 | self.assertListEqual(exp_keys, res_keys) 150 | 151 | def test_balance_basis_base_case(self): 152 | tree = u"(a,b);" 153 | t = TreeNode.read([tree]) 154 | exp_keys = [t.name] 155 | exp_basis = np.array([0.19557032, 0.80442968]) 156 | res_basis, res_keys = balance_basis(t) 157 | 158 | npt.assert_allclose(exp_basis, res_basis) 159 | self.assertListEqual(exp_keys, res_keys) 160 | 161 | def test_balance_basis_unbalanced(self): 162 | tree = u"((a,b)c, d);" 163 | t = TreeNode.read([tree]) 164 | exp_keys = [t.name, t[0].name] 165 | exp_basis = np.array([[0.18507216, 0.18507216, 0.62985567], 166 | [0.14002925, 0.57597535, 0.28399541]]) 167 | 168 | res_basis, res_keys = balance_basis(t) 169 | 170 | npt.assert_allclose(exp_basis, res_basis) 171 | self.assertListEqual(exp_keys, list(res_keys)) 172 | 173 | def test_balance_basis_large1(self): 174 | fname = get_data_path('large_tree.nwk', 175 | subfolder='data') 176 | t = TreeNode.read(fname) 177 | # note that the basis is in reverse level order 178 | exp_basis = np.loadtxt( 179 | get_data_path('large_tree_basis.txt', 180 | subfolder='data')) 181 | res_basis, res_keys = balance_basis(t) 182 | npt.assert_allclose(exp_basis[:, ::-1], res_basis) 183 | 184 | 185 | if __name__ == "__main__": 186 | unittest.main() 187 | -------------------------------------------------------------------------------- /gneiss/plot/tests/test_regression_plot.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | import os 10 | import shutil 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import numpy.testing as npt 15 | 16 | from skbio import TreeNode 17 | from skbio.util import get_data_path 18 | 19 | from gneiss.plot._regression_plot import ols_summary, lme_summary 20 | from gneiss.regression import ols, mixedlm 21 | 22 | 23 | class TestOLS_Summary(unittest.TestCase): 24 | 25 | def setUp(self): 26 | A = np.array # aliasing for the sake of pep8 27 | self.table = pd.DataFrame({ 28 | 's1': A([1., 1.]), 29 | 's2': A([1., 2.]), 30 | 's3': A([1., 3.]), 31 | 's4': A([1., 4.]), 32 | 's5': A([1., 5.])}, 33 | index=['Y2', 'Y1']).T 34 | self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) 35 | self.metadata = pd.DataFrame({ 36 | 'lame': [1, 1, 1, 1, 1], 37 | 'real': [1, 2, 3, 4, 5] 38 | }, index=['s1', 's2', 's3', 's4', 's5']) 39 | 40 | np.random.seed(0) 41 | n = 15 42 | a = np.array([1, 4.2, 5.3, -2.2, 8]) 43 | x1 = np.linspace(.01, 0.1, n) 44 | x2 = np.logspace(0, 0.01, n) 45 | x3 = np.exp(np.linspace(0, 0.01, n)) 46 | x4 = x1 ** 2 47 | self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4}) 48 | n__ = np.random.normal(size=n) 49 | y = a[0] + a[1] * x1 + a[2] * x2 + a[3] * x3 + a[4] * x4 + n__ 50 | sy = np.vstack((-y / 10, -y)).T 51 | self.y = pd.DataFrame(sy, columns=['y0', 'y1']) 52 | self.t2 = TreeNode.read([r"((a,b)y1,c)y0;"]) 53 | 54 | self.results = "results" 55 | os.mkdir(self.results) 56 | 57 | def tearDown(self): 58 | shutil.rmtree(self.results) 59 | 60 | @unittest.skip('Visualizations are deprecated') 61 | def test_visualization(self): 62 | res = ols(formula="x1 + x2 + x3 + x4", 63 | table=self.y, metadata=self.x) 64 | res.fit() 65 | 66 | ols_summary(self.results, res, tree=self.t2) 67 | fp = os.path.join(self.results, 'pvalues.csv') 68 | self.assertTrue(os.path.exists(fp)) 69 | fp = os.path.join(self.results, 'coefficients.csv') 70 | self.assertTrue(os.path.exists(fp)) 71 | fp = os.path.join(self.results, 'predicted.csv') 72 | self.assertTrue(os.path.exists(fp)) 73 | fp = os.path.join(self.results, 'residuals.csv') 74 | self.assertTrue(os.path.exists(fp)) 75 | 76 | index_fp = os.path.join(self.results, 'index.html') 77 | self.assertTrue(os.path.exists(index_fp)) 78 | 79 | with open(index_fp, 'r') as fh: 80 | html = fh.read() 81 | self.assertIn('
<h1>Simplicial Linear Regression Summary</h1>
', 82 | html) 83 | self.assertIn('Coefficients\n', html) 84 | self.assertIn('Predicted Balances\n', html) 85 | self.assertIn('Residuals\n', html) 86 | 87 | 88 | class TestLME_Summary(unittest.TestCase): 89 | 90 | def setUp(self): 91 | np.random.seed(6241) 92 | n = 1600 93 | exog = np.random.normal(size=(n, 2)) 94 | groups = np.kron(np.arange(n // 16), np.ones(16)) 95 | 96 | # Build up the random error vector 97 | errors = 0 98 | 99 | # The random effects 100 | exog_re = np.random.normal(size=(n, 2)) 101 | slopes = np.random.normal(size=(n // 16, 2)) 102 | slopes = np.kron(slopes, np.ones((16, 1))) * exog_re 103 | errors += slopes.sum(1) 104 | 105 | # First variance component 106 | errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) 107 | 108 | # Second variance component 109 | errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) 110 | 111 | # iid errors 112 | errors += np.random.normal(size=n) 113 | 114 | endog = exog.sum(1) + errors 115 | 116 | df = pd.DataFrame(index=range(n)) 117 | df["Y1"] = endog + 2 * 2 118 | df["Y2"] = endog 119 | df["groups"] = groups 120 | df["x1"] = exog[:, 0] 121 | df["x2"] = exog[:, 1] 122 | 123 | self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) 124 | self.table = df[["Y1", "Y2"]] 125 | self.metadata = df[['x1', 'x2', 'groups']] 126 | 127 | self.results = "results" 128 | if not os.path.exists(self.results): 129 | os.mkdir(self.results) 130 | 131 | def tearDown(self): 132 | shutil.rmtree(self.results) 133 | 134 | @unittest.skip('Visualizations are deprecated') 135 | def test_visualization(self): 136 | model = mixedlm("x1 + x2", self.table, self.metadata, 137 | groups="groups") 138 | model.fit() 139 | lme_summary(self.results, model, self.tree) 140 | pvals = pd.read_csv(os.path.join(self.results, 'pvalues.csv'), 141 | index_col=0) 142 | coefs = pd.read_csv(os.path.join(self.results, 'coefficients.csv'), 143 | index_col=0) 144 | pred = pd.read_csv(os.path.join(self.results, 'predicted.csv'), 145 | index_col=0) 146 | resid = pd.read_csv(os.path.join(self.results, 'residuals.csv'), 147 | index_col=0) 148 | 149 | exp_pvals = pd.DataFrame({ 150 | 'Intercept': {'Y1': 4.8268860492262526e-236, 151 | 'Y2': 0.099411090631406948}, 152 | 'Group Var': {'Y1': 4.4193804668281966e-05, 153 | 'Y2': 4.4193804668280984e-05}, 154 | 'x1': {'Y1': 3.9704936434633392e-35, 155 | 'Y2': 3.9704936434628853e-35}, 156 | 'x2': {'Y1': 3.56912071867573e-30, 157 | 'Y2': 3.56912071867573e-30}}).sort_index(axis=1) 158 | pvals = pvals.sort_index(axis=0).sort_index(axis=1) 159 | exp_pvals = exp_pvals.sort_index(axis=0).sort_index(axis=1) 160 | 161 | npt.assert_allclose(pvals, exp_pvals, rtol=1e-5) 162 | 163 | exp_coefs = pd.DataFrame({ 164 | 'Intercept': {'Y1': 4.2115280233151946, 165 | 'Y2': 0.211528023315187}, 166 | 'Group Var': {'Y1': 0.093578639287859755, 167 | 'Y2': 0.093578639287860019}, 168 | 'x1': {'Y1': 1.0220072967452645, 169 | 'Y2': 1.0220072967452651}, 170 | 'x2': {'Y1': 0.92487193877761575, 171 | 'Y2': 0.92487193877761564}} 172 | ).sort_index(axis=1) 173 | 174 | npt.assert_allclose(coefs.sort_index(axis=0), 175 | exp_coefs.sort_index(axis=0), 176 | rtol=1e-2, atol=1e-2) 177 | 178 | exp_resid = pd.read_csv(get_data_path('exp_resid.csv'), index_col=0) 179 | npt.assert_allclose(resid, exp_resid.T, rtol=1e-2, atol=1e-2) 180 | 181 | exp_pred = pd.read_csv(get_data_path('exp_pred.csv'), index_col=0) 182 | npt.assert_allclose(pred, exp_pred.T, rtol=1e-2, atol=1e-2) 183 | 184 | 185 | if __name__ == "__main__": 186 | unittest.main() 187 | 
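The tests above exercise the regression workflow end to end: construct a model with `ols` (or `mixedlm`), call `fit()`, then inspect `coefficients()`, `residuals()`, and the summary tables. The snippet below is a minimal sketch of that workflow on made-up toy data, assembled only from calls that already appear in the tests and docstrings in this repository; it assumes gneiss, numpy, and pandas are installed, and the toy balances, covariate, and variable names are illustrative rather than part of the source tree.

```
# Minimal sketch of the gneiss regression workflow, mirroring the calls
# exercised in the tests above (model construction -> fit -> inspection).
# The toy balances, covariate, and variable names here are illustrative only.
import numpy as np
import pandas as pd

from gneiss.regression import ols

np.random.seed(0)
n = 15
x1 = np.linspace(0.01, 0.1, n)                  # a single covariate
y = 1 + 4.2 * x1 + np.random.normal(size=n)     # toy response

# balance table: samples as rows, one column per internal node of an ilr tree
balances = pd.DataFrame({'y0': -y / 10, 'y1': -y})
metadata = pd.DataFrame({'x1': x1})

model = ols(formula="x1", table=balances, metadata=metadata)
model.fit()

print(model.coefficients())   # covariate-by-balance coefficient table
print(model.residuals())      # residuals, still in balance coordinates
```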
-------------------------------------------------------------------------------- /gneiss/plot/tests/test_dendrogram.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | import numpy as np 10 | import pandas as pd 11 | from skbio import DistanceMatrix, TreeNode 12 | from gneiss.plot._dendrogram import (Dendrogram, UnrootedDendrogram, 13 | SquareDendrogram) 14 | from scipy.cluster.hierarchy import ward 15 | import pandas.util.testing as pdt 16 | 17 | 18 | class mock(Dendrogram): 19 | # mock dendrogram class to make sure that inheritance 20 | # is working as expected. 21 | def rescale(self, width, height): 22 | pass 23 | 24 | 25 | class TestDendrogram(unittest.TestCase): 26 | 27 | def test_cache_ntips(self): 28 | dm = DistanceMatrix.from_iterable([0, 1, 2, 3], 29 | lambda x, y: np.abs(x - y)) 30 | lm = ward(dm.condensed_form()) 31 | ids = np.arange(4).astype(np.str) 32 | t = mock.from_linkage_matrix(lm, ids) 33 | 34 | t._cache_ntips() 35 | 36 | self.assertEqual(t.leafcount, 4) 37 | self.assertEqual(t.children[0].leafcount, 2) 38 | self.assertEqual(t.children[1].leafcount, 2) 39 | self.assertEqual(t.children[0].children[0].leafcount, 1) 40 | self.assertEqual(t.children[0].children[1].leafcount, 1) 41 | self.assertEqual(t.children[1].children[0].leafcount, 1) 42 | self.assertEqual(t.children[1].children[1].leafcount, 1) 43 | 44 | 45 | class TestUnrootedDendrogram(unittest.TestCase): 46 | 47 | def setUp(self): 48 | np.random.seed(0) 49 | x = np.random.rand(10) 50 | dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y)) 51 | lm = ward(dm.condensed_form()) 52 | ids = np.arange(len(x)).astype(np.str) 53 | self.tree = TreeNode.from_linkage_matrix(lm, ids) 54 | 55 | # initialize tree with branch length and named internal nodes 56 | for i, n in enumerate(self.tree.postorder(include_self=True)): 57 | n.length = 1 58 | if not n.is_tip(): 59 | n.name = "y%d" % i 60 | 61 | def test_from_tree(self): 62 | t = UnrootedDendrogram.from_tree(self.tree) 63 | self.assertEqual(t.__class__, UnrootedDendrogram) 64 | 65 | def test_coords(self): 66 | t = UnrootedDendrogram.from_tree(self.tree) 67 | 68 | exp = pd.DataFrame({'0': [404.097, 396.979, np.nan, np.nan, True], 69 | '1': [464.724, 174.338, np.nan, np.nan, True], 70 | '2': [487.5, 43.2804, np.nan, np.nan, True], 71 | '3': [446.172, 359.095, np.nan, np.nan, True], 72 | '4': [32.4704, 456.72, np.nan, np.nan, True], 73 | '5': [438.468, 14.9717, np.nan, np.nan, True], 74 | '6': [81.5024, 485.028, np.nan, np.nan, True], 75 | '7': [54.5748, 34.9421, np.nan, np.nan, True], 76 | '8': [12.5, 72.8265, np.nan, np.nan, True], 77 | '9': [55.2464, 325.662, np.nan, np.nan, True], 78 | 'y10': [366.837, 313.291, '0', '3', False], 79 | 'y14': [419.421, 104.579, '2', '5', False], 80 | 'y15': [373.617, 183.914, '1', 'y14', False], 81 | 'y16': [305.539, 245.212, 'y10', 'y15', False], 82 | 'y17': [214.432, 254.788, 'y7', 'y16', False], 83 | 'y18': [153.134, 186.709, 'y2', 'y17', False], 84 | 'y2': [91.8354, 118.631, '7', '8', False], 85 | 'y6': [100.549, 395.421, '4', '6', False], 86 | 'y7': [146.353, 316.086, '9', 'y6', False]}, 87 | index=['x', 
'y', 'child0', 'child1', 'is_tip']).T 88 | 89 | res = t.coords(500, 500) 90 | exp = exp.loc[res.index] 91 | pdt.assert_frame_equal(exp, res) 92 | 93 | def test_rescale(self): 94 | t = UnrootedDendrogram.from_tree(self.tree) 95 | self.assertAlmostEqual(t.rescale(500, 500), 91.608680314971238, 96 | places=5) 97 | 98 | def test_update_coordinates(self): 99 | t = UnrootedDendrogram.from_tree(self.tree) 100 | exp = pd.DataFrame([(-0.59847214410395644, -1.6334372886412185), 101 | (-0.99749498660405445, -0.76155647142658189), 102 | (1.0504174348855488, 0.34902579063315775), 103 | (2.8507394969018511, 0.88932809650129752), 104 | (3.3688089449017027, 0.082482736278627664), 105 | (0.81247946938427551, -3.4080712447257464), 106 | (-0.13677590240930079, -3.5433843164696093), 107 | (-1.6101831260150372, -1.1190611577178871), 108 | (-1.6176088321192579, 0.76057470265451865), 109 | (-0.69694851846105044, 1.0284925540912822)]) 110 | 111 | res = pd.DataFrame(t.update_coordinates(1, 0, 0, 2, 1)) 112 | pdt.assert_frame_equal(res, exp, check_less_precise=True) 113 | 114 | 115 | class TestSquareDendrogram(unittest.TestCase): 116 | 117 | def setUp(self): 118 | np.random.seed(0) 119 | self.table = pd.DataFrame(np.random.random((5, 5))) 120 | num_otus = 5 # otus 121 | x = np.random.rand(num_otus) 122 | dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y)) 123 | lm = ward(dm.condensed_form()) 124 | t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str)) 125 | self.tree = SquareDendrogram.from_tree(t) 126 | 127 | for i, n in enumerate(t.postorder()): 128 | if not n.is_tip(): 129 | n.name = "y%d" % i 130 | n.length = np.random.rand() * 3 131 | 132 | def test_from_tree(self): 133 | t = SquareDendrogram.from_tree(self.tree) 134 | self.assertEqual(t.__class__, SquareDendrogram) 135 | 136 | def test_coords(self): 137 | # just test to make sure that the coordinates are calculated properly. 138 | t = SquareDendrogram.from_tree(self.tree) 139 | 140 | exp = pd.DataFrame({'0': [20, 2.5, np.nan, np.nan, True], 141 | '1': [20, 3.5, np.nan, np.nan, True], 142 | '2': [20, 4.5, np.nan, np.nan, True], 143 | '3': [20, 1.5, np.nan, np.nan, True], 144 | '4': [20, 0.5, np.nan, np.nan, True], 145 | 'y5': [14.25, 1, '3', '4', False], 146 | 'y6': [9.5, 1.75, '0', 'y5', False], 147 | 'y7': [4.75, 2.625, '1', 'y6', False], 148 | 'y8': [0, 3.5625, '2', 'y7', False]}, 149 | index=['x', 'y', 'child0', 'child1', 'is_tip']).T 150 | 151 | res = t.coords(width=20, height=self.table.shape[0]) 152 | exp = exp.loc[res.index] 153 | pdt.assert_frame_equal(exp, res) 154 | 155 | def test_rescale(self): 156 | t = SquareDendrogram.from_tree(self.tree) 157 | res = t.rescale(10, 10) 158 | self.assertEqual(res, 2.5) 159 | 160 | 161 | if __name__ == "__main__": 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /gneiss/regression/tests/test_model.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | from skbio import TreeNode 10 | import numpy as np 11 | from skbio.stats.composition import ilr_inv, clr_inv 12 | import statsmodels.formula.api as smf 13 | import pandas.util.testing as pdt 14 | from gneiss.regression._model import RegressionModel 15 | from gneiss.balances import balance_basis 16 | import unittest 17 | import os 18 | 19 | 20 | # create some mock classes for testing 21 | class submock(RegressionModel): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | self.results = [] 25 | 26 | def summary(self): 27 | print("OK!") 28 | 29 | def predict(self, **kwargs): 30 | pass 31 | 32 | def fit(self, **kwargs): 33 | """ Fit the model """ 34 | for s in self.submodels: 35 | # assumes that the underlying submodels have implemented `fit`. 36 | m = s.fit(**kwargs) 37 | 38 | self.results.append(m) 39 | 40 | coef = pd.DataFrame() 41 | for r in self.results: 42 | c = r.params 43 | c.name = r.model.endog_names 44 | coef = coef.append(c) 45 | self._beta = coef.T 46 | 47 | resid = pd.DataFrame() 48 | for r in self.results: 49 | err = r.resid 50 | err.name = r.model.endog_names 51 | resid = resid.append(err) 52 | self._resid = resid.T 53 | 54 | pvals = pd.DataFrame() 55 | for r in self.results: 56 | p = r.pvalues 57 | p.name = r.model.endog_names 58 | pvals = pvals.append(p) 59 | 60 | self.pvalues = pvals 61 | 62 | self._fitted = True 63 | 64 | 65 | class TestRegressionModel(unittest.TestCase): 66 | def setUp(self): 67 | self.pickle_fname = "test.pickle" 68 | self.data = pd.DataFrame([[1, 1, 1], 69 | [3, 2, 3], 70 | [4, 3, 2], 71 | [5, 4, 4], 72 | [2, 5, 3], 73 | [3, 6, 5], 74 | [4, 7, 4]], 75 | index=['s1', 's2', 's3', 's4', 76 | 's5', 's6', 's7'], 77 | columns=['Y1', 'Y2', 'X']) 78 | self.model1 = smf.ols(formula="Y1 ~ X", data=self.data) 79 | self.model2 = smf.ols(formula="Y2 ~ X", data=self.data) 80 | self.tree = TreeNode.read(['((a,b)Y1, c)Y2;']) 81 | self.basis = pd.DataFrame(clr_inv(balance_basis(self.tree)[0]), 82 | columns=['a', 'b', 'c'], 83 | index=['Y1', 'Y2']) 84 | self.balances = pd.DataFrame(self.data[['Y1', 'Y2']], 85 | index=self.data.index, 86 | columns=['Y1', 'Y2']) 87 | 88 | def tearDown(self): 89 | if os.path.exists(self.pickle_fname): 90 | os.remove(self.pickle_fname) 91 | 92 | def test_regression_results_pvalues(self): 93 | # checks to see if pvalues are calculated correctly. 
94 | 95 | submodels = [self.model1, self.model2] 96 | res = submock(Y=self.balances, Xs=None) 97 | submock.submodels = submodels 98 | res.fit() 99 | exp = pd.DataFrame({'Intercept': [0.307081, 0.972395], 100 | 'X': [0.211391, 0.029677]}, 101 | index=['Y1', 'Y2']) 102 | pdt.assert_frame_equal(res.pvalues, exp, 103 | check_exact=False, 104 | check_less_precise=True) 105 | 106 | def test_regression_results_coefficient(self): 107 | exp_coef = pd.DataFrame({'Intercept': [1.447368, -0.052632], 108 | 'X': [0.539474, 1.289474]}, 109 | index=['Y1', 'Y2']).T 110 | submodels = [self.model1, self.model2] 111 | res = submock(Y=self.balances, Xs=None) 112 | submock.submodels = submodels 113 | res.fit() 114 | res_coef = res.coefficients() 115 | pdt.assert_frame_equal(res_coef, exp_coef, 116 | check_exact=False, 117 | check_less_precise=True) 118 | 119 | def test_regression_results_coefficient_projection(self): 120 | tree = TreeNode.read([r'(c, (a, b)Y2)Y1;']) 121 | exp_coef = pd.DataFrame( 122 | np.array([[0.47802399, 0.44373548, 0.07824052], 123 | [0.11793186, 0.73047731, 0.15159083]]).T, 124 | columns=['Intercept', 'X'], 125 | index=['a', 'b', 'c']) 126 | 127 | submodels = [self.model1, self.model2] 128 | res = submock(Y=self.balances, Xs=None) 129 | submock.submodels = submodels 130 | res.fit() 131 | res_coef = res.coefficients(tree).T 132 | res_coef = res_coef.sort_index() 133 | 134 | pdt.assert_frame_equal(res_coef, exp_coef, 135 | check_exact=False, 136 | check_less_precise=True) 137 | 138 | def test_regression_results_residuals_projection(self): 139 | tree = TreeNode.read([r'(c, (a, b)Y2)Y1;']) 140 | basis, _ = balance_basis(tree) 141 | exp_resid = pd.DataFrame({'s1': [-0.986842, -0.236842], 142 | 's2': [-0.065789, -1.815789], 143 | 's3': [1.473684, 0.473684], 144 | 's4': [1.394737, -1.105263], 145 | 's5': [-1.065789, 1.184211], 146 | 's6': [-1.144737, -0.394737], 147 | 's7': [0.394737, 1.894737]}, 148 | index=['Y1', 'Y2']).T 149 | exp_resid = pd.DataFrame(ilr_inv(exp_resid, basis), 150 | index=['s1', 's2', 's3', 's4', 151 | 's5', 's6', 's7'], 152 | columns=['c', 'a', 'b']) 153 | 154 | submodels = [self.model1, self.model2] 155 | res = submock(Y=self.balances, Xs=None) 156 | submock.submodels = submodels 157 | res.fit() 158 | res_resid = res.residuals(tree).sort_index() 159 | pdt.assert_frame_equal(res_resid, exp_resid, 160 | check_exact=False, 161 | check_less_precise=True) 162 | 163 | def test_regression_results_residuals(self): 164 | exp_resid = pd.DataFrame({'s1': [-0.986842, -0.236842], 165 | 's2': [-0.065789, -1.815789], 166 | 's3': [1.473684, 0.473684], 167 | 's4': [1.394737, -1.105263], 168 | 's5': [-1.065789, 1.184211], 169 | 's6': [-1.144737, -0.394737], 170 | 's7': [0.394737, 1.894737]}, 171 | index=['Y1', 'Y2']).T 172 | submodels = [self.model1, self.model2] 173 | res = submock(Y=self.balances, Xs=None) 174 | submock.submodels = submodels 175 | res.fit() 176 | 177 | pdt.assert_frame_equal(res.residuals(), exp_resid, 178 | check_exact=False, 179 | check_less_precise=True) 180 | 181 | 182 | if __name__ == "__main__": 183 | unittest.main() 184 | -------------------------------------------------------------------------------- /gneiss/cluster/_pba.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 
5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import numpy as np 9 | import pandas as pd 10 | from gneiss.sort import mean_niche_estimator 11 | from gneiss.util import match, rename_internal_nodes 12 | from gneiss.composition._variance import variation_matrix 13 | 14 | from skbio import TreeNode, DistanceMatrix 15 | from scipy.cluster.hierarchy import linkage 16 | 17 | 18 | def correlation_linkage(X, method='ward'): 19 | r""" 20 | Hierarchical Clustering based on proportionality. 21 | 22 | The hierarchy is built based on the proportionality between 23 | any two pairs of features. Specifically, the proportionality between 24 | two features :math:`x` and :math:`y` is measured by 25 | 26 | .. math:: 27 | p(x, y) = var (\ln \frac{x}{y}) 28 | 29 | If :math:`p(x, y)` is very small, then :math:`x` and :math:`y` 30 | are said to be highly proportional. A hierarchical clustering is 31 | then performed using this proportionality as the distance metric. 32 | 33 | This can be useful for constructing principal balances [1]_. 34 | 35 | Parameters 36 | ---------- 37 | X : pd.DataFrame 38 | Contingency table where the samples are rows and the features 39 | are columns. 40 | method : str 41 | Clustering method. (default='ward') 42 | 43 | Returns 44 | ------- 45 | skbio.TreeNode 46 | Tree for constructing principal balances. 47 | 48 | References 49 | ---------- 50 | 51 | .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R. 52 | Principal Balances (2011). 53 | 54 | Examples 55 | -------- 56 | >>> import pandas as pd 57 | >>> from gneiss.cluster import correlation_linkage 58 | >>> table = pd.DataFrame([[1, 1, 0, 0, 0], 59 | ... [0, 1, 1, 0, 0], 60 | ... [0, 0, 1, 1, 0], 61 | ... [0, 0, 0, 1, 1]], 62 | ... columns=['s1', 's2', 's3', 's4', 's5'], 63 | ... index=['o1', 'o2', 'o3', 'o4']).T 64 | >>> tree = correlation_linkage(table+0.1) 65 | >>> print(tree.ascii_art()) 66 | /-o1 67 | /y1------| 68 | | \-o2 69 | -y0------| 70 | | /-o3 71 | \y2------| 72 | \-o4 73 | """ 74 | dm = variation_matrix(X) 75 | lm = linkage(dm.condensed_form(), method=method) 76 | t = TreeNode.from_linkage_matrix(lm, X.columns) 77 | t = rename_internal_nodes(t) 78 | return t 79 | 80 | 81 | def rank_linkage(r, method='average'): 82 | r""" Hierarchical Clustering on feature ranks. 83 | 84 | The hierarchy is built based on the rank values of the features given 85 | an input vector `r` of ranks. The distance between two features :math:`x` 86 | and :math:`y` can be defined as 87 | 88 | .. math:: 89 | d(x, y) = (r(x) - r(y))^2 90 | 91 | Where :math:`r(x)` is the rank of feature :math:`x`. Hierarchical clustering is 92 | then performed using :math:`d(x, y)` as the distance metric. 93 | 94 | This can be useful for constructing principal balances. 95 | 96 | Parameters 97 | ---------- 98 | r : pd.Series 99 | Continuous vector representing some ordering of the features. 100 | method : str 101 | Clustering method. (default='average') 102 | 103 | Returns 104 | ------- 105 | skbio.TreeNode 106 | Tree for constructing principal balances. 107 | 108 | Examples 109 | -------- 110 | >>> import pandas as pd 111 | >>> from gneiss.cluster import rank_linkage 112 | >>> ranks = pd.Series([1, 2, 4, 5], 113 | ... 
index=['o1', 'o2', 'o3', 'o4']) 114 | >>> tree = rank_linkage(ranks) 115 | >>> print(tree.ascii_art()) 116 | /-o1 117 | /y1------| 118 | | \-o2 119 | -y0------| 120 | | /-o3 121 | \y2------| 122 | \-o4 123 | """ 124 | dm = DistanceMatrix.from_iterable(r, lambda a, b: np.abs(b-a)) 125 | lm = linkage(dm.condensed_form(), method) 126 | t = TreeNode.from_linkage_matrix(lm, r.index) 127 | t = rename_internal_nodes(t) 128 | return t 129 | 130 | 131 | def gradient_linkage(X, y, method='average'): 132 | r""" 133 | Hierarchical Clustering on known gradient. 134 | 135 | The hierarchy is built based on the values of the samples 136 | located along a gradient. Given a feature :math:`x`, the mean gradient 137 | values that :math:`x` was observed in is calculated by 138 | 139 | .. math:: 140 | f(g , x) = 141 | \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j} 142 | 143 | Where :math:`N` is the number of samples, :math:`x_i` is the proportion of 144 | feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value 145 | at sample `i`. 146 | 147 | The distance between two features :math:`x` and :math:`y` can be defined as 148 | 149 | .. math:: 150 | d(x, y) = (f(g, x) - f(g, y))^2 151 | 152 | If :math:`d(x, y)` is very small, then :math:`x` and :math:`y` 153 | are expected to live in very similar positions across the gradient. 154 | A hierarchical clustering is then performed using :math:`d(x, y)` as 155 | the distance metric. 156 | 157 | This can be useful for constructing principal balances. 158 | 159 | Parameters 160 | ---------- 161 | X : pd.DataFrame 162 | Contingency table where the samples are rows and the features 163 | are columns. 164 | y : pd.Series 165 | Continuous vector representing some ordering of the samples in X. 166 | method : str 167 | Clustering method. (default='average') 168 | 169 | Returns 170 | ------- 171 | skbio.TreeNode 172 | Tree for constructing principal balances. 173 | 174 | See Also 175 | -------- 176 | mean_niche_estimator 177 | 178 | Examples 179 | -------- 180 | >>> import pandas as pd 181 | >>> from gneiss.cluster import gradient_linkage 182 | >>> table = pd.DataFrame([[1, 1, 0, 0, 0], 183 | ... [0, 1, 1, 0, 0], 184 | ... [0, 0, 1, 1, 0], 185 | ... [0, 0, 0, 1, 1]], 186 | ... columns=['s1', 's2', 's3', 's4', 's5'], 187 | ... index=['o1', 'o2', 'o3', 'o4']).T 188 | >>> gradient = pd.Series([1, 2, 3, 4, 5], 189 | ... index=['s1', 's2', 's3', 's4', 's5']) 190 | >>> tree = gradient_linkage(table, gradient) 191 | >>> print(tree.ascii_art()) 192 | /-o1 193 | /y1------| 194 | | \-o2 195 | -y0------| 196 | | /-o3 197 | \y2------| 198 | \-o4 199 | """ 200 | _X, _y = match(X, y) 201 | mean_X = mean_niche_estimator(_X, gradient=_y) 202 | t = rank_linkage(mean_X) 203 | return t 204 | 205 | 206 | def random_linkage(n): 207 | """ Generates a tree with random topology. 208 | 209 | Parameters 210 | ---------- 211 | n : int 212 | Number of nodes in the tree 213 | 214 | Returns 215 | ------- 216 | skbio.TreeNode 217 | Random tree for constructing principal balances. 218 | 219 | Examples 220 | -------- 221 | >>> from gneiss.cluster import random_linkage 222 | >>> tree = random_linkage(10) 223 | 224 | Notes 225 | ----- 226 | The nodes will be labeled from 0 to n. 
227 | """ 228 | index = np.arange(n).astype(np.str) 229 | x = pd.Series(np.random.rand(n), index=index) 230 | t = rank_linkage(x) 231 | return t 232 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make <target>' where <target> is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files."
80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gneiss.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gneiss.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/gneiss" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/gneiss" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 
175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | gneiss documentation 2 | ======================== 3 | 4 | This guide contains instructions for building the gneiss documentation, as 5 | well as guidelines for contributing to the documentation. 6 | 7 | Building the documentation 8 | -------------------------- 9 | 10 | To build the documentation, you'll need a gneiss development environment 11 | set up. See [CONTRIBUTING.md](../CONTRIBUTING.md) for instructions. 12 | 13 | **Important:** The documentation will be built for whatever version of 14 | gneiss is *currently installed* on your system (i.e., the version imported 15 | by ```import gneiss```). This may not match the code located in this repository. 16 | You will need to either install this version of gneiss somewhere (e.g., in 17 | a virtualenv) or point your ```PYTHONPATH``` environment variable to this code, 18 | *before* building the documentation. 19 | 20 | To build the documentation, assuming you are at the top-level gneiss 21 | directory: 22 | 23 | make -C doc clean html 24 | 25 | The built HTML documentation will be at ```doc/build/html/index.html```. 26 | 27 | Contributing to the documentation 28 | --------------------------------- 29 | 30 | If you would like to contribute to the documentation, whether by adding 31 | something entirely new or by modifying existing documentation, please first 32 | review our [gneiss contribution guide](../CONTRIBUTING.md). 33 | 34 | Before submitting your changes, ensure that the documentation builds without 35 | errors or warnings. 
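If gneiss is not installed but you want to build the docs against this checkout, one way to follow the ```PYTHONPATH``` note above (shown purely as an illustration; adjust to your own setup) is to set it for the build command, again from the top-level gneiss directory:

    PYTHONPATH=$(pwd) make -C doc clean html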
36 | 37 | ### Documentation guidelines 38 | 39 | Most of gneiss's API documentation is automatically generated from 40 | [docstrings](http://legacy.python.org/dev/peps/pep-0257/#what-is-a-docstring). 41 | The advantage to this approach is that users can access the documentation in an 42 | interactive Python session or from our website as HTML. Other output formats 43 | are also possible, such as PDF. 44 | 45 | gneiss docstrings follow the [numpydoc conventions](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt). 46 | This ensures that the docstrings are easily readable both from the interpreter 47 | and HTML, PDF, etc. Please read the numpydoc guidelines before continuing. 48 | 49 | ### Documenting a module in gneiss 50 | 51 | In addition to following the numpydoc conventions for docstrings, we have a few 52 | more conventions that will ensure your documentation is correctly built and 53 | linked within our website, and that it maintains consistency with the rest of 54 | the gneiss docs. 55 | 56 | The easiest way to get started with documenting your code is to look at the 57 | docstrings in existing gneiss modules. An example of a module to start with 58 | is ```gneiss.balances```. Go ahead and look 59 | through those now. We've structured our docs in a similar way to 60 | [SciPy's documentation](http://docs.scipy.org/doc/scipy/reference/), so that 61 | may be another good place to look for examples. 62 | 63 | We'll take a top-down approach by discussing how to document a new module that 64 | you'd like to add to gneiss (let's call it ```gneiss/example.py```). 65 | 66 | #### Module docstring 67 | 68 | The first thing you'll need to add is a docstring for the module. The docstring 69 | must start at the first line of the file. It should start with a title for the 70 | module: 71 | 72 | """ 73 | Documentation examples (:mod:`gneiss.example`) 74 | ============================================= 75 | 76 | It is important to include the ```:mod:``` Sphinx directive in the title, as 77 | this title will be included in the table of contents. Also make sure that the 78 | title underline is the same length as the title. 79 | 80 | We also need to include another Sphinx directive below this: 81 | 82 | .. currentmodule:: gneiss.example 83 | 84 | This directive tells Sphinx that other classes, functions, etc. that we will 85 | reference are located in the ```gneiss.example``` module. 86 | 87 | Next, include a more detailed description of the module. For example: 88 | 89 | This module consists of several example classes and functions to illustrate 90 | the gneiss documentation system. 91 | 92 | Following that, list any classes, functions, and exceptions that you'd like 93 | documentation generated for. Note that you do *not* need to include every 94 | single class, function, or exception that is defined in the module. Also, you 95 | do not need to list class methods, as those will be automatically included in 96 | the generated class documentation. Only include objects that should be exposed 97 | as part of the public API. 98 | 99 | For example: 100 | 101 | Classes 102 | ------- 103 | 104 | .. autosummary:: 105 | :toctree: generated/ 106 | 107 | ExampleClass1 108 | ExampleClass2 109 | 110 | Functions 111 | --------- 112 | 113 | .. autosummary:: 114 | :toctree: generated/ 115 | 116 | example_function1 117 | example_function2 118 | 119 | Exceptions 120 | ---------- 121 | 122 | .. 
autosummary:: 123 | :toctree: generated/ 124 | 125 | ExampleError 126 | 127 | The ```autosummary``` directives are important as they generate RST files in 128 | the ```generated/``` directory for each object. A single-line summary and link 129 | to each object is inserted into the page for you. 130 | 131 | After listing public module members, we encourage a usage example section 132 | showing how to use some of the module's functionality. Examples should be 133 | written in [doctest](http://docs.python.org/3/library/doctest.html) format so 134 | that they can be automatically tested (e.g., using ```make test```). 135 | 136 | Examples 137 | -------- 138 | 139 | Run the ``example_function1`` function: 140 | 141 | >>> from gneiss.example import example_function1 142 | >>> example_function1("hello", "world") 143 | hello world! 144 | 145 | You can also embed the plots that an example generates into the built 146 | documentation with the ```.. plot::``` directive. For example: 147 | 148 | .. plot:: 149 | 150 | >>> import pandas as pd 151 | >>> df = pd.DataFrame({'col1': [1, 2, 3, 4], 'col2': [10, 11, 12, 13]}) 152 | >>> fig = df.boxplot() 153 | 154 | This will include the plot, a link to the source code used to generate the 155 | plot, and links to different image formats (e.g., PNG and PDF) so that users 156 | can easily download the plot. 157 | 158 | You're now ready to document the members of your module. 159 | 160 | #### Documenting module members 161 | 162 | When documenting the members of a module (e.g., classes, methods, attributes, 163 | functions, and exceptions), follow the numpydoc conventions. In addition to 164 | these conventions, there are a few things to keep in mind: 165 | 166 | - When documenting a class, only public methods and attributes are included in 167 | the built documentation. If a method or attribute starts with an 168 | underscore, it is assumed to be private. 169 | 170 | - When documenting a class, include the ```Parameters``` section in the class 171 | docstring, instead of in the ```__init__``` docstring. While numpydoc 172 | technically supports either form, ```__init__``` is not included in the list 173 | of methods by default and thus should have its documentation included in the 174 | class docstring. 175 | 176 | #### Including the module in the docs 177 | 178 | Until now, we've only been editing docstrings, which are attached to Python 179 | code. The final step is to hook up this new module's docstrings to the 180 | documentation build system: 181 | 182 | 1. Make sure you're within the ```gneiss/doc``` directory. 183 | 2. Create a new file with the same name as your module under the ```source``` 184 | directory. Do not include ```gneiss``` as part of the name, and use 185 | ```.rst``` as the suffix. For example, ```source/example.rst```. 186 | 3. Add the following line to ```source/example.rst``` to have your module's 187 | docstring pulled into the document: 188 | 189 | ``` 190 | .. automodule:: gneiss.example 191 | ``` 192 | 193 | 4. Add the following line to ```source/index.rst``` to add the new page to the 194 | top-level table of contents: 195 | 196 | ``` 197 | example 198 | ``` 199 | 200 | That's it! You can now try building the documentation, which should include the 201 | documentation for your new module! 202 | 203 | ### Documenting a subpackage in gneiss 204 | 205 | The process of documenting a subpackage is very similar to documenting a module 206 | in gneiss. 
The only difference is that the module docstring goes in the 207 | subpackage's ```__init__.py```. 208 | 209 | ### Troubleshooting 210 | 211 | If things aren't working correctly, try running ```make clean``` and then 212 | rebuild the docs. If things still aren't working, try building the docs 213 | *without* your changes, and see if there are any Sphinx errors or warnings. 214 | Make note of these, and then see what new errors or warnings are generated when 215 | you add your changes again. 216 | 217 | ### Acknowledgements 218 | This documentation guide lines are adapted from scikit-bio's guide line. -------------------------------------------------------------------------------- /gneiss/plot/tests/test_heatmap.py: -------------------------------------------------------------------------------- 1 | from gneiss.plot import heatmap 2 | from gneiss.plot._heatmap import _sort_table 3 | 4 | import pandas as pd 5 | import pandas.util.testing as pdt 6 | from skbio import TreeNode, DistanceMatrix 7 | from scipy.cluster.hierarchy import ward 8 | from gneiss.plot._dendrogram import SquareDendrogram 9 | from gneiss.util import block_diagonal 10 | from gneiss.cluster import rank_linkage 11 | import numpy as np 12 | import numpy.testing.utils as npt 13 | import unittest 14 | 15 | 16 | class HeatmapTest(unittest.TestCase): 17 | def setUp(self): 18 | np.random.seed(0) 19 | self.table = pd.DataFrame(np.random.random((5, 5)), 20 | index=['0', '1', '2', '3', '4'], 21 | columns=['0', '1', '2', '3', '4']) 22 | 23 | num_otus = 5 # otus 24 | x = np.random.rand(num_otus) 25 | dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y)) 26 | lm = ward(dm.condensed_form()) 27 | t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str)) 28 | self.t = SquareDendrogram.from_tree(t) 29 | self.md = pd.Series(['a', 'a', 'a', 'b', 'b'], 30 | index=['0', '1', '2', '3', '4']) 31 | for i, n in enumerate(t.postorder()): 32 | if not n.is_tip(): 33 | n.name = "y%d" % i 34 | n.length = np.random.rand() * 3 35 | 36 | self.highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'], 37 | 'y6': ['#0000FF', '#F0000F']}).T 38 | 39 | def test_sort_table(self): 40 | table = pd.DataFrame( 41 | [[1, 1, 0, 0, 0], 42 | [0, 1, 1, 0, 0], 43 | [0, 0, 1, 1, 0], 44 | [0, 0, 0, 1, 1]], 45 | columns=['s1', 's2', 's3', 's4', 's5'], 46 | index=['o1', 'o2', 'o3', 'o4']) 47 | mdvar = pd.Series(['a', 'b', 'a', 'b', 'a'], 48 | index=['s1', 's2', 's3', 's4', 's5']) 49 | res_table, res_mdvar = _sort_table(table, mdvar) 50 | pdt.assert_index_equal(pd.Index(['s1', 's3', 's5', 's2', 's4']), 51 | res_mdvar.index) 52 | pdt.assert_index_equal(pd.Index(['s1', 's3', 's5', 's2', 's4']), 53 | res_table.columns) 54 | 55 | @unittest.skip('Visualizations are deprecated') 56 | def test_basic(self): 57 | fig = heatmap(self.table, self.t, self.md, 58 | figsize=(5, self.table.shape[0])) 59 | 60 | # Test to see if the lineages of the tree are ok 61 | lines = list(fig.get_axes()[0].get_lines()) 62 | 63 | exp_coords = np.array([[14.25, 0.5], 64 | [14.25, 1.], 65 | [14.25, 1.], 66 | [20., 1.], 67 | [9.5, 1.25], 68 | [9.5, 2.], 69 | [9.5, 2.], 70 | [20., 2.], 71 | [4.75, 2.125], 72 | [4.75, 3.], 73 | [4.75, 3.], 74 | [20., 3.], 75 | [0., 3.0625], 76 | [0., 4.], 77 | [0., 4.], 78 | [20., 4.], 79 | [14.25, 0.5], 80 | [14.25, 0.], 81 | [14.25, 0.], 82 | [20., 0.], 83 | [9.5, 1.25], 84 | [9.5, 0.5], 85 | [9.5, 0.5], 86 | [14.25, 0.5], 87 | [4.75, 2.125], 88 | [4.75, 1.25], 89 | [4.75, 1.25], 90 | [9.5, 1.25], 91 | [0., 3.0625], 92 | [0., 2.125], 93 | [0., 2.125], 94 
| [4.75, 2.125]]) 95 | 96 | res = np.vstack([i._xy for i in lines]) 97 | 98 | npt.assert_allclose(exp_coords, res) 99 | 100 | # Make sure that the metadata labels are set properly 101 | res = str(fig.get_axes()[1].get_xticklabels(minor=True)[0]) 102 | self.assertEqual(res, "Text(0, 0, 'a')") 103 | 104 | res = str(fig.get_axes()[1].get_xticklabels(minor=True)[1]) 105 | self.assertEqual(res, "Text(0, 0, 'b')") 106 | 107 | res = str(fig.get_axes()[1].get_xlabel()) 108 | self.assertEqual(res, "") 109 | 110 | def test_basic_line_width(self): 111 | fig = heatmap(self.table, self.t, self.md, 112 | figsize=(5, self.table.shape[0]), linewidth=1) 113 | 114 | # Test to see if the lineages of the tree are ok 115 | lines = list(fig.get_axes()[1].get_lines()) 116 | widths = [L.get_lw() for L in lines] 117 | np.allclose(widths, [1.0] * len(widths)) 118 | 119 | @unittest.skip('Visualizations are deprecated') 120 | def test_highlights(self): 121 | 122 | table = pd.DataFrame(block_diagonal(ncols=5, nrows=5, nblocks=2), 123 | index=['0', '1', '2', '3', '4'], 124 | columns=['0', '1', '2', '3', '4']) 125 | t = rank_linkage(pd.Series([1, 2, 3, 4, 5], 126 | index=['0', '1', '2', '3', '4'])) 127 | t = SquareDendrogram.from_tree(t) 128 | md = pd.Series(['a', 'a', 'a', 'b', 'b'], 129 | index=['0', '1', '2', '3', '4']) 130 | for i, n in enumerate(t.postorder()): 131 | if not n.is_tip(): 132 | n.name = "y%d" % i 133 | n.length = np.random.rand() * 3 134 | 135 | highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'], 136 | 'y7': ['#0000FF', '#F0000F']}).T 137 | 138 | fig = heatmap(table, t, md, highlights) 139 | 140 | # Test to see if the lineages of the tree are ok 141 | lines = list(fig.get_axes()[0].get_lines()) 142 | 143 | pts = self.t.coords(width=20, height=self.table.shape[0]) 144 | pts['y'] = pts['y'] - 0.5 # account for offset 145 | pts['x'] = pts['x'].astype(np.float) 146 | pts['y'] = pts['y'].astype(np.float) 147 | 148 | exp_coords = np.array([[6.33333333, 3.5], 149 | [6.33333333, 4.], 150 | [6.33333333, 4.], 151 | [20., 4.], 152 | [12.66666667, 0.5], 153 | [12.66666667, 1.], 154 | [12.66666667, 1.], 155 | [20., 1.], 156 | [6.33333333, 1.25], 157 | [6.33333333, 2.], 158 | [6.33333333, 2.], 159 | [20., 2.], 160 | [0., 2.375], 161 | [0., 3.5], 162 | [0., 3.5], 163 | [6.33333333, 3.5], 164 | [6.33333333, 3.5], 165 | [6.33333333, 3.], 166 | [6.33333333, 3.], 167 | [20., 3.], 168 | [12.66666667, 0.5], 169 | [12.66666667, 0.], 170 | [12.66666667, 0.], 171 | [20., 0.], 172 | [6.33333333, 1.25], 173 | [6.33333333, 0.5], 174 | [6.33333333, 0.5], 175 | [12.66666667, 0.5], 176 | [0., 2.375], 177 | [0., 1.25], 178 | [0., 1.25], 179 | [6.33333333, 1.25]]) 180 | 181 | res = np.vstack([i._xy for i in lines]) 182 | 183 | npt.assert_allclose(exp_coords, res) 184 | 185 | # Make sure that the metadata labels are set properly 186 | res = str(fig.get_axes()[2].get_xticklabels(minor=True)[0]) 187 | self.assertEqual(res, "Text(0, 0, 'a')") 188 | 189 | res = str(fig.get_axes()[2].get_xticklabels(minor=True)[1]) 190 | self.assertEqual(res, "Text(0, 0, 'b')") 191 | 192 | print([str(i) for i in fig.get_axes()[1].get_xticklabels()]) 193 | # Make sure that the highlight labels are set properly 194 | res = str(fig.get_axes()[1].get_xticklabels()[0]) 195 | self.assertEqual(res, "Text(0, 0, 'y8')") 196 | 197 | res = str(fig.get_axes()[1].get_xticklabels()[1]) 198 | self.assertEqual(res, "Text(0, 0, 'y7')") 199 | 200 | # Test to see if the highlights are ok 201 | res = fig.get_axes()[2].get_position()._points 202 | exp = np.array([[0.24, 
0.1], 203 | [0.808, 0.9]]) 204 | npt.assert_allclose(res, exp) 205 | 206 | 207 | if __name__ == "__main__": 208 | unittest.main() 209 | -------------------------------------------------------------------------------- /gneiss/sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sort functions (:mod:`gneiss.sort`) 3 | =================================== 4 | 5 | .. currentmodule:: gneiss.sort 6 | 7 | This module contains sorting functions that sort contingency tables 8 | in addition to trees. 9 | 10 | Functions 11 | --------- 12 | 13 | .. autosummary:: 14 | :toctree: generated/ 15 | 16 | mean_niche_estimator 17 | niche_sort 18 | ladderize 19 | gradient_sort 20 | """ 21 | # ---------------------------------------------------------------------------- 22 | # Copyright (c) 2016--, gneiss development team. 23 | # 24 | # Distributed under the terms of the Modified BSD License. 25 | # 26 | # The full license is in the file COPYING.txt, distributed with this software. 27 | # ---------------------------------------------------------------------------- 28 | import numpy as np 29 | import pandas as pd 30 | from functools import partial 31 | from gneiss.util import match 32 | 33 | 34 | def mean_niche_estimator(abundances, gradient): 35 | r""" Estimates the mean niche along a gradient of an organism. 36 | 37 | Calculates the mean niche of an organism along a gradient. 38 | This is done by calculating the mean gradient values that 39 | an organism is observed in. 40 | 41 | Specifically, this module calculates the following 42 | 43 | .. math:: 44 | f(g , x) = 45 | \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j} 46 | 47 | 48 | Where :math:`N` is the number of samples, :math:`x_i` is the proportion of 49 | species :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value 50 | at sample `i`. 51 | 52 | Parameters 53 | ---------- 54 | abundances : pd.DataFrame or pd.Series, np.float 55 | Vector of fraction abundances of an organism over a list of samples. 56 | gradient : pd.Series, np.float 57 | Vector of numerical gradient values. 58 | 59 | Returns 60 | ------- 61 | pd.Series or np.float : 62 | The mean gradient that the feature is observed in. 63 | If `abundances` is a `pd.DataFrame` containing the mean gradient 64 | values for each feature. Otherwise a float is returned. 65 | 66 | Raises 67 | ------ 68 | ValueError: 69 | If the length of `abundances` is not the same length as `gradient`. 70 | ValueError: 71 | If the length of `gradient` contains nans. 72 | """ 73 | len_abundances = len(abundances) 74 | len_gradient = len(gradient) 75 | if len_abundances != len_gradient: 76 | raise ValueError("Length of `abundances` (%d) doesn't match the length" 77 | " of the `gradient` (%d)" % (len_abundances, 78 | len_gradient)) 79 | if np.any(pd.isnull(gradient)): 80 | raise ValueError("`gradient` cannot have any nans.") 81 | 82 | # normalizes the proportions of the organism across all of the 83 | # samples to add to 1. 84 | v = abundances / abundances.sum() 85 | m = np.dot(gradient, v) 86 | if isinstance(abundances, pd.DataFrame): 87 | m = pd.Series(m, index=abundances.columns) 88 | return m 89 | 90 | 91 | def niche_sort(table, gradient, niche_estimator=mean_niche_estimator): 92 | """ Sort the table according to estimated niches. 93 | 94 | Sorts the table by samples along the gradient 95 | and otus by their estimated niche along the gradient. 
96 | 97 | Parameters 98 | ---------- 99 | table : pd.DataFrame 100 | Contingency table where samples are rows and features (i.e. OTUs) 101 | are columns. 102 | gradient : pd.Series 103 | Vector of numerical gradient values. 104 | niche_estimator : function, optional 105 | A function that takes in two pandas series and returns an ordered 106 | object. The ability for the object to be ordered is critical, since 107 | this will allow the table to be sorted according to this ordering. 108 | By default, `mean_niche_estimator` will be used. 109 | 110 | Returns 111 | ------- 112 | pd.DataFrame : 113 | Sorted table according to the gradient of the samples, and the niches 114 | of the organisms along that gradient. 115 | 116 | Raises 117 | ------ 118 | ValueError : 119 | Raised if `niche_estimator` is not a function. 120 | """ 121 | if not callable(niche_estimator): 122 | raise ValueError("`niche_estimator` is not a function.") 123 | 124 | table, gradient = match(table, gradient) 125 | niche_estimator = partial(niche_estimator, gradient=gradient) 126 | 127 | # normalizes feature abundances to sum to 1, for each sample. 128 | # (i.e. scales values in each row to sum to 1). 129 | normtable = table.apply(lambda x: x / x.sum(), axis=1) 130 | 131 | # calculates estimated niche for each feature 132 | est_niche = normtable.apply(niche_estimator, axis=0) 133 | gradient = gradient.sort_values() 134 | est_niche = est_niche.sort_values() 135 | 136 | table = table.reindex(index=gradient.index, 137 | columns=est_niche.index) 138 | return table 139 | 140 | 141 | def _cache_ntips(tree): 142 | for n in tree.postorder(include_self=True): 143 | if n.is_tip(): 144 | n._n_tips = 1 145 | else: 146 | n._n_tips = sum(c._n_tips for c in n.children) 147 | return tree 148 | 149 | 150 | def ladderize(tree, ascending=True): 151 | r""" 152 | Sorts tree according to the size of the subtrees. 153 | 154 | Parameters 155 | ---------- 156 | tree : skbio.TreeNode 157 | Input tree where leafs correspond to features. 158 | 159 | Returns 160 | ------- 161 | skbio.TreeNode 162 | A tree whose tips are sorted according to subtree size. 163 | 164 | Examples 165 | -------- 166 | >>> from skbio import TreeNode 167 | >>> tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 168 | >>> print(tree.ascii_art()) 169 | /-a 170 | /c-------| 171 | | \-b 172 | -r-------| 173 | | /-g 174 | | /e-------| 175 | \d-------| \-h 176 | | 177 | \-f 178 | >>> sorted_tree = ladderize(tree) 179 | >>> print(sorted_tree.ascii_art()) 180 | /-a 181 | /c-------| 182 | | \-b 183 | -r-------| 184 | | /-f 185 | \d-------| 186 | | /-g 187 | \e-------| 188 | \-h 189 | """ 190 | sorted_tree = tree.copy() 191 | sorted_tree = _cache_ntips(tree) 192 | 193 | for n in sorted_tree.postorder(include_self=True): 194 | sizes = [k._n_tips for k in n.children] 195 | idx = np.argsort(sizes) 196 | if not ascending: 197 | idx = idx[::-1] 198 | n.children = [n.children[i] for i in idx] 199 | return sorted_tree 200 | 201 | 202 | def gradient_sort(tree, gradient, ascending=True): 203 | r""" 204 | Sorts tree according to ordering in gradient. 205 | 206 | Parameters 207 | ---------- 208 | tree : skbio.TreeNode 209 | Input tree where leafs correspond to features 210 | contained in the index in `gradient`. 211 | gradient : pd.Series, numeric 212 | Gradient where the index correspond to feature names. 213 | The index in the gradient must be consistent with 214 | names of the tips in the `tree`. 
215 | 216 | Returns 217 | ------- 218 | skbio.TreeNode 219 | A tree whose tips are sorted along the gradient. 220 | 221 | Examples 222 | -------- 223 | >>> from skbio import TreeNode 224 | >>> import pandas as pd 225 | >>> tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 226 | >>> x = pd.Series({'f':3, 'g':1, 'h':2, 'a':4, 'b':5}) 227 | >>> print(tree.ascii_art()) 228 | /-a 229 | /c-------| 230 | | \-b 231 | -r-------| 232 | | /-g 233 | | /e-------| 234 | \d-------| \-h 235 | | 236 | \-f 237 | >>> res = gradient_sort(tree, x) 238 | >>> print(res.ascii_art()) 239 | /-g 240 | /e-------| 241 | /d-------| \-h 242 | | | 243 | -r-------| \-f 244 | | 245 | | /-a 246 | \c-------| 247 | \-b 248 | """ 249 | sorted_tree = tree.copy() 250 | if not np.issubdtype(gradient, np.number): 251 | raise ValueError('`gradient` needs to be numeric, not %s' % 252 | gradient.dtype) 253 | 254 | # Note that this operation is not optimal 255 | # See https://github.com/biocore/gneiss/issues/58 256 | for n in sorted_tree.postorder(include_self=True): 257 | means = [gradient.loc[list(k.subset())].mean() for k in n.children] 258 | idx = np.argsort(means) 259 | if not ascending: 260 | idx = idx[::-1] 261 | n.children = [n.children[i] for i in idx] 262 | return sorted_tree 263 | -------------------------------------------------------------------------------- /gneiss/balances.py: -------------------------------------------------------------------------------- 1 | """ 2 | Balances (:mod:`gneiss.balances`) 3 | 4 | ================================= 5 | 6 | .. currentmodule:: gneiss.balances 7 | 8 | This module contains modules for calculating balances and creating ETE 9 | objects to visualize these balances on a tree. 10 | 11 | Functions 12 | --------- 13 | 14 | .. autosummary:: 15 | :toctree: generated/ 16 | 17 | balance_basis 18 | 19 | """ 20 | # ---------------------------------------------------------------------------- 21 | # Copyright (c) 2016--, gneiss development team. 22 | # 23 | # Distributed under the terms of the Modified BSD License. 24 | # 25 | # The full license is in the file COPYING.txt, distributed with this software. 26 | # ---------------------------------------------------------------------------- 27 | 28 | 29 | from __future__ import division 30 | import numpy as np 31 | from skbio.stats.composition import clr_inv 32 | from collections import OrderedDict 33 | from gneiss.util import NUMERATOR, DENOMINATOR 34 | from scipy.sparse import coo_matrix 35 | 36 | 37 | def _balance_basis(tree_node): 38 | """ Helper method for calculating balance basis 39 | """ 40 | counts, n_tips = _count_matrix(tree_node) 41 | counts = OrderedDict([(x, counts[x]) 42 | for x in counts.keys() if not x.is_tip()]) 43 | nds = counts.keys() 44 | r = np.array([counts[n]['r'] for n in nds]) 45 | s = np.array([counts[n]['l'] for n in nds]) 46 | k = np.array([counts[n]['k'] for n in nds]) 47 | t = np.array([counts[n]['t'] for n in nds]) 48 | 49 | a = np.sqrt(s / (r * (r + s))) 50 | b = -1 * np.sqrt(r / (s * (r + s))) 51 | 52 | basis = np.zeros((n_tips - 1, n_tips)) 53 | for i in range(len(nds)): 54 | basis[i, :] = np.array( 55 | [0] * k[i] + [a[i]] * r[i] + [b[i]] * s[i] + [0] * t[i]) 56 | # Make sure that the basis is in level order 57 | basis = basis[:, ::-1] 58 | nds = [n.name for n in nds] 59 | return basis, nds 60 | 61 | 62 | def balance_basis(tree_node): 63 | """ 64 | Determines the basis based on bifurcating tree. 65 | 66 | This is commonly referred to as sequential binary partition [1]_. 
67 | Given a binary tree relating a list of features, this module can 68 | be used to calculate an orthonormal basis, which is used to 69 | calculate the ilr transform. 70 | 71 | Parameters 72 | ---------- 73 | treenode : skbio.TreeNode 74 | Input bifurcating tree. Must be strictly bifurcating 75 | (i.e. every internal node needs to have exactly 2 children). 76 | 77 | Returns 78 | ------- 79 | basis : np.array 80 | Returns a set of orthonormal bases in the Aitchison simplex 81 | corresponding to the tree. The order of the 82 | basis is index by the level order of the internal nodes. 83 | nodes : list, skbio.TreeNode 84 | List of tree nodes indicating the ordering in the basis. 85 | 86 | Raises 87 | ------ 88 | ValueError 89 | The tree doesn't contain two branches. 90 | 91 | Examples 92 | -------- 93 | >>> from gneiss.balances import balance_basis 94 | >>> from skbio import TreeNode 95 | >>> tree = u"((b,c)a, d)root;" 96 | >>> t = TreeNode.read([tree]) 97 | >>> basis, nodes = balance_basis(t) 98 | >>> basis 99 | array([[0.18507216, 0.18507216, 0.62985567], 100 | [0.14002925, 0.57597535, 0.28399541]]) 101 | 102 | Notes 103 | ----- 104 | The tree must be strictly bifurcating, meaning that 105 | every internal node has exactly 2 children. 106 | 107 | See Also 108 | -------- 109 | skbio.stats.composition.ilr 110 | 111 | References 112 | ---------- 113 | .. [1] J.J. Egozcue and V. Pawlowsky-Glahn "Exploring Compositional Data 114 | with the CoDa-Dendrogram" (2011) 115 | 116 | """ 117 | basis, nodes = _balance_basis(tree_node) 118 | basis = clr_inv(basis) 119 | return basis, nodes 120 | 121 | 122 | def _count_matrix(treenode): 123 | n_tips = 0 124 | nodes = list(treenode.levelorder(include_self=True)) 125 | # fill in the Ordered dictionary. Note that the 126 | # elements of this Ordered dictionary are 127 | # dictionaries. 128 | counts = OrderedDict() 129 | columns = ['k', 'r', 'l', 't', 'tips'] 130 | for n in nodes: 131 | if n not in counts: 132 | counts[n] = {} 133 | for c in columns: 134 | counts[n][c] = 0 135 | 136 | # fill in r and l. This is done in reverse level order. 137 | for n in nodes[::-1]: 138 | if n.is_tip(): 139 | counts[n]['tips'] = 1 140 | n_tips += 1 141 | elif len(n.children) == 2: 142 | lchild = n.children[0] 143 | rchild = n.children[1] 144 | counts[n]['r'] = counts[rchild]['tips'] 145 | counts[n]['l'] = counts[lchild]['tips'] 146 | counts[n]['tips'] = counts[n]['r'] + counts[n]['l'] 147 | else: 148 | raise ValueError("Not a strictly bifurcating tree!") 149 | 150 | # fill in k and t 151 | for n in nodes: 152 | if n.parent is None: 153 | counts[n]['k'] = 0 154 | counts[n]['t'] = 0 155 | continue 156 | elif n.is_tip(): 157 | continue 158 | # left or right child 159 | # left = 0, right = 1 160 | child_idx = 'l' if n.parent.children[0] != n else 'r' 161 | if child_idx == 'l': 162 | counts[n]['t'] = counts[n.parent]['t'] + counts[n.parent]['l'] 163 | counts[n]['k'] = counts[n.parent]['k'] 164 | else: 165 | counts[n]['k'] = counts[n.parent]['k'] + counts[n.parent]['r'] 166 | counts[n]['t'] = counts[n.parent]['t'] 167 | return counts, n_tips 168 | 169 | 170 | def sparse_balance_basis(tree): 171 | """ Calculates sparse representation of an ilr basis from a tree. 172 | 173 | This computes an orthonormal basis specified from a bifurcating tree. 174 | 175 | Parameters 176 | ---------- 177 | tree : skbio.TreeNode 178 | Input bifurcating tree. Must be strictly bifurcating 179 | (i.e. every internal node needs to have exactly 2 children). 180 | This is used to specify the ilr basis. 
181 | 182 | Returns 183 | ------- 184 | scipy.sparse.coo_matrix 185 | The ilr basis required to perform the ilr_inv transform. 186 | This is also known as the sequential binary partition. 187 | Note that this matrix is represented in clr coordinates. 188 | nodes : list, str 189 | List of tree nodes indicating the ordering in the basis. 190 | 191 | Raises 192 | ------ 193 | ValueError 194 | The tree doesn't contain two branches. 195 | 196 | """ 197 | # this is inspired by @wasade in 198 | # https://github.com/biocore/gneiss/pull/8 199 | t = tree.copy() 200 | D = len(list(tree.tips())) 201 | # calculate number of tips under each node 202 | for n in t.postorder(include_self=True): 203 | if n.is_tip(): 204 | n._tip_count = 1 205 | else: 206 | if len(n.children) == 2: 207 | left, right = n.children[NUMERATOR], n.children[DENOMINATOR], 208 | else: 209 | raise ValueError("Not a strictly bifurcating tree.") 210 | n._tip_count = left._tip_count + right._tip_count 211 | 212 | # calculate k, r, s, t coordinate for each node 213 | left, right = t.children[NUMERATOR], t.children[DENOMINATOR], 214 | t._k, t._r, t._s, t._t = 0, left._tip_count, right._tip_count, 0 215 | for n in t.preorder(include_self=False): 216 | if n.is_tip(): 217 | n._k, n._r, n._s, n._t = 0, 0, 0, 0 218 | 219 | elif n == n.parent.children[NUMERATOR]: 220 | n._k = n.parent._k 221 | n._r = n.children[NUMERATOR]._tip_count 222 | n._s = n.children[DENOMINATOR]._tip_count 223 | n._t = n.parent._s + n.parent._t 224 | elif n == n.parent.children[DENOMINATOR]: 225 | n._k = n.parent._r + n.parent._k 226 | n._r = n.children[NUMERATOR]._tip_count 227 | n._s = n.children[DENOMINATOR]._tip_count 228 | n._t = n.parent._t 229 | else: 230 | raise ValueError("Tree topology is not correct.") 231 | 232 | # navigate through tree to build the basis in a sparse matrix form 233 | value = [] 234 | row, col = [], [] 235 | nodes = [] 236 | i = 0 237 | 238 | for n in t.levelorder(include_self=True): 239 | 240 | if n.is_tip(): 241 | continue 242 | 243 | for j in range(n._k, n._k + n._r): 244 | row.append(i) 245 | # consider tips in reverse order. May want to rethink 246 | # this orientation in the future. 247 | col.append(D - 1 - j) 248 | A = np.sqrt(n._s / (n._r * (n._s + n._r))) 249 | 250 | value.append(A) 251 | 252 | for j in range(n._k + n._r, n._k + n._r + n._s): 253 | row.append(i) 254 | col.append(D - 1 - j) 255 | B = -np.sqrt(n._r / (n._s * (n._s + n._r))) 256 | 257 | value.append(B) 258 | i += 1 259 | nodes.append(n.name) 260 | 261 | basis = coo_matrix((value, (row, col)), shape=(D - 1, D)) 262 | 263 | return basis, nodes 264 | -------------------------------------------------------------------------------- /gneiss/tests/test_sort.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import numpy as np 9 | import pandas as pd 10 | import unittest 11 | from gneiss.sort import (niche_sort, mean_niche_estimator, 12 | ladderize, gradient_sort) 13 | import pandas.util.testing as pdt 14 | from skbio import TreeNode 15 | 16 | 17 | class TestSort(unittest.TestCase): 18 | def setUp(self): 19 | pass 20 | 21 | def test_mean_niche_estimator1(self): 22 | gradient = pd.Series( 23 | [1, 2, 3, 4, 5], 24 | index=['s1', 's2', 's3', 's4', 's5']) 25 | values = pd.Series( 26 | [1, 1, 0, 0, 0], 27 | index=['s1', 's2', 's3', 's4', 's5']) 28 | m = mean_niche_estimator(values, gradient) 29 | self.assertEqual(m, 1.5) 30 | 31 | def test_mean_niche_estimator2(self): 32 | gradient = pd.Series( 33 | [1, 2, 3, 4, 5], 34 | index=['s1', 's2', 's3', 's4', 's5']) 35 | values = pd.Series( 36 | [1, 3, 0, 0, 0], 37 | index=['s1', 's2', 's3', 's4', 's5']) 38 | m = mean_niche_estimator(values, gradient) 39 | self.assertEqual(m, 1.75) 40 | 41 | def test_mean_niche_estimator_frame(self): 42 | gradient = pd.Series( 43 | [1, 2, 3, 4, 5], 44 | index=['s1', 's2', 's3', 's4', 's5']) 45 | values = pd.DataFrame( 46 | np.array([[1, 3, 0, 0, 0], 47 | [1, 3, 0, 0, 0]]).T, 48 | index=['s1', 's2', 's3', 's4', 's5'], 49 | columns=['o1', 'o2']) 50 | m = mean_niche_estimator(values, gradient) 51 | exp = pd.Series([1.75, 1.75], index=['o1', 'o2']) 52 | pdt.assert_series_equal(m, exp) 53 | 54 | def test_mean_niche_estimator_bad_length(self): 55 | gradient = pd.Series( 56 | [1, 2, 3, 4, 5], 57 | index=['s1', 's2', 's3', 's4', 's5']) 58 | values = pd.Series( 59 | [1, 3, 0, 0, 0, 0], 60 | index=['s1', 's2', 's3', 's4', 's5', 's6']) 61 | 62 | with self.assertRaises(ValueError): 63 | mean_niche_estimator(values, gradient) 64 | 65 | def test_mean_niche_estimator_missing(self): 66 | gradient = pd.Series( 67 | [1, 2, 3, 4, np.nan], 68 | index=['s1', 's2', 's3', 's4', 's5']) 69 | values = pd.Series( 70 | [1, 3, 0, 0, 0], 71 | index=['s1', 's2', 's3', 's4', 's5']) 72 | 73 | with self.assertRaises(ValueError): 74 | mean_niche_estimator(values, gradient) 75 | 76 | def test_basic_niche_sort(self): 77 | table = pd.DataFrame( 78 | [[1, 1, 0, 0, 0], 79 | [0, 1, 1, 0, 0], 80 | [0, 0, 1, 1, 0], 81 | [0, 0, 0, 1, 1]], 82 | columns=['s1', 's2', 's3', 's4', 's5'], 83 | index=['o1', 'o2', 'o3', 'o4']).T 84 | gradient = pd.Series( 85 | [1, 2, 3, 4, 5], 86 | index=['s1', 's2', 's3', 's4', 's5']) 87 | res_table = niche_sort(table, gradient) 88 | pdt.assert_frame_equal(table, res_table) 89 | 90 | def test_basic_niche_sort_error(self): 91 | table = pd.DataFrame( 92 | [[1, 1, 0, 0, 0], 93 | [0, 1, 1, 0, 0], 94 | [0, 0, 1, 1, 0], 95 | [0, 0, 0, 1, 1]], 96 | columns=['s1', 's2', 's3', 's4', 's5'], 97 | index=['o1', 'o2', 'o3', 'o4']).T 98 | gradient = pd.Series( 99 | [1, 2, 3, 4, 5], 100 | index=['s1', 's2', 's3', 's4', 's5']) 101 | with self.assertRaises(ValueError): 102 | niche_sort(table, gradient, niche_estimator='rawr') 103 | 104 | def test_basic_niche_sort_scrambled(self): 105 | # Swap samples s1 and s2 and features o1 and o2 to see if this can 106 | # obtain the original table structure. 
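        # (Illustrative note added to this dump, not in the original test: the
        # estimator tests above are consistent with mean_niche_estimator
        # computing an abundance-weighted mean of the gradient, e.g. for
        # values [1, 3, 0, 0, 0] over gradient [1, 2, 3, 4, 5]:
        #     (1*1 + 3*2) / (1 + 3) = 7 / 4 = 1.75
        # niche_sort is then expected to order samples by the gradient and
        # features by this estimate, which is why the scrambled table below
        # should come back in the original o1..o4 / s1..s5 order.)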
107 | table = pd.DataFrame( 108 | [[1, 0, 1, 0, 0], 109 | [1, 1, 0, 0, 0], 110 | [0, 0, 1, 1, 0], 111 | [0, 0, 0, 1, 1]], 112 | columns=['s2', 's1', 's3', 's4', 's5'], 113 | index=['o2', 'o1', 'o3', 'o4']).T 114 | 115 | gradient = pd.Series( 116 | [2, 1, 3, 4, 5], 117 | index=['s2', 's1', 's3', 's4', 's5']) 118 | 119 | exp_table = pd.DataFrame( 120 | [[1, 1, 0, 0, 0], 121 | [0, 1, 1, 0, 0], 122 | [0, 0, 1, 1, 0], 123 | [0, 0, 0, 1, 1]], 124 | columns=['s1', 's2', 's3', 's4', 's5'], 125 | index=['o1', 'o2', 'o3', 'o4']).T 126 | 127 | res_table = niche_sort(table, gradient) 128 | 129 | pdt.assert_frame_equal(exp_table, res_table) 130 | 131 | def test_basic_niche_sort_lambda(self): 132 | table = pd.DataFrame( 133 | [[1, 1, 0, 0, 0], 134 | [0, 0, 1, 1, 0], 135 | [0, 1, 1, 0, 0], 136 | [0, 0, 0, 1, 1]], 137 | columns=['s1', 's2', 's3', 's4', 's5'], 138 | index=['o1', 'o3', 'o2', 'o4']).T 139 | gradient = pd.Series( 140 | [1, 2, 3, 4, 5], 141 | index=['s1', 's2', 's3', 's4', 's5']) 142 | 143 | exp_table = pd.DataFrame( 144 | [[1, 1, 0, 0, 0], 145 | [0, 1, 1, 0, 0], 146 | [0, 0, 1, 1, 0], 147 | [0, 0, 0, 1, 1]], 148 | columns=['s1', 's2', 's3', 's4', 's5'], 149 | index=['o1', 'o2', 'o3', 'o4']).T 150 | 151 | def _dumb_estimator(v, gradient): 152 | v[v > 0] = 1 153 | values = v / v.sum() 154 | return np.dot(gradient, values) 155 | 156 | res_table = niche_sort(table, gradient, 157 | niche_estimator=_dumb_estimator) 158 | pdt.assert_frame_equal(exp_table, res_table) 159 | 160 | def test_basic_niche_sort_immutable(self): 161 | # Swap samples s1 and s2 and features o1 and o2 to see if this can 162 | # obtain the original table structure. 163 | table = pd.DataFrame( 164 | [[1, 0, 1, 0, 0], 165 | [1, 1, 0, 0, 0], 166 | [0, 0, 1, 1, 0], 167 | [0, 0, 0, 1, 1]], 168 | columns=['s2', 's1', 's3', 's4', 's5'], 169 | index=['o2', 'o1', 'o3', 'o4']).T 170 | 171 | gradient = pd.Series( 172 | [2, 1, 3, 4, 5], 173 | index=['s2', 's1', 's3', 's4', 's5']) 174 | 175 | exp_table = pd.DataFrame( 176 | [[1, 0, 1, 0, 0], 177 | [1, 1, 0, 0, 0], 178 | [0, 0, 1, 1, 0], 179 | [0, 0, 0, 1, 1]], 180 | columns=['s2', 's1', 's3', 's4', 's5'], 181 | index=['o2', 'o1', 'o3', 'o4']).T 182 | 183 | exp_gradient = pd.Series( 184 | [2, 1, 3, 4, 5], 185 | index=['s2', 's1', 's3', 's4', 's5']) 186 | 187 | niche_sort(table, gradient) 188 | pdt.assert_frame_equal(exp_table, table) 189 | pdt.assert_series_equal(exp_gradient, gradient) 190 | 191 | def test_ladderize1(self): 192 | # Makes sure that 1 subtree is ordered 193 | tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 194 | exp = '((a,b)c,(f,(g,h)e)d)r;\n' 195 | res = str(ladderize(tree)) 196 | self.assertEqual(exp, res) 197 | 198 | def test_ladderize2(self): 199 | # Makes sure that 2 subtrees are ordered 200 | tree = TreeNode.read([u'(((n,m)a,b)c, ((g,(i,j)h)e,f)d)r;']) 201 | exp = '((b,(n,m)a)c,(f,(g,(i,j)h)e)d)r;\n' 202 | res = str(ladderize(tree)) 203 | self.assertEqual(exp, res) 204 | 205 | def test_ladderize_descending(self): 206 | # Makes sure that 2 subtrees are ordered 207 | tree = TreeNode.read([u'(((n,m)a,b)c, ((g,(i,j)h)e,f)d)r;']) 208 | exp = '((((j,i)h,g)e,f)d,((m,n)a,b)c)r;\n' 209 | res = str(ladderize(tree, ascending=False)) 210 | self.assertEqual(exp, res) 211 | 212 | def test_gradient_sort(self): 213 | # Makes sure that the tree is sorted according 214 | # a pre-set ordering 215 | tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 216 | exp = '(((g,h)e,f)d,(a,b)c)r;\n' 217 | x = pd.Series({'f': 3, 'g': 1, 'h': 2, 'a': 4, 'b': 5}) 218 | res = str(gradient_sort(tree, 
x)) 219 | self.assertEqual(exp, res) 220 | 221 | def test_gradient_sort_descending(self): 222 | # Makes sure that the tree is sorted according 223 | # a pre-set ordering in descending order 224 | tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 225 | exp = '((b,a)c,(f,(h,g)e)d)r;\n' 226 | x = pd.Series({'f': 3, 'g': 1, 'h': 2, 'a': 4, 'b': 5}) 227 | res = str(gradient_sort(tree, x, ascending=False)) 228 | self.assertEqual(exp, res) 229 | 230 | def test_gradient_sort_error(self): 231 | # Makes sure that the tree is sorted according 232 | # a pre-set ordering 233 | tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;']) 234 | x = pd.Series({'f': 'x', 'g': 'y', 'h': 'z', 'a': 'u', 'b': 'dz'}) 235 | with self.assertRaises(ValueError): 236 | gradient_sort(tree, x) 237 | 238 | 239 | if __name__ == '__main__': 240 | unittest.main() 241 | -------------------------------------------------------------------------------- /ipynb/cfstudy/cfstudy-qiime2-tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this study, there were 18 patients with cystic fibrosis. The hypothesis was that there were two main microbial communities at play in the CF lung. One of these communities thrives at low pH, and the other community thrives at high pH. To test this, sputum samples were divided among 8 tubes, and each of the tubes was perturbed with a different pH. Here we will calculate balances, and test how these balances change with respect to pH, using linear mixed effects models." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "First, we'll want to filter out low abundance OTUs. This will not only remove potential confounders, but could also alleviate the issue with zeros. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 27 | " from ._conv import register_converters as _register_converters\n", 28 | "\u001b[32mSaved FeatureTable[Frequency] to: cfstudy_common_filt500.biom.qza\u001b[0m\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "!qiime feature-table filter-features \\\n", 34 | " --i-table cfstudy_common.biom.qza \\\n", 35 | " --o-filtered-table cfstudy_common_filt500.biom.qza \\\n", 36 | " --p-min-frequency 500" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Again, we will create the tree using pH. Note that we'll also want to reorder the OTU table for the balance calculations." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 56 | " from ._conv import register_converters as _register_converters\n", 57 | "\u001b[32mSaved Hierarchy to: ph_tree.nwk.qza\u001b[0m\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "!qiime gneiss gradient-clustering \\\n", 63 | " --i-table cfstudy_common_filt500.biom.qza \\\n", 64 | " --m-gradient-file cfstudy_modified_metadata.txt \\\n", 65 | " --m-gradient-column ph \\\n", 66 | " --o-clustering ph_tree.nwk.qza \\\n", 67 | " --p-weighted --verbose" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "Before running the linear mixed effects models using mixed we'll want to replace zeros with a pseudocount to approximate the uncertainity probability." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 87 | " from ._conv import register_converters as _register_converters\n", 88 | "\u001b[32mSaved FeatureTable[Composition] to: cf_composition.qza\u001b[0m\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "!qiime composition add-pseudocount \\\n", 94 | " --i-table cfstudy_common_filt500.biom.qza \\\n", 95 | " --p-pseudocount 1 \\\n", 96 | " --o-composition-table cf_composition.qza" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 109 | " from ._conv import register_converters as _register_converters\n", 110 | "\u001b[32mSaved FeatureTable[Balance] to: cf_balances.qza\u001b[0m\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "!qiime gneiss ilr-transform \\\n", 116 | " --i-table cf_composition.qza \\\n", 117 | " --i-tree ph_tree.nwk.qza \\\n", 118 | " --o-balances cf_balances.qza" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Now we can run the linear mixed effects models. pH is the only covariate being tested for and each of the patients are being accounted for by passing host_subject_id into groups. This is because the microbial differences between the patients are much larger than the pH effects, so we need to correct for this change, by treating each patient separately. This is why the linear mixed effects strategy is chosen." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 138 | " from ._conv import register_converters as _register_converters\n", 139 | "\u001b[32mSaved Visualization to: cf_linear_mixed_effects_model.qzv\u001b[0m\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "!qiime gneiss lme-regression \\\n", 145 | " --p-formula \"ph\" \\\n", 146 | " --i-table cf_balances.qza \\\n", 147 | " --i-tree ph_tree.nwk.qza \\\n", 148 | " --m-metadata-file cfstudy_modified_metadata.txt \\\n", 149 | " --p-groups host_subject_id \\\n", 150 | " --o-visualization cf_linear_mixed_effects_model" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "These summary results can be visualized in qiime2 visualization framework. Checkout [view.qiime2.org](https://view.qiime2.org)\n", 158 | "\n", 159 | "Let's further summarize the results of the linear mixed effects model. We'll plot the how one of the top balances change with respect to the pH." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "/Users/mortonjt/miniconda3/envs/qiime2-2018.2/lib/python3.5/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 172 | " from ._conv import register_converters as _register_converters\n", 173 | "\u001b[32mSaved Visualization to: y2_taxa_summary.qzv\u001b[0m\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "!qiime gneiss balance-taxonomy \\\n", 179 | " --i-table cf_composition.qza \\\n", 180 | " --i-tree ph_tree.nwk.qza \\\n", 181 | " --i-taxonomy cfstudy_taxonomy.qza \\\n", 182 | " --p-taxa-level 4 \\\n", 183 | " --p-balance-name 'y2' \\\n", 184 | " --m-metadata-file 'cfstudy_modified_metadata.txt' \\\n", 185 | " --m-metadata-column 'ph' \\\n", 186 | " --o-visualization y2_taxa_summary.qzv" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "source": [ 195 | "Similar to the 88soils example, there is a very obvious transition from low pH organisms to high pH organism as the pH increases. However, given that every patient has different microbes, so it is difficult to test for individual microbes abundances across patients. However, every patient has microbes that behave the same with respect to pH. Balances is a very powerful tool for addressing this, as it can allow for entire subcommunities to be tested, rather than just individual OTUs." 
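(Aside added for illustration, not part of the original notebook: conceptually, the lme-regression step above fits one mixed-effects model per balance, with pH as the fixed effect and a per-patient random intercept supplied through the host_subject_id grouping. A minimal per-balance sketch with statsmodels, assuming hypothetical `balances` and `metadata` DataFrames indexed by sample:

    import pandas as pd
    from statsmodels.formula.api import mixedlm

    df = pd.concat((balances[['y2']],
                    metadata[['ph', 'host_subject_id']]), axis=1)
    # pH as the fixed effect; one random intercept per patient, mirroring
    # the --p-groups host_subject_id option used above
    fit = mixedlm('y2 ~ ph', data=df, groups=df['host_subject_id']).fit()
    print(fit.summary())

The grouping is what absorbs the large patient-to-patient differences, so the pH coefficient reflects within-patient changes in the balance.)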
196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.5.5" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 1 220 | } 221 | -------------------------------------------------------------------------------- /gneiss/plot/tests/test_decompose.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | from gneiss.plot import balance_boxplot, balance_barplots, proportion_plot 10 | import numpy as np 11 | import pandas as pd 12 | import numpy.testing as npt 13 | import matplotlib.pyplot as plt 14 | from skbio import TreeNode 15 | 16 | 17 | class TestBoxplot(unittest.TestCase): 18 | def setUp(self): 19 | self.df = pd.DataFrame({ 20 | 'y': [-2, -2.2, -1.8, -1.5, -1, 1, 1.5, 2, 2.2, 1.8], 21 | 'group': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], 22 | 'hue': ['0', '1', '0', '1', '0', '1', '0', '1', '0', '1']} 23 | ) 24 | self.tree = TreeNode.read(['((c, d)z, (b,a)x)y;']) 25 | self.feature_df = pd.DataFrame( 26 | { 27 | 'type': ['tomato', 'carrots', 'apple', 'bacon'], 28 | 'food': ['vegatable', 'vegatable', 'fruit', 'meat'], 29 | 'seed': ['yes', 'no', 'yes', 'no'] 30 | }, 31 | index=["a", "b", "c", "d"] 32 | ) 33 | 34 | @unittest.skip('Visualizations are deprecated') 35 | def test_basic_boxplot(self): 36 | a = balance_boxplot('y', y='group', data=self.df) 37 | res = np.vstack([i._xy for i in a.get_lines()]) 38 | exp = np.array([[-2., 0.], 39 | [-2.2, 0.], 40 | [-1.5, 0.], 41 | [-1., 0.], 42 | [-2.2, -0.2], 43 | [-2.2, 0.2], 44 | [-1., -0.2], 45 | [-1., 0.2], 46 | [-1.8, -0.4], 47 | [-1.8, 0.4], 48 | [1.5, 1.], 49 | [1., 1.], 50 | [2., 1.], 51 | [2.2, 1.], 52 | [1., 0.8], 53 | [1., 1.2], 54 | [2.2, 0.8], 55 | [2.2, 1.2], 56 | [1.8, 0.6], 57 | [1.8, 1.4]]) 58 | npt.assert_allclose(res, exp) 59 | 60 | @unittest.skip('Visualizations are deprecated') 61 | def test_basic_hue_boxplot(self): 62 | a = balance_boxplot('y', y='group', hue='hue', data=self.df) 63 | res = np.vstack([i._xy for i in a.get_lines()]) 64 | exp = np.array([[-1.9, -0.2], 65 | [-2., -0.2], 66 | [-1.4, -0.2], 67 | [-1., -0.2], 68 | [-2., -0.298], 69 | [-2., -0.102], 70 | [-1., -0.298], 71 | [-1., -0.102], 72 | [-1.8, -0.396], 73 | [-1.8, -0.004], 74 | [-2.025, 0.2], 75 | [-2.2, 0.2], 76 | [-1.675, 0.2], 77 | [-1.5, 0.2], 78 | [-2.2, 0.102], 79 | [-2.2, 0.298], 80 | [-1.5, 0.102], 81 | [-1.5, 0.298], 82 | [-1.85, 0.004], 83 | [-1.85, 0.396], 84 | [1.675, 0.8], 85 | [1.5, 0.8], 86 | [2.025, 0.8], 87 | [2.2, 0.8], 88 | [1.5, 0.702], 89 | [1.5, 0.898], 90 | [2.2, 0.702], 91 | [2.2, 0.898], 92 | [1.85, 0.604], 93 | [1.85, 0.996], 94 | [1.4, 1.2], 95 | [1., 1.2], 96 | [1.9, 1.2], 97 | [2., 1.2], 98 | [1., 1.102], 99 | [1., 1.298], 100 | [2., 1.102], 101 | [2., 1.298], 102 | [1.8, 1.004], 103 | [1.8, 1.396]]) 104 | 
npt.assert_allclose(res, exp) 105 | 106 | @unittest.skip('Visualizations are deprecated') 107 | def test_basic_barplot(self): 108 | ax_denom, ax_num = balance_barplots(self.tree, 'y', header='food', 109 | feature_metadata=self.feature_df) 110 | 111 | 112 | class TestProportionPlot(unittest.TestCase): 113 | def setUp(self): 114 | self.table = pd.DataFrame({ 115 | 'A': [1, 1.2, 1.1, 2.1, 2.2, 2], 116 | 'B': [9.9, 10, 10.1, 2, 2.4, 2.1], 117 | 'C': [5, 3, 1, 2, 2, 3], 118 | 'D': [5, 5, 5, 5, 5, 5], 119 | }, index=['S1', 'S2', 'S3', 'S4', 'S5', 'S6']) 120 | 121 | self.feature_metadata = pd.DataFrame({ 122 | 'A': ['k__foo', 'p__bar', 'c__', 'o__', 'f__', 'g__', 's__'], 123 | 'B': ['k__foo', 'p__bar', 'c__', 'o__', 'f__', 'g__', 's__'], 124 | 'C': ['k__poo', 'p__tar', 'c__', 'o__', 'f__', 'g__', 's__'], 125 | 'D': ['k__poo', 'p__far', 'c__', 'o__', 'f__', 'g__', 's__'] 126 | }, index=['kingdom', 'phylum', 'class', 'order', 127 | 'family', 'genus', 'species']).T 128 | 129 | self.metadata = pd.DataFrame({ 130 | 'groups': ['X', 'X', 'X', 'Y', 'Y', 'Y'], 131 | 'dry': [1, 2, 3, 4, 5, 6] 132 | }, index=['S1', 'S2', 'S3', 'S4', 'S5', 'S6']) 133 | 134 | @unittest.skip('Visualizations are deprecated') 135 | def test_proportion_plot(self): 136 | np.random.seed(0) 137 | num_features = ['A', 'B'] 138 | denom_features = ['C', 'D'] 139 | ax1, ax2 = proportion_plot(self.table, self.metadata, 140 | 'groups', 'X', 'Y', 141 | num_features, denom_features, 142 | self.feature_metadata, 143 | label_col='phylum') 144 | res = np.vstack([L.get_xydata() for L in ax1.get_lines()]) 145 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 146 | 147 | npt.assert_allclose(res[:, 1], exp, verbose=True) 148 | 149 | res = np.vstack([L.get_xydata() for L in ax2.get_lines()]) 150 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 151 | 152 | npt.assert_allclose(res[:, 1], exp, verbose=True) 153 | 154 | res = [L._text for L in ax2.get_yticklabels()] 155 | exp = ['p__bar', 'p__bar', 'p__tar', 'p__far'] 156 | self.assertListEqual(res, exp) 157 | 158 | @unittest.skip('Visualizations are deprecated') 159 | def test_proportion_plot_order(self): 160 | self.maxDiff = None 161 | np.random.seed(0) 162 | # tests for different ordering 163 | num_features = ['A', 'B'] 164 | denom_features = ['D', 'C'] 165 | ax1, ax2 = proportion_plot(self.table, self.metadata, 166 | 'groups', 'X', 'Y', 167 | num_features, denom_features, 168 | self.feature_metadata, 169 | label_col='phylum') 170 | res = np.vstack([L.get_xydata() for L in ax1.get_lines()]) 171 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 172 | 173 | npt.assert_allclose(res[:, 1], exp, atol=1e-2, rtol=1e-2, verbose=True) 174 | 175 | res = np.vstack([L.get_xydata() for L in ax2.get_lines()]) 176 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 177 | 178 | npt.assert_allclose(res[:, 1], exp, atol=1e-2, rtol=1e-2, verbose=True) 179 | 180 | res = [L._text for L in ax2.get_yticklabels()] 181 | exp = ['p__bar', 'p__bar', 'p__far', 'p__tar'] 182 | self.assertListEqual(res, exp) 183 | 184 | @unittest.skip('Visualizations are deprecated') 185 | def test_proportion_plot_order_figure(self): 186 | self.maxDiff = None 187 | np.random.seed(0) 188 | # tests for different ordering 189 | fig, axes = plt.subplots(1, 2) 190 | 191 | num_features = ['A', 'B'] 192 | denom_features = ['D', 'C'] 193 | ax1, ax2 = proportion_plot(self.table, self.metadata, 194 | 'groups', 'X', 'Y', 195 | num_features, denom_features, 196 | self.feature_metadata, 197 | label_col='phylum', axes=axes) 198 | res = 
np.vstack([L.get_xydata() for L in ax1.get_lines()]) 199 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 200 | 201 | npt.assert_allclose(res[:, 1], exp, atol=1e-2, rtol=1e-2, verbose=True) 202 | 203 | res = np.vstack([L.get_xydata() for L in ax2.get_lines()]) 204 | exp = np.array([0., 0., 1., 1., 2., 2., 3., 3.]) 205 | 206 | npt.assert_allclose(res[:, 1], exp, atol=1e-2, rtol=1e-2, verbose=True) 207 | 208 | res = [L._text for L in ax2.get_yticklabels()] 209 | exp = ['p__bar', 'p__bar', 'p__far', 'p__tar'] 210 | self.assertListEqual(res, exp) 211 | 212 | def test_proportion_plot_original_labels(self): 213 | # tests for different ordering 214 | fig, axes = plt.subplots(1, 2) 215 | 216 | num_features = ['A', 'B'] 217 | denom_features = ['D', 'C'] 218 | ax1, ax2 = proportion_plot(self.table, self.metadata, 219 | 'groups', 'X', 'Y', 220 | num_features, denom_features, 221 | axes=axes) 222 | 223 | res = [L._text for L in ax2.get_yticklabels()] 224 | exp = ['A', 'B', 'D', 'C'] 225 | self.assertListEqual(res, exp) 226 | 227 | 228 | if __name__ == '__main__': 229 | unittest.main() 230 | -------------------------------------------------------------------------------- /gneiss/regression/tests/test_ols.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2016--, gneiss development team. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file COPYING.txt, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import os 9 | import shutil 10 | import unittest 11 | import numpy as np 12 | import pandas as pd 13 | import pandas.util.testing as pdt 14 | from skbio.stats.composition import ilr_inv 15 | from skbio import TreeNode 16 | from skbio.util import get_data_path 17 | from gneiss.regression import ols 18 | from gneiss.balances import balance_basis 19 | from statsmodels.regression.linear_model import OLS 20 | 21 | 22 | class TestOLS(unittest.TestCase): 23 | """ Tests OLS regression with refactored matrix multiplication. """ 24 | def setUp(self): 25 | np.random.seed(0) 26 | b01, b11, b21 = 1, 2, -3 27 | b02, b12, b22 = 2, -1, 4 28 | n = 50 29 | x1 = np.linspace(0, 10, n) 30 | x2 = np.linspace(10, 15, n) 31 | e = np.random.normal(size=n) * 10 32 | y1 = b01 + b11 * x1 + b21 * x2 + e 33 | e = np.random.normal(size=n) * 10 34 | y2 = b02 + b12 * x1 + b22 * x2 + e 35 | Y = pd.DataFrame(np.vstack((y1, y2)).T, 36 | columns=['y1', 'y2']) 37 | 38 | B = pd.DataFrame([[b01, b11, b21], 39 | [b02, b12, b22]]) 40 | 41 | X = pd.DataFrame( 42 | np.vstack((np.ones(n), x1, x2)).T, 43 | columns=['Intercept', 'x1', 'x2']) 44 | 45 | self.Y = Y 46 | self.B = B 47 | self.X = X 48 | self.r1_ = OLS(endog=y1, exog=X).fit() 49 | self.r2_ = OLS(endog=y2, exog=X).fit() 50 | self.tree = TreeNode.read(['(c, (b,a)y2)y1;']) 51 | 52 | self.results = "results" 53 | if not os.path.exists(self.results): 54 | os.mkdir(self.results) 55 | 56 | def tearDown(self): 57 | shutil.rmtree(self.results) 58 | 59 | def test_ols_immutable(self): 60 | # test to see if values in table get filtered out. 
61 | # and that the original table doesn't change 62 | table = self.Y 63 | x = pd.DataFrame(self.X.values, columns=self.X.columns, 64 | index=range(100, 100 + len(self.X.index))) 65 | metadata = pd.concat((self.X, x)) 66 | 67 | exp_metadata = metadata.copy() 68 | ols('x1 + x2', self.Y, self.X) 69 | self.assertEqual(str(table), str(self.Y)) 70 | self.assertEqual(str(metadata), str(exp_metadata)) 71 | 72 | def test_ols_missing_metadata(self): 73 | # test to see if values in table get filtered out. 74 | # and that the original table doesn't change 75 | table = self.Y 76 | y = pd.DataFrame(self.Y.values, columns=self.Y.columns, 77 | index=range(100, 100 + len(self.Y.index))) 78 | 79 | table = pd.concat((self.Y, y)) 80 | ids = np.arange(100, 100 + len(self.X.index)) 81 | x = pd.DataFrame([[np.nan] * len(self.X.columns)] * len(ids), 82 | columns=self.X.columns, index=ids) 83 | 84 | metadata = pd.concat((self.X, x)) 85 | model = ols('x1 + x2', table, metadata) 86 | model.fit() 87 | 88 | # test prediction 89 | exp = pd.DataFrame({'y1': self.r1_.predict(), 90 | 'y2': self.r2_.predict()}, 91 | index=self.Y.index) 92 | res = model.predict() 93 | 94 | pdt.assert_frame_equal(res, exp) 95 | 96 | def test_ols_test(self): 97 | 98 | model = ols('x1 + x2', self.Y, self.X) 99 | model.fit() 100 | 101 | # test pvalues 102 | exp = pd.DataFrame({'y1': self.r1_.pvalues, 103 | 'y2': self.r2_.pvalues}) 104 | pdt.assert_frame_equal(model.pvalues, exp) 105 | 106 | # test coefficients 107 | exp = pd.DataFrame({'y1': self.r1_.params, 108 | 'y2': self.r2_.params}) 109 | res = model.coefficients() 110 | pdt.assert_frame_equal(res, exp) 111 | 112 | # test residuals 113 | exp = pd.DataFrame({'y1': self.r1_.resid, 114 | 'y2': self.r2_.resid}, 115 | index=self.Y.index) 116 | res = model.residuals() 117 | pdt.assert_frame_equal(res, exp) 118 | 119 | # test prediction 120 | exp = pd.DataFrame({'y1': self.r1_.predict(), 121 | 'y2': self.r2_.predict()}, 122 | index=self.Y.index) 123 | res = model.predict() 124 | pdt.assert_frame_equal(res, exp) 125 | 126 | # make a small prediction 127 | fx = pd.DataFrame( 128 | [[1, 1, 1], 129 | [1, 1, 2]], 130 | columns=['Intercept', 'x1', 'x2'], 131 | index=['f1', 'f2']) 132 | 133 | rp1 = self.r1_.predict([[1, 1, 1], 134 | [1, 1, 2]]) 135 | rp2 = self.r2_.predict([[1, 1, 1], 136 | [1, 1, 2]]) 137 | exp = pd.DataFrame({'y1': rp1, 138 | 'y2': rp2}, 139 | index=['f1', 'f2']) 140 | 141 | res = model.predict(X=fx) 142 | pdt.assert_frame_equal(res, exp) 143 | 144 | # test r2 145 | self.assertAlmostEqual(model.r2, 0.21981627865598752) 146 | 147 | def test_ols_ilr_inv_test(self): 148 | 149 | model = ols('x1 + x2', self.Y, self.X) 150 | model.fit() 151 | basis, _ = balance_basis(self.tree) 152 | # test pvalues 153 | exp = pd.DataFrame({'y1': self.r1_.pvalues, 154 | 'y2': self.r2_.pvalues}) 155 | pdt.assert_frame_equal(model.pvalues, exp) 156 | 157 | # test coefficients 158 | exp = pd.DataFrame({'y1': self.r1_.params, 159 | 'y2': self.r2_.params}) 160 | 161 | exp = pd.DataFrame(ilr_inv(exp, basis), 162 | columns=['c', 'b', 'a'], 163 | index=self.X.columns) 164 | 165 | res = model.coefficients(tree=self.tree) 166 | pdt.assert_frame_equal(res, exp) 167 | 168 | # test residuals 169 | exp = pd.DataFrame({'y1': self.r1_.resid, 170 | 'y2': self.r2_.resid}, 171 | index=self.Y.index) 172 | exp = pd.DataFrame(ilr_inv(exp, basis), 173 | index=self.Y.index, 174 | columns=['c', 'b', 'a']) 175 | res = model.residuals(tree=self.tree) 176 | pdt.assert_frame_equal(res, exp) 177 | 178 | # test prediction 179 | exp = 
pd.DataFrame({'y1': self.r1_.predict(), 180 | 'y2': self.r2_.predict()}, 181 | index=self.Y.index) 182 | exp = pd.DataFrame(ilr_inv(exp, basis), 183 | index=self.Y.index, 184 | columns=['c', 'b', 'a']) 185 | res = model.predict(tree=self.tree) 186 | pdt.assert_frame_equal(res, exp) 187 | 188 | def test_tvalues(self): 189 | model = ols('x1 + x2', self.Y, self.X) 190 | model.fit() 191 | 192 | exp = pd.DataFrame({'y1': self.r1_.tvalues, 193 | 'y2': self.r2_.tvalues}) 194 | pdt.assert_frame_equal(model.tvalues, exp) 195 | 196 | def test_mse(self): 197 | model = ols('x1 + x2', self.Y, self.X) 198 | model.fit() 199 | 200 | exp = pd.Series({'y1': self.r1_.mse_resid, 201 | 'y2': self.r2_.mse_resid}) 202 | pdt.assert_series_equal(model.mse, exp) 203 | 204 | def test_ess(self): 205 | model = ols('x1 + x2', self.Y, self.X) 206 | model.fit() 207 | 208 | exp = pd.Series({'y1': self.r1_.ess, 209 | 'y2': self.r2_.ess}) 210 | pdt.assert_series_equal(model.ess, exp) 211 | 212 | def test_loo(self): 213 | model = ols('x1 + x2', self.Y, self.X) 214 | model.fit() 215 | res = model.loo() 216 | exp = pd.read_csv(get_data_path('loo.csv'), index_col=0) 217 | pdt.assert_frame_equal(res, exp) 218 | 219 | def test_kfold(self): 220 | model = ols('x1 + x2', self.Y, self.X) 221 | model.fit() 222 | res = model.kfold(9) 223 | exp = pd.read_csv(get_data_path('kfold.csv'), index_col=0) 224 | pdt.assert_frame_equal(res, exp) 225 | 226 | def test_lovo(self): 227 | model = ols('x1 + x2', self.Y, self.X) 228 | model.fit() 229 | res = model.lovo() 230 | exp = pd.read_csv(get_data_path('lovo.csv'), index_col=0) 231 | pdt.assert_frame_equal(res, exp) 232 | 233 | 234 | class TestOLSCV(unittest.TestCase): 235 | """ Tests OLS regression with refactored matrix multiplication. """ 236 | def setUp(self): 237 | np.random.seed(0) 238 | b01, b11, b21 = 1, 2, -3 239 | b02, b12, b22 = 2, -1, 4 240 | n = 50 241 | x1 = np.linspace(0, 10, n) 242 | x2 = np.linspace(10, 15, n)**2 243 | e = np.random.normal(size=n) * 10 244 | y1 = b01 + b11 * x1 + b21 * x2 + e 245 | e = np.random.normal(size=n) * 10 246 | y2 = b02 + b12 * x1 + b22 * x2 + e 247 | Y = pd.DataFrame(np.vstack((y1, y2)).T, 248 | columns=['y1', 'y2']) 249 | 250 | B = pd.DataFrame([[b01, b11, b21], 251 | [b02, b12, b22]]) 252 | 253 | X = pd.DataFrame( 254 | np.vstack((np.ones(n), x1, x2)).T, 255 | columns=['Intercept', 'x1', 'x2']) 256 | 257 | self.Y = Y 258 | self.B = B 259 | self.X = X 260 | self.r1_ = OLS(endog=y1, exog=X).fit() 261 | self.r2_ = OLS(endog=y2, exog=X).fit() 262 | self.tree = TreeNode.read(['(c, (b,a)y2)y1;']) 263 | 264 | def test_loo(self): 265 | model = ols('x1 + x2', self.Y, self.X) 266 | model.fit() 267 | res = model.loo() 268 | exp = pd.read_csv(get_data_path('loo2.csv'), index_col=0) 269 | pdt.assert_frame_equal(res, exp) 270 | 271 | def test_kfold(self): 272 | model = ols('x1 + x2', self.Y, self.X) 273 | model.fit() 274 | res = model.kfold(9) 275 | exp = pd.read_csv(get_data_path('kfold2.csv'), index_col=0) 276 | pdt.assert_frame_equal(res, exp) 277 | 278 | def test_lovo(self): 279 | model = ols('x1 + x2', self.Y, self.X) 280 | model.fit() 281 | res = model.lovo() 282 | exp = pd.read_csv(get_data_path('lovo2.csv'), index_col=0) 283 | pdt.assert_frame_equal(res, exp) 284 | 285 | 286 | if __name__ == "__main__": 287 | unittest.main() 288 | --------------------------------------------------------------------------------
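(Aside added for illustration, not part of the original test suite: the checks in test_ols_ilr_inv_test above condense to a short recipe -- fit the regression on balances, then request the coefficients with the tree argument, which the test verifies equals ilr_inv of the balance-space coefficients under the tree's balance_basis. A hypothetical end-to-end run, reusing the same setup values as the tests:

    import numpy as np
    import pandas as pd
    from skbio import TreeNode
    from gneiss.regression import ols

    np.random.seed(0)
    n = 50
    X = pd.DataFrame({'Intercept': np.ones(n),
                      'x1': np.linspace(0, 10, n),
                      'x2': np.linspace(10, 15, n)})
    e1 = np.random.normal(size=n) * 10
    e2 = np.random.normal(size=n) * 10
    Y = pd.DataFrame({'y1': 1 + 2 * X.x1 - 3 * X.x2 + e1,
                      'y2': 2 - 1 * X.x1 + 4 * X.x2 + e2})
    tree = TreeNode.read(['(c, (b,a)y2)y1;'])

    model = ols('x1 + x2', Y, X)
    model.fit()
    print(model.coefficients())           # per-balance coefficients (y1, y2)
    print(model.coefficients(tree=tree))  # same coefficients over the tips c, b, a
    print(model.r2)

The same tree argument works for residuals() and predict(), as the tests above exercise, so results can be inspected either in balance space or back on the original features.)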