├── tests ├── __init__.py ├── test_box.py ├── test_line.py ├── test_bar.py └── test_scatter.py ├── MANIFEST.in ├── .github ├── FUNDING.yml └── workflows │ └── python-package.yml ├── dexplot ├── colors │ ├── __init__.py │ ├── _categories.py │ └── _app.py ├── __init__.py ├── _pandas_accessor.py ├── _utils.py ├── _heat.py ├── _plotly.py ├── _plots.py └── _common_plot.py ├── Upcoming Features.md ├── docs ├── css │ └── style.css ├── overrides │ └── main.html └── index.md ├── setup.py ├── mkdocs.yml ├── LICENSE ├── .gitignore ├── notebooks └── colormaps.ipynb └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ['https://dunderdata.com'] 2 | -------------------------------------------------------------------------------- /dexplot/colors/__init__.py: -------------------------------------------------------------------------------- 1 | from ._categories import sequential, diverging, cyclic, qualitative, misc, all_cmaps 2 | 3 | import importlib 4 | if importlib.util.find_spec('ipywidgets') and importlib.util.find_spec('IPython'): 5 | from ._app import color_viewer -------------------------------------------------------------------------------- /dexplot/__init__.py: -------------------------------------------------------------------------------- 1 | from ._plots import line, bar, box, scatter, violin, hist, count, kde 2 | from ._utils import load_dataset 3 | from ._plotly import bar_plotly, line_plotly, scatter_plotly, count_plotly, box_plotly, violin_plotly 4 | from . import colors 5 | from ._pandas_accessor import _DexplotAccessor 6 | 7 | __version__ = '0.1.4' 8 | -------------------------------------------------------------------------------- /Upcoming Features.md: -------------------------------------------------------------------------------- 1 | ## Upcoming Features 2 | 3 | * allow user access to entire dataframe in custom aggfunc 4 | * templates for x,y, labels and titles 5 | * color picker ipywdigets 6 | * ipywidgets full app integration 7 | * add other generic kwargs, ec, lw, alpha, etc... 8 | * [ ] kde with annotations, allow for binning 9 | * [ ] scatter with kde 10 | * [ ] allow kde and histograms to be grouped 11 | * [ ] use a categorical variable to size scatter plot 12 | * [ ] allow user to specify a specific matplotlib axes 13 | * [ ] add interaction with ipywidgets 14 | * [ ] stacked area plot 15 | * [ ] rolling averages for line plots 16 | * [ ] add parameter `bins` to bin numeric x 17 | * [ ] option to add counts to all aggregate plots 18 | 19 | ## Other plots 20 | 21 | * heat 22 | * hexplot 23 | * mosaic -------------------------------------------------------------------------------- /docs/css/style.css: -------------------------------------------------------------------------------- 1 | table { 2 | background-color: transparent; 3 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 4 | margin-left:0; 5 | margin-right:0; 6 | border:none; 7 | border-collapse: collapse; 8 | border-spacing:0; 9 | color:black; 10 | font-size:13px; 11 | table-layout:fixed; 12 | overflow: scroll; 13 | } 14 | thead { 15 | border-bottom:1px solid black;vertical-align:bottom; 16 | } 17 | tr, th, td { 18 | text-align:right; 19 | vertical-align: middle; 20 | padding:0.5em 0.5em; 21 | line-height:normal; 22 | white-space:normal; 23 | max-width:none; 24 | border:none; 25 | } 26 | th { 27 | font-weight:bold; 28 | text-align:left; 29 | } 30 | tbody tr:nth-child(odd){ 31 | background:#f5f5f5; 32 | } 33 | :link{ 34 | text-decoration:underline; 35 | } 36 | 37 | .vid { 38 | display: flex; 39 | justify-content: center; 40 | } 41 | .vid video { 42 | width: 85%; 43 | } 44 | 45 | .dataframe { 46 | overflow: scroll; 47 | } -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | python-version: [3.6, 3.7, 3.8] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install pytest matplotlib pandas scipy plotly 32 | - name: Test with pytest 33 | run: pytest 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('dexplot/__init__.py', 'r') as f: 4 | for line in f: 5 | if line.startswith('__version__'): 6 | version = line.split("'")[1] 7 | 8 | with open("README.md", "r") as fh: 9 | long_description = fh.read() 10 | 11 | setuptools.setup( 12 | name="dexplot", 13 | version=version, 14 | author="Ted Petrou", 15 | author_email="petrou.theodore@gmail.com", 16 | description="Powerful and intuitive data visualization library using matplotlib for both long and wide data", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | keywords="data visualization matplotlib pandas", 20 | url="https://github.com/dexplo/dexplot", 21 | packages=setuptools.find_packages(), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: BSD License", 25 | "Operating System :: OS Independent", 26 | "Framework :: Matplotlib" 27 | ], 28 | install_requires=['numpy>=1.15', 29 | 'scipy>=1.0' 30 | 'matplotlib>=3.1', 31 | 'pandas>=0.24'], 32 | extras_require={ 33 | "apps": ["ipywidgets"], 34 | }, 35 | python_requires='>=3.6' 36 | ) -------------------------------------------------------------------------------- /dexplot/_pandas_accessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | 5 | from . import _plots as plots 6 | 7 | def get_doc(func): 8 | doc = func.__doc__ 9 | return re.sub('data :.*(?=split :)', '', doc, count=1, flags=re.S) 10 | 11 | 12 | @pd.api.extensions.register_dataframe_accessor("dexplot") 13 | class _DexplotAccessor: 14 | def __init__(self, pandas_obj): 15 | self._obj = pandas_obj 16 | 17 | def box(self, x=None, y=None, split=None, row=None, col=None, x_order=None, 18 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 19 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 20 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 21 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2, 22 | groupgap=0, box_kwargs=None): 23 | return plots.box(x, y, self._obj, split, row, col, x_order, y_order, split_order, 24 | row_order, col_order, orientation, wrap, figsize, title, sharex, 25 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 26 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap, box_kwargs) 27 | 28 | _DexplotAccessor.box.__doc__ = get_doc(plots.box) -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Dexplot 2 | site_description: Dexplot is a powerful and intuitive Python data visualization library using matplotlib for both long and wide data 3 | site_author: Ted Petrou 4 | site_url: https://www.dexplo.org/dexplot 5 | repo_url: https://github.com/dexplo/dexplot 6 | copyright: Copyright @ 2020 Ted Petrou 7 | google_analytics: 8 | - UA-119777567-7 9 | - dexplo.org 10 | theme: 11 | name: material 12 | custom_dir: docs/overrides 13 | features: 14 | - tabs 15 | 16 | nav: 17 | - Home: index.md 18 | - More Dexplo Libraries: 19 | - Dexplo: https://www.dexplo.org 20 | 21 | extra_css: 22 | - css/style.css 23 | 24 | extra: 25 | social: 26 | - icon: fontawesome/brands/github-alt 27 | link: https://github.com/dexplo 28 | - icon: fontawesome/brands/twitter 29 | link: https://twitter.com/TedPetrou 30 | - icon: fontawesome/brands/linkedin 31 | link: https://linkedin.com/in/TedPetrou 32 | - icon: fontawesome/brands/youtube 33 | link: https://www.youtube.com/c/dunderdata 34 | - icon: fontawesome/brands/facebook 35 | link: https://www.facebook.com/dunderdata 36 | 37 | markdown_extensions: 38 | - admonition 39 | - toc: 40 | permalink: True 41 | - codehilite: 42 | guess_lang: false 43 | - pymdownx.superfences 44 | 45 | plugins: 46 | - search 47 | - macros 48 | - minify: 49 | minify_html: true 50 | 51 | extra_javascript: 52 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, dexplo 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /dexplot/_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import gaussian_kde 4 | 5 | RAW_URL = 'https://raw.githubusercontent.com/dexplo/dexplot/master/data/{name}.csv' 6 | DATASETS = ['airbnb'] 7 | 8 | def load_dataset(name): 9 | """ 10 | Load a dataset. Must be connected to the internet 11 | 12 | Datasets 13 | -------- 14 | airbnb 15 | """ 16 | if name not in DATASETS: 17 | raise KeyError(f'Dataset {name} does not exist. Choose one of the following: {DATASETS}') 18 | 19 | url = RAW_URL.format(name=name) 20 | return pd.read_csv(url) 21 | 22 | 23 | def calculate_density_1d(data, cumulative=False): 24 | density_func = gaussian_kde(data) 25 | min_x, max_x = data.min(), data.max() 26 | range_x = max_x - min_x 27 | min_x = min_x - 2 * range_x 28 | max_x = max_x + 2 * range_x 29 | x = np.linspace(min_x, max_x, 400) 30 | density = density_func(x) 31 | max_density = density.max() 32 | filt = density > max_density / 1000 33 | x = x[filt] 34 | density = density[filt] 35 | if cumulative: 36 | density = np.cumsum(density) 37 | density = 1 / density.max() * density 38 | return x, density 39 | 40 | def calculate_density_2d(x, y): 41 | xmin, xmax = x.min(), x.max() 42 | ymin, ymax = y.min(), y.max() 43 | X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] 44 | positions = np.vstack([X.ravel(), Y.ravel()]) 45 | values = np.vstack([x, y]) 46 | kernel = gaussian_kde(values) 47 | Z = np.reshape(kernel(positions).T, X.shape) 48 | return xmin, xmax, ymin, ymax, np.rot90(Z) 49 | 50 | 51 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block extrahead %} 4 | 5 | {% if page and page.meta and page.meta.title %} 6 | 7 | {% elif page and page.title and not page.is_homepage %} 8 | 9 | {% else %} 10 | 11 | {% endif %} 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | {% if page and page.meta and page.meta.title %} 23 | 24 | {% elif page and page.title and not page.is_homepage %} 25 | 26 | {% else %} 27 | 28 | {% endif %} 29 | 30 | 32 | {% endblock %} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | .DS_Store 10 | .idea 11 | docs/images 12 | notebooks/ 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | -------------------------------------------------------------------------------- /notebooks/colormaps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import plotly" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "p_sequential = [k.lower() for k, v in vars(plotly.colors.sequential).items() if isinstance(v, list) \n", 19 | " and not k.startswith('_')]\n", 20 | "p_diverging = [k.lower() for k, v in vars(plotly.colors.diverging).items() if isinstance(v, list) \n", 21 | " and not k.startswith('_')]\n", 22 | "p_cyclic = [k.lower() for k, v in vars(plotly.colors.cyclical).items() if isinstance(v, list) \n", 23 | " and not k.startswith('_')]\n", 24 | "p_qual = [k.lower() for k, v in vars(plotly.colors.qualitative).items() if isinstance(v, list) \n", 25 | " and not k.startswith('_')]\n", 26 | "p_qual += ['dark12', 'dark12_r']" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "mpl_sequential = ['viridis', 'plasma', 'inferno', 'magma', 'cividis',\n", 36 | " 'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',\n", 37 | " 'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',\n", 38 | " 'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',\n", 39 | " 'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',\n", 40 | " 'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',\n", 41 | " 'hot', 'afmhot', 'gist_heat', 'copper'] \n", 42 | "mpl_diverging = ['PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',\n", 43 | " 'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']\n", 44 | "mpl_cyclic = ['twilight', 'twilight_shifted', 'hsv']\n", 45 | "mpl_qual = ['Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3',\n", 46 | " 'tab10', 'tab20', 'tab20b', 'tab20c']\n", 47 | "mpl_misc = ['flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',\n", 48 | " 'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',\n", 49 | " 'gist_rainbow', 'rainbow', 'jet', 'nipy_spectral', 'gist_ncar']\n", 50 | "\n", 51 | "def double(colors):\n", 52 | " a = []\n", 53 | " for color in colors:\n", 54 | " c = color.lower()\n", 55 | " a.append(c)\n", 56 | " a.append(c + '_r')\n", 57 | " return a\n", 58 | " \n", 59 | "mpl_sequential = double(mpl_sequential)\n", 60 | "mpl_diverging = double(mpl_diverging)\n", 61 | "mpl_cyclic = double(mpl_cyclic)\n", 62 | "mpl_qual = double(mpl_qual)\n", 63 | "mpl_misc = double(mpl_misc)\n", 64 | "\n", 65 | "seq = sorted(set(mpl_sequential + p_sequential))\n", 66 | "diverging = sorted(set(mpl_diverging + p_diverging))\n", 67 | "cyclic = sorted(set(mpl_cyclic + p_cyclic))\n", 68 | "qual = sorted(set(mpl_qual + p_qual))\n", 69 | "misc = sorted(set(mpl_misc))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.8.3" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 4 101 | } 102 | -------------------------------------------------------------------------------- /tests/test_box.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import dexplot as dxp 4 | 5 | 6 | airbnb = dxp.load_dataset('airbnb') 7 | 8 | 9 | class TestSort: 10 | 11 | def test_lex_asc(self): 12 | fig = dxp.box(x='price', y='neighborhood', data=airbnb) 13 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 14 | correct = sorted(ticklabels) 15 | assert ticklabels == correct 16 | 17 | def test_lex_desc(self): 18 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, y_order='desc') 19 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 20 | correct = sorted(ticklabels, reverse=True) 21 | assert ticklabels == correct 22 | 23 | def test_asc_values(self): 24 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='asc') 25 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 26 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 27 | values = [p.get_height() for p in fig.axes[0].patches] 28 | 29 | 30 | def test_desc_values(self): 31 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='desc') 32 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 33 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 34 | 35 | 36 | class TestOrder: 37 | 38 | def test_x_order(self): 39 | dxp.box(x='price', y='neighborhood', data=airbnb, 40 | y_order=['Dupont Circle', 'Edgewood', 'Union Station']) 41 | 42 | with pytest.raises(ValueError): 43 | dxp.box(x='price', y='neighborhood', data=airbnb, 44 | y_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST']) 45 | 46 | 47 | class TestVertical: 48 | 49 | def test_vert(self): 50 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v') 51 | 52 | 53 | class TestSplit: 54 | 55 | def test_split(self): 56 | dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost') 57 | 58 | def test_split_order(self): 59 | dxp.box(x='price', y='neighborhood', data=airbnb, 60 | split='superhost', split_order=['Yes', 'No']) 61 | 62 | def test_stacked(self): 63 | dxp.box(x='price', y='neighborhood', data=airbnb, 64 | split='superhost', split_order=['Yes', 'No']) 65 | 66 | 67 | class TestRowCol: 68 | 69 | def test_col(self): 70 | dxp.box(x='price', y='neighborhood', data=airbnb, 71 | split='superhost', col='property_type') 72 | 73 | def test_col_wrap(self): 74 | dxp.box(x='price', y='neighborhood', data=airbnb, 75 | split='superhost', col='property_type', wrap=2) 76 | 77 | def test_col_order(self): 78 | dxp.box(x='price', y='neighborhood', data=airbnb, 79 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 80 | 81 | def test_row(self): 82 | dxp.box(x='price', y='neighborhood', data=airbnb, 83 | split='superhost', row='property_type') 84 | 85 | def test_row_order(self): 86 | dxp.box(x='price', y='neighborhood', data=airbnb, 87 | split='superhost', row='property_type', row_order=['House', 'Condominium']) 88 | 89 | def test_row_wrap(self): 90 | dxp.box(x='price', y='neighborhood', data=airbnb, 91 | split='superhost', row='property_type', wrap=2) 92 | 93 | def test_row_col(self): 94 | dxp.box(x='price', y='neighborhood', data=airbnb, 95 | split='superhost', col='property_type', 96 | col_order=['House', 'Condominium', 'Apartment'], 97 | row='bedrooms', row_order=[0, 1, 2, 3]) 98 | 99 | def test_sharex(self): 100 | dxp.box(x='price', y='neighborhood', data=airbnb, 101 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'], 102 | row='bedrooms', row_order=[1, 2, 3], sharex=False) 103 | -------------------------------------------------------------------------------- /dexplot/colors/_categories.py: -------------------------------------------------------------------------------- 1 | from ._colormaps import colormaps 2 | 3 | def set_attrs(obj, cmaps): 4 | for cmap in cmaps: 5 | setattr(obj, cmap, colormaps[cmap]) 6 | 7 | sequential_colormaps = [ 8 | 'afmhot', 'afmhot_r', 'aggrnyl', 'aggrnyl_r', 'agsunset', 'agsunset_r', 'algae', 'algae_r', 9 | 'amp', 'amp_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'blackbody', 'blackbody_r', 10 | 'bluered', 'bluered_r', 'blues', 'blues_r', 'blugrn', 'blugrn_r', 'bluyl', 'bluyl_r', 11 | 'bone', 'bone_r', 'brwnyl', 'brwnyl_r', 'bugn', 'bugn_r', 'bupu', 'bupu_r', 'burg', 'burg_r', 12 | 'burgyl', 'burgyl_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'copper', 'copper_r', 13 | 'darkmint', 'darkmint_r', 'deep', 'deep_r', 'dense', 'dense_r', 'electric', 'electric_r', 14 | 'emrld', 'emrld_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_yarg', 15 | 'gist_yarg_r', 'gnbu', 'gnbu_r', 'gray', 'gray_r', 'greens', 'greens_r', 'greys', 'greys_r', 16 | 'haline', 'haline_r', 'hot', 'hot_r', 'ice', 'ice_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 17 | 'magenta', 'magenta_r', 'magma', 'magma_r', 'matter', 'matter_r', 'mint', 'mint_r', 'oranges', 18 | 'oranges_r', 'orrd', 'orrd_r', 'oryel', 'oryel_r', 'peach', 'peach_r', 'pink', 'pink_r', 19 | 'pinkyl', 'pinkyl_r', 'plasma', 'plasma_r', 'plotly3', 'plotly3_r', 'pubu', 'pubu_r', 'pubugn', 20 | 'pubugn_r', 'purd', 'purd_r', 'purp', 'purp_r', 'purples', 'purples_r', 'purpor', 'purpor_r', 21 | 'rainbow', 'rainbow_r', 'rdbu', 'rdbu_r', 'rdpu', 'rdpu_r', 'redor', 'redor_r', 'reds', 22 | 'reds_r', 'solar', 'solar_r', 'speed', 'speed_r', 'spring', 'spring_r', 'summer', 'summer_r', 23 | 'sunset', 'sunset_r', 'sunsetdark', 'sunsetdark_r', 'teal', 'teal_r', 'tealgrn', 'tealgrn_r', 24 | 'tempo', 'tempo_r', 'thermal', 'thermal_r', 'turbid', 'turbid_r', 'viridis', 'viridis_r', 25 | 'winter', 'winter_r', 'wistia', 'wistia_r', 'ylgn', 'ylgn_r', 'ylgnbu', 'ylgnbu_r', 'ylorbr', 26 | 'ylorbr_r', 'ylorrd', 'ylorrd_r' 27 | ] 28 | 29 | diverging_colormaps = [ 30 | 'armyrose', 'armyrose_r', 'balance', 'balance_r', 'brbg', 'brbg_r', 'bwr', 'bwr_r', 'coolwarm', 31 | 'coolwarm_r', 'curl', 'curl_r', 'delta', 'delta_r', 'earth', 'earth_r', 'fall', 'fall_r', 32 | 'geyser', 'geyser_r', 'picnic', 'picnic_r', 'piyg', 'piyg_r', 'portland', 'portland_r', 'prgn', 33 | 'prgn_r', 'puor', 'puor_r', 'rdbu', 'rdbu_r', 'rdgy', 'rdgy_r', 'rdylbu', 'rdylbu_r', 'rdylgn', 34 | 'rdylgn_r', 'seismic', 'seismic_r', 'spectral', 'spectral_r', 'tealrose', 'tealrose_r', 'temps', 35 | 'temps_r', 'tropic', 'tropic_r' 36 | ] 37 | 38 | cyclic_colormaps = [ 39 | 'edge', 'edge_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'mrybm', 'mrybm_r', 'mygbm', 40 | 'mygbm_r', 'phase', 'phase_r', 'twilight', 'twilight_r', 'twilight_shifted', 41 | 'twilight_shifted_r' 42 | ] 43 | 44 | qualitative_colormaps = [ 45 | 'accent', 'accent_r', 'alphabet', 'alphabet_r', 'antique', 'antique_r', 'bold', 'bold_r', 46 | 'd3', 'd3_r', 'dark12', 'dark12_r', 'dark2', 'dark24', 'dark24_r', 'dark2_r', 'g10', 'g10_r', 47 | 'light24', 'light24_r', 'paired', 'paired_r', 'pastel', 'pastel1', 'pastel1_r', 'pastel2', 48 | 'pastel2_r', 'pastel_r', 'plotly', 'plotly_r', 'prism', 'prism_r', 'safe', 'safe_r', 'set1', 49 | 'set1_r', 'set2', 'set2_r', 'set3', 'set3_r', 't10', 't10_r', 'tab10', 'tab10_r', 'tab20', 50 | 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'vivid', 'vivid_r' 51 | ] 52 | 53 | misc_colormaps = [ 54 | 'brg', 'brg_r', 'cmrmap', 'cmrmap_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 55 | 'gist_earth', 'gist_earth_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 56 | 'gist_stern', 'gist_stern_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'jet', 'jet_r', 57 | 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'prism', 'prism_r', 'rainbow', 58 | 'rainbow_r', 'terrain', 'terrain_r' 59 | ] 60 | 61 | all_colormaps = ( 62 | sequential_colormaps + diverging_colormaps + cyclic_colormaps + 63 | qualitative_colormaps + misc_colormaps 64 | ) 65 | 66 | class ColorMaps: 67 | pass 68 | 69 | sequential = ColorMaps() 70 | diverging = ColorMaps() 71 | cyclic = ColorMaps() 72 | qualitative = ColorMaps() 73 | misc = ColorMaps() 74 | all_cmaps = ColorMaps() 75 | 76 | set_attrs(sequential, sequential_colormaps) 77 | set_attrs(diverging, diverging_colormaps) 78 | set_attrs(cyclic, cyclic_colormaps) 79 | set_attrs(qualitative, qualitative_colormaps) 80 | set_attrs(misc, misc_colormaps) 81 | set_attrs(all_cmaps, all_colormaps) -------------------------------------------------------------------------------- /dexplot/colors/_app.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from ipywidgets import Dropdown, Image, HBox, HTML, Checkbox, interactive_output, VBox 4 | from IPython.display import display 5 | import matplotlib.pyplot as plt 6 | from matplotlib.colors import ListedColormap 7 | import numpy as np 8 | 9 | from dexplot.colors._colormaps import colormaps 10 | from dexplot.colors._categories import (qualitative_colormaps, sequential_colormaps, 11 | diverging_colormaps, cyclic_colormaps, misc_colormaps, 12 | all_colormaps) 13 | 14 | ARR = np.linspace(0, 1, 256).reshape((1, -1)).repeat(20, 0) 15 | 16 | cmap_dict = {'qualitative': qualitative_colormaps, 17 | 'sequential': sequential_colormaps, 18 | 'diverging': diverging_colormaps, 19 | 'cyclic': cyclic_colormaps, 20 | 'misc': misc_colormaps, 21 | 'all': all_colormaps} 22 | 23 | cmap_default = {'qualitative': 't10', 24 | 'sequential': 'viridis', 25 | 'diverging': 'coolwarm', 26 | 'cyclic': 'edge', 27 | 'misc': 'ocean', 28 | 'all': 'tab10'} 29 | 30 | cmap_dropdown = Dropdown(options=[('Qualitative', 'qualitative'), 31 | ('Sequential', 'sequential'), 32 | ('Diverging', 'diverging'), 33 | ('Cyclic', 'cyclic'), 34 | ('Misc', 'misc'), 35 | ('All Colormaps', 'all')], 36 | value=None, 37 | description='Colormap Category: ', 38 | style = {'description_width': 'initial'}) 39 | 40 | def remove_ticks_spines(ax): 41 | ax.set_xticks([]) 42 | ax.set_yticks([]) 43 | for spine in ax.spines.values(): 44 | spine.set_visible(False) 45 | 46 | class ColorViewer: 47 | 48 | def __init__(self): 49 | self.checked_colors = [] 50 | self.test_list = [] 51 | self.cbox_dict = {cat: self.cmap_checkboxes(cat) for cat in cmap_default} 52 | self.layout = self.create_layout() 53 | self.fig, self.ax = self.create_figure() 54 | self.add_interaction() 55 | 56 | def checkbox_maker(self, name, default): 57 | value = name == default 58 | c = Checkbox(value=value, description=name, disabled=False, 59 | indent=False, style={'color': 'blue'}) 60 | c.observe(self.cb_handler, 'value') 61 | return c 62 | 63 | def cmap_checkboxes(self, category): 64 | rows = [] 65 | row = [] 66 | layout = {'justify_content': 'flex-end', 'margin': '0px'} 67 | cmaps = cmap_dict[category] 68 | default = cmap_default[category] 69 | for name in cmaps: 70 | row.append(self.checkbox_maker(name, default)) 71 | if len(row) == 10: 72 | rows.append(HBox(row, layout=layout)) 73 | row = [] 74 | if row: 75 | rows.append(HBox(row, layout=layout)) 76 | return rows 77 | 78 | def create_image(self): 79 | for image in self.ax.images: 80 | image.remove() 81 | 82 | ticks = [] 83 | ticklabels = [] 84 | i = 0 85 | 86 | for i, name in enumerate(self.checked_colors): 87 | cmap = ListedColormap(colormaps[name]) 88 | self.ax.imshow(ARR, cmap=cmap, extent=[0, 10, i + .2, i + .8], aspect='auto') 89 | ticks.append(i + .5) 90 | ticklabels.append(name) 91 | 92 | self.ax.set_ylim(0, i + 1) 93 | self.ax.set_yticks(ticks) 94 | self.ax.set_yticklabels(ticklabels) 95 | img_bytes = io.BytesIO() 96 | self.fig.canvas.print_figure(img_bytes) 97 | img_bytes.seek(0) 98 | 99 | self.img.layout.visibility = 'visible' 100 | self.img.value = img_bytes.read() 101 | 102 | def get_checkboxes(self, category): 103 | if category is None: 104 | return 105 | self.checked_colors.clear() 106 | self.checked_colors.append(cmap_default[category]) 107 | self.layout.children = list(self.layout.children[:2]) + self.cbox_dict[category] 108 | self.test_list.append('end of get_checkboxes') 109 | self.create_image() 110 | 111 | def cb_handler(self, change): 112 | name = change['owner'].description 113 | if change['new']: 114 | self.checked_colors.append(name) 115 | else: 116 | self.checked_colors.remove(name) 117 | self.create_image() 118 | 119 | def create_layout(self): 120 | title = HTML('

Color Viewer

') 121 | self.img = Image(width=700, height=600) 122 | self.img.layout.visibility = 'hidden' 123 | 124 | rows = [] 125 | row1 = HBox([title], layout={'justify_content': 'flex-start'}) 126 | row2 = HBox([cmap_dropdown, self.img], layout={'align_items': 'center'}) 127 | rows = [row1, row2] 128 | 129 | return VBox(rows) 130 | 131 | def create_figure(self): 132 | fig = plt.Figure(dpi=144, tight_layout=True, figsize=(6, 3)) 133 | ax = fig.add_subplot() 134 | remove_ticks_spines(ax) 135 | return fig, ax 136 | 137 | def add_interaction(self): 138 | interactive_output(self.get_checkboxes, {'category': cmap_dropdown}) 139 | cmap_dropdown.value = 'qualitative' 140 | 141 | 142 | def run(self): 143 | display(self.layout) 144 | 145 | 146 | def color_viewer(): 147 | ColorViewer().run() -------------------------------------------------------------------------------- /tests/test_line.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import dexplot as dxp 4 | 5 | 6 | airbnb = dxp.load_dataset('airbnb') 7 | 8 | class TestAgg: 9 | 10 | def test_string_name(self): 11 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median') 12 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='mean') 13 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='min') 14 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='max') 15 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='size') 16 | 17 | def test_function(self): 18 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.median) 19 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean) 20 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.min) 21 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.max) 22 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.size) 23 | 24 | 25 | class TestSort: 26 | 27 | def test_lex_asc(self): 28 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median') 29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 30 | correct = sorted(ticklabels) 31 | assert ticklabels == correct 32 | 33 | def test_lex_desc(self): 34 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc') 35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 36 | correct = sorted(ticklabels, reverse=True) 37 | assert ticklabels == correct 38 | 39 | def test_asc_values(self): 40 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc') 41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 43 | values = [p.get_height() for p in fig.axes[0].patches] 44 | 45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values() 46 | correct_labels = s.index.tolist() 47 | correct_values = s.values.tolist() 48 | assert ticklabels == correct_labels 49 | 50 | def test_desc_values(self): 51 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc') 52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 54 | values = [p.get_height() for p in fig.axes[0].patches] 55 | 56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \ 57 | .sort_values(['price', 'neighborhood'], ascending=[False, True]) 58 | s = df.set_index('neighborhood').squeeze() 59 | correct_labels = s.index.tolist() 60 | correct_values = s.values.tolist() 61 | assert ticklabels == correct_labels 62 | 63 | 64 | class TestOrder: 65 | 66 | def test_x_order(self): 67 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station']) 69 | 70 | with pytest.raises(ValueError): 71 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST']) 73 | 74 | 75 | class TestHorizontal: 76 | 77 | def test_horiz(self): 78 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h') 79 | 80 | 81 | class TestSplit: 82 | 83 | def test_split(self): 84 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost') 85 | 86 | def test_split_order(self): 87 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 88 | split='superhost', split_order=['Yes', 'No']) 89 | 90 | def test_stacked(self): 91 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 92 | split='superhost', split_order=['Yes', 'No']) 93 | 94 | 95 | class TestRowCol: 96 | 97 | def test_col(self): 98 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 99 | split='superhost', col='property_type') 100 | 101 | def test_col_wrap(self): 102 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 103 | split='superhost', col='property_type', wrap=2) 104 | 105 | def test_col_order(self): 106 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 107 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 108 | 109 | def test_row(self): 110 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 111 | split='superhost', row='property_type') 112 | 113 | def test_row_order(self): 114 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 115 | split='superhost', row='property_type', row_order=['House', 'Condominium']) 116 | 117 | def test_row_wrap(self): 118 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 119 | split='superhost', row='property_type', wrap=2) 120 | 121 | def test_row_col(self): 122 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 123 | split='superhost', col='property_type', 124 | col_order=['House', 'Condominium', 'Apartment'], 125 | row='bedrooms', row_order=[0, 1, 2, 3]) 126 | 127 | def test_sharey(self): 128 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', 129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'], 130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False) 131 | -------------------------------------------------------------------------------- /tests/test_bar.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import dexplot as dxp 4 | 5 | 6 | airbnb = dxp.load_dataset('airbnb') 7 | aggfunc = ['median', 'mean', 'min', 'max', 'size', np.median, np.mean, np.min, np.max, np.max] 8 | 9 | 10 | class TestAgg: 11 | 12 | @pytest.mark.parametrize('aggfunc', aggfunc) 13 | def test_string_name(self, aggfunc): 14 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc=aggfunc) 15 | 16 | 17 | class TestSort: 18 | 19 | def test_lex_asc(self): 20 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median') 21 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 22 | correct = sorted(ticklabels) 23 | assert ticklabels == correct 24 | 25 | def test_lex_desc(self): 26 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc') 27 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 28 | correct = sorted(ticklabels, reverse=True) 29 | assert ticklabels == correct 30 | 31 | def test_asc_values(self): 32 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc') 33 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 34 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 35 | values = [p.get_height() for p in fig.axes[0].patches] 36 | 37 | s = airbnb.groupby('neighborhood')['price'].median().sort_values() 38 | correct_labels = s.index.tolist() 39 | correct_values = s.values.tolist() 40 | assert ticklabels == correct_labels 41 | assert values == correct_values 42 | 43 | def test_desc_values(self): 44 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc') 45 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 46 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 47 | values = [p.get_height() for p in fig.axes[0].patches] 48 | 49 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \ 50 | .sort_values(['price', 'neighborhood'], ascending=[False, True]) 51 | s = df.set_index('neighborhood').squeeze() 52 | correct_labels = s.index.tolist() 53 | correct_values = s.values.tolist() 54 | assert ticklabels == correct_labels 55 | assert values == correct_values 56 | 57 | 58 | class TestOrder: 59 | 60 | def test_x_order(self): 61 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 62 | x_order=['Dupont Circle', 'Edgewood', 'Union Station']) 63 | 64 | with pytest.raises(ValueError): 65 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 66 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST']) 67 | 68 | class TestHorizontal: 69 | 70 | def test_horiz(self): 71 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h') 72 | 73 | 74 | class TestSplit: 75 | 76 | def test_split(self): 77 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost') 78 | 79 | def test_split_order(self): 80 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 81 | split='superhost', split_order=['Yes', 'No']) 82 | 83 | def test_stacked(self): 84 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 85 | split='superhost', split_order=['Yes', 'No'], stacked=True) 86 | 87 | def test_errors(self): 88 | with pytest.raises(ValueError): 89 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 90 | split='property_type', split_order=['Yes', 'No']) 91 | 92 | 93 | class TestRowCol: 94 | 95 | def test_col(self): 96 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 97 | split='superhost', col='property_type') 98 | 99 | def test_col_wrap(self): 100 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 101 | split='superhost', col='property_type', wrap=2) 102 | 103 | def test_col_order(self): 104 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 105 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 106 | 107 | def test_row(self): 108 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 109 | split='superhost', row='property_type') 110 | 111 | def test_row_order(self): 112 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 113 | split='superhost', row='property_type', row_order=['House', 'Condominium']) 114 | 115 | def test_row_wrap(self): 116 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 117 | split='superhost', row='property_type', wrap=2) 118 | 119 | def test_row_col(self): 120 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 121 | split='superhost', col='property_type', 122 | col_order=['House', 'Condominium', 'Apartment'], 123 | row='bedrooms', row_order=[0, 1, 2, 3]) 124 | 125 | def test_sharey(self): 126 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 127 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'], 128 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False) 129 | 130 | class TestBarProps: 131 | 132 | def test_bar_size(self): 133 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='property_type', 134 | split_order=['Apartment', 'House'], 135 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5) 136 | -------------------------------------------------------------------------------- /tests/test_scatter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import dexplot as dxp 4 | 5 | 6 | airbnb = dxp.load_dataset('airbnb') 7 | 8 | class TestAgg: 9 | 10 | def test_string_name(self): 11 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median') 12 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='mean') 13 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='min') 14 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='max') 15 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='size') 16 | 17 | def test_function(self): 18 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.median) 19 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean) 20 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.min) 21 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.max) 22 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.size) 23 | 24 | 25 | class TestSort: 26 | 27 | def test_lex_asc(self): 28 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median') 29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 30 | correct = sorted(ticklabels) 31 | assert ticklabels == correct 32 | 33 | def test_lex_desc(self): 34 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='lex_desc') 35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 36 | correct = sorted(ticklabels, reverse=True) 37 | assert ticklabels == correct 38 | 39 | def test_asc_values(self): 40 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc') 41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 43 | values = [p.get_height() for p in fig.axes[0].patches] 44 | 45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values() 46 | correct_labels = s.index.tolist() 47 | correct_values = s.values.tolist() 48 | assert ticklabels == correct_labels 49 | 50 | def test_desc_values(self): 51 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc') 52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()] 53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels] 54 | values = [p.get_height() for p in fig.axes[0].patches] 55 | 56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \ 57 | .sort_values(['price', 'neighborhood'], ascending=[False, True]) 58 | s = df.set_index('neighborhood').squeeze() 59 | correct_labels = s.index.tolist() 60 | correct_values = s.values.tolist() 61 | assert ticklabels == correct_labels 62 | 63 | 64 | class TestOrder: 65 | 66 | def test_x_order(self): 67 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station']) 69 | 70 | with pytest.raises(ValueError): 71 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST']) 73 | 74 | 75 | class TestHorizontal: 76 | 77 | def test_horiz(self): 78 | dxp.scatter(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h') 79 | 80 | 81 | class TestSplit: 82 | 83 | def test_split(self): 84 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost') 85 | 86 | def test_split_order(self): 87 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 88 | split='superhost', split_order=['Yes', 'No']) 89 | 90 | def test_stacked(self): 91 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 92 | split='superhost', split_order=['Yes', 'No']) 93 | 94 | 95 | class TestRowCol: 96 | 97 | def test_col(self): 98 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 99 | split='superhost', col='property_type') 100 | 101 | def test_col_wrap(self): 102 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 103 | split='superhost', col='property_type', wrap=2) 104 | 105 | def test_col_order(self): 106 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 107 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 108 | 109 | def test_row(self): 110 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 111 | split='superhost', row='property_type') 112 | 113 | def test_row_order(self): 114 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 115 | split='superhost', row='property_type', row_order=['House', 'Condominium']) 116 | 117 | def test_row_wrap(self): 118 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 119 | split='superhost', row='property_type', wrap=2) 120 | 121 | def test_row_col(self): 122 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 123 | split='superhost', col='property_type', 124 | col_order=['House', 'Condominium', 'Apartment'], 125 | row='bedrooms', row_order=[0, 1, 2, 3]) 126 | 127 | def test_sharey(self): 128 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', 129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'], 130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False) 131 | -------------------------------------------------------------------------------- /dexplot/_heat.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import numpy as np 4 | 5 | 6 | def heatmap(x=None, y=None, agg=None, aggfunc=None, data=None, normalize=None, corr=False, 7 | annot=False, fmt='.2f', ax=None, figsize=None, title=None, cmap=None, 8 | cbarlabel="", cbar_kw={}, **kwargs): 9 | """ 10 | Create a heatmap from a Pandas DataFrame. This function works with either 11 | tidy data or aggregated data. 12 | 13 | If using tidy data, pass it categorical/string variables to `x` and `y` 14 | and a numeric variable to `values`. Pass an aggregation function 15 | as a string to `aggfunc`. You may also choose to leave `values` as None 16 | which result in a raw frequency count for the co-occurence of the `x` and 17 | `y` variables. Set normalize to True to get relative percentages. 18 | 19 | If using aggregated data, only use the `data` parameter. The index and 20 | columns will label the x and y. The values of the DataFrame will form 21 | will be used for the heat map. 22 | 23 | Parameters 24 | ---------- 25 | x: str 26 | Column name who's unique values will be used to form groups. Can 27 | only be used with tidy data and should be a categorical/string. 28 | 29 | y: str 30 | Column name who's unique values will be used to form groups. Can 31 | only be used with tidy data and should be a categorical/string. 32 | 33 | agg: str 34 | Column name who's values will be aggregated across the groups 35 | formed by `x` and `y`. 36 | 37 | aggfunc: str or function 38 | Used to aggregate `agg` variable. Use any of the strings that Pandas 39 | can understand. You can also use a custom function as long as it 40 | aggregates, i.e. returns a single value. 41 | 42 | data: DataFrame 43 | A Pandas DataFrame containing either tidy or aggregated data 44 | 45 | normalize: str 46 | Must be one of three strings, "all" or the name of one of the column 47 | names provided to `x` or `y`. 48 | 49 | corr: bool - Default False 50 | When set to True, will calcaulte the correlation of the co-occurence 51 | between each of the unique values in `x` and `y`. Only works with 52 | tidy data. 53 | 54 | annot: bool - Default False 55 | Controls whether the aggregated values will be plotted as 56 | text in the heatmap. 57 | 58 | fmt: str 59 | Formatting style for annotations 60 | 61 | ax: Matplotlib Axes 62 | The Matplotlib Axes object to use for plotting. If not given, then 63 | create a new Figure and Axes 64 | 65 | figsize: tuple 66 | A two item tuple of ints used to control the figure size 67 | 68 | title: str 69 | Sets the title of the figure 70 | 71 | cmap: str 72 | Matplotlib colormap name 73 | 74 | cbarlabel: str 75 | Labels the colorbar 76 | 77 | cbar_kw: dict 78 | Keyword arguments passed to the `colorbar` Figure function 79 | 80 | kwargs: dict 81 | Keyword arguments passed to the `imshow` Axes function 82 | 83 | Returns 84 | ------- 85 | A one-item tuple containing a Matplotlib Figure 86 | 87 | References 88 | ---------- 89 | Code was inspired from Matplotlib page 90 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html 91 | """ 92 | 93 | if figsize is None: 94 | figsize = (10, 8) 95 | 96 | if not isinstance(data, pd.DataFrame): 97 | raise TypeError('`data` must be a DataFrame') 98 | 99 | if ax is None: 100 | fig, ax = plt.subplots(figsize=figsize) 101 | if title: 102 | fig.suptitle(title) 103 | else: 104 | fig = ax.figure 105 | 106 | if aggfunc: 107 | if not agg: 108 | raise ValueError('If you are setting `aggfunc`, you need to set `agg` as well.') 109 | 110 | if not normalize: 111 | normalize = False 112 | 113 | if cmap is None: 114 | cmap = 'RdYlBu_r' 115 | 116 | if x or y: 117 | if not (x and y): 118 | raise ValueError('If you supply one of x or y, you must both of them') 119 | 120 | if normalize not in (False, 'all', x, y): 121 | raise ValueError('If you are setting `normalize`, it must be either ' 122 | f'"all", "{x}" or "{y}"') 123 | elif normalize == x: 124 | normalize = 'columns' 125 | elif normalize == y: 126 | normalize = 'index' 127 | 128 | if agg: 129 | data_values = data[agg] 130 | if not aggfunc: 131 | aggfunc = 'mean' 132 | else: 133 | data_values = None 134 | 135 | agg_data = pd.crosstab(index=data[y], columns=data[x], values=data_values, aggfunc=aggfunc, 136 | normalize=normalize) 137 | else: 138 | agg_data = data 139 | 140 | if corr: 141 | agg_data = agg_data.corr() 142 | 143 | agg_values = agg_data.values 144 | col_labels = agg_data.columns.tolist() 145 | row_labels = agg_data.index.tolist() 146 | 147 | # Plot the heatmap 148 | im = ax.imshow(agg_values, cmap=cmap, **kwargs) 149 | 150 | # Create colorbar 151 | cbar = fig.colorbar(im, ax=ax, **cbar_kw) 152 | cbar.ax.set_ylabel(cbarlabel, rotation=-90, va='bottom') 153 | 154 | x_range, y_range = np.arange(agg_data.shape[1]), np.arange(agg_data.shape[0]) 155 | ax.set_xticks(x_range) 156 | ax.set_yticks(y_range) 157 | 158 | ax.set_xticklabels(col_labels) 159 | ax.set_yticklabels(row_labels) 160 | 161 | # Let the horizontal axes labeling appear on top. 162 | ax.tick_params(top=True, bottom=False, 163 | labeltop=True, labelbottom=False) 164 | 165 | # Rotate the tick labels and set their alignment. 166 | plt.setp(ax.get_xticklabels(), rotation=-30, ha='right', rotation_mode='anchor') 167 | 168 | # Turn spines off and create white grid. 169 | for edge, spine in ax.spines.items(): 170 | spine.set_visible(False) 171 | 172 | ax.set_xticks(x_range - .5, minor=True) 173 | ax.set_yticks(y_range - .5, minor=True) 174 | ax.grid(which='minor', color='w', linestyle='-', linewidth=3) 175 | ax.tick_params(which='minor', bottom=False, left=False) 176 | 177 | if annot: 178 | annotate_heatmap(im, agg_values, fmt='{0:' + fmt + '}') 179 | 180 | return fig, 181 | 182 | 183 | def annotate_heatmap(im, values, fmt="{0:.2f}", **textkw): 184 | """ 185 | Annotates the heatmap 186 | 187 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html 188 | """ 189 | 190 | kw = dict(horizontalalignment="center", 191 | verticalalignment="center") 192 | kw.update(textkw) 193 | n_rows, n_cols = values.shape 194 | 195 | for i in range(n_rows): 196 | for j in range(n_cols): 197 | val = values[i, j] 198 | if not np.isnan(val): 199 | im.axes.text(j, i, fmt.format(val), **kw) -------------------------------------------------------------------------------- /dexplot/_plotly.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import textwrap 3 | 4 | import numpy as np 5 | import plotly.graph_objects as go 6 | 7 | from ._common_plot import PlotlyCommon, PlotlyCount 8 | 9 | 10 | def wrap_labels(labels, wrap): 11 | return [textwrap.fill(label, wrap).replace('\n', '
') for label in labels] 12 | 13 | 14 | def line_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 15 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 16 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True, 17 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', 18 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None): 19 | 20 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 21 | x_order, y_order, split_order, row_order, col_order, 22 | orientation, sort_values, wrap, figsize, title, sharex, 23 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 24 | x_textwrap, y_textwrap, x_rot, y_rot) 25 | 26 | showlegend = True 27 | for (row, col), info in self.final_data.items(): 28 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 29 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col, 30 | marker_color=self.colors[i % len(self.colors)], 31 | showlegend=showlegend) 32 | showlegend = False 33 | return self.fig 34 | 35 | 36 | def scatter_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 37 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 38 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True, 39 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', 40 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None): 41 | 42 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 43 | x_order, y_order, split_order, row_order, col_order, 44 | orientation, sort_values, wrap, figsize, title, sharex, 45 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 46 | x_textwrap, y_textwrap, x_rot, y_rot) 47 | 48 | showlegend = True 49 | for (row, col), info in self.final_data.items(): 50 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 51 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col, 52 | marker_color=self.colors[i % len(self.colors)], 53 | showlegend=showlegend, mode='markers') 54 | showlegend = False 55 | return self.fig 56 | 57 | 58 | def bar_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 59 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 60 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, 61 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, 62 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, 63 | y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2, 64 | groupgap=0, bar_kwargs=None): 65 | 66 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 67 | x_order, y_order, split_order, row_order, col_order, 68 | orientation, sort_values, wrap, figsize, title, sharex, 69 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 70 | x_textwrap, y_textwrap, x_rot, y_rot) 71 | 72 | showlegend = self.split is not None 73 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap) 74 | for (row, col), info in self.final_data.items(): 75 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 76 | if len(x) > 200: 77 | warnings.warn('You are plotting more than 200 bars. ' 78 | 'Did you forget to provide an `aggfunc`?') 79 | 80 | self.fig.add_bar(x=x, y=y, orientation=self.orientation, 81 | name=label, row=row, col=col, 82 | marker_color=self.colors[i % len(self.colors)], 83 | showlegend=showlegend) 84 | showlegend = False 85 | 86 | return self.fig 87 | 88 | 89 | def count_plotly(val, data=None, normalize=False, split=None, row=None, col=None, 90 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 91 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None, 92 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, 93 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, 94 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0, 95 | bar_kwargs=None): 96 | 97 | x, y = (val, None) if orientation == 'v' else (None, val) 98 | aggfunc = '__distribution__' 99 | self = PlotlyCount(x, y, data, aggfunc, split, row, col, 100 | x_order, y_order, split_order, row_order, col_order, 101 | orientation, None, wrap, figsize, title, sharex, 102 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 103 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count') 104 | 105 | count_dict = self.get_count_dict(normalize) 106 | showlegend = self.split is not None 107 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap) 108 | for (row, col), df in count_dict.items(): 109 | if sort_values == 'asc' and not (self.split or self.row or self.col): 110 | df = df.iloc[::-1] 111 | 112 | labels = df.index.values 113 | for i, column in enumerate(df.columns): 114 | values = df[column].values 115 | x, y = (labels, values) if self.orientation == 'v' else (values, labels) 116 | self.fig.add_bar(x=x, y=y, orientation=self.orientation, name=column, 117 | row=row, col=col, marker_color=self.colors[i % len(self.colors)], 118 | showlegend=showlegend) 119 | showlegend = False 120 | return self.fig 121 | 122 | 123 | def box_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None, 124 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 125 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 126 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 127 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', 128 | gap=.2, groupgap=0, box_kwargs=None): 129 | 130 | aggfunc = None 131 | sort_values = None 132 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 133 | x_order, y_order, split_order, row_order, col_order, 134 | orientation, sort_values, wrap, figsize, title, sharex, 135 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 136 | x_textwrap, y_textwrap, x_rot, y_rot) 137 | 138 | showlegend = self.split is not None 139 | self.fig.update_layout(boxmode=mode, boxgap=gap, boxgroupgap=groupgap) 140 | for (row, col), info in self.final_data.items(): 141 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 142 | self.fig.add_box(x=x, y=y, orientation=self.orientation, 143 | name=label, row=row, col=col, 144 | marker_color=self.colors[i % len(self.colors)], 145 | showlegend=showlegend) 146 | showlegend = False 147 | 148 | return self.fig 149 | 150 | def violin_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None, 151 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 152 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 153 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 154 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', 155 | gap=.2, groupgap=0, box_kwargs=None): 156 | 157 | aggfunc = None 158 | sort_values = None 159 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 160 | x_order, y_order, split_order, row_order, col_order, 161 | orientation, sort_values, wrap, figsize, title, sharex, 162 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 163 | x_textwrap, y_textwrap, x_rot, y_rot) 164 | 165 | showlegend = self.split is not None 166 | self.fig.update_layout(violinmode=mode, violingap=gap, violingroupgap=groupgap) 167 | for (row, col), info in self.final_data.items(): 168 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 169 | self.fig.add_violin(x=x, y=y, orientation=self.orientation, 170 | name=label, row=row, col=col, 171 | marker_color=self.colors[i % len(self.colors)], 172 | showlegend=showlegend) 173 | showlegend = False 174 | 175 | return self.fig 176 | 177 | def kde_plotly(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None, 178 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None, 179 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, 180 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, 181 | y_textwrap=None, x_rot=None, y_rot=None, range=None, cumulative=False): 182 | 183 | aggfunc = None 184 | sort_values = None 185 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 186 | x_order, y_order, split_order, row_order, col_order, 187 | orientation, sort_values, wrap, figsize, title, sharex, 188 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 189 | x_textwrap, y_textwrap, x_rot, y_rot) 190 | 191 | showlegend = self.split is not None 192 | from ._utils import calculate_density_1d, calculate_density_2d 193 | 194 | x_order = y_order = None 195 | # x, y = (x, None) if orientation == 'v' else (None, x) 196 | 197 | if x is not None and y is not None and split is not None: 198 | raise ValueError('Cannot use `split` for 2-dimensional KDE plots') 199 | 200 | aggfunc = '__distribution__' if y is None else None 201 | sort_values = None 202 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col, 203 | x_order, y_order, split_order, row_order, col_order, 204 | orientation, sort_values, wrap, figsize, title, sharex, 205 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 206 | x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True) 207 | 208 | for ax, info in self.final_data.items(): 209 | for vals in info: 210 | if aggfunc == '__distribution__': 211 | x, split_label = vals[:2] 212 | x, y = calculate_density_1d(x, cumulative=cumulative) 213 | x, y = (x, y) if self.orientation == 'v' else (y, x) 214 | self.fig.add_scatter(x=x, y=y, name=split_label, row=row, col=col, 215 | marker_color=self.colors[i % len(self.colors)], 216 | showlegend=showlegend) 217 | else: 218 | x, y, split_label = vals[:3] 219 | xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y) 220 | ax.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto') 221 | 222 | showlegend = False 223 | 224 | return self.fig 225 | -------------------------------------------------------------------------------- /dexplot/_plots.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | 8 | from ._common_plot import MPLCommon, MPLCount 9 | 10 | 11 | def get_bar_kwargs(bar_kwargs): 12 | default_bar_kwargs = {'ec': 'white', 'alpha': .9} 13 | if bar_kwargs is None: 14 | bar_kwargs = default_bar_kwargs 15 | else: 16 | try: 17 | bar_kwargs = {**default_bar_kwargs, **bar_kwargs} 18 | except: 19 | raise TypeError('`bar_kwargs` must be a dictionary') 20 | return bar_kwargs 21 | 22 | 23 | def verify_gap_args(mode, gap, groupgap): 24 | if mode not in ('group', 'stack', 'overlay', 'relative'): 25 | raise ValueError("`moe` must be one of 'group', 'stack', 'overlay', 'relative'") 26 | if gap < 0 or gap >= 1: 27 | raise ValueError('`gap` must be greater than or equal to 0 and less than 1') 28 | if groupgap < 0 or groupgap >= 1: 29 | raise ValueError('`groupgap` must be greater than or equal to 0 and less than 1') 30 | 31 | 32 | def get_jump_size(n, mode, gap, groupgap): 33 | total = 1 - gap 34 | jump = total / n 35 | size = jump * (1 - groupgap) 36 | if mode != 'group': 37 | jump = 0 38 | size *= n 39 | return jump, size 40 | 41 | 42 | def line(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 43 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 44 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True, 45 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', 46 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None): 47 | 48 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 49 | x_order, y_order, split_order, row_order, col_order, 50 | orientation, sort_values, wrap, figsize, title, sharex, 51 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 52 | x_textwrap, y_textwrap, x_rot, y_rot) 53 | 54 | marker = 'o' if self.groupby else None 55 | 56 | for ax, info in self.final_data.items(): 57 | for x, y, label, col_name, row_label, col_label in info: 58 | x_plot, y_plot = self.get_x_y_plot(x, y) 59 | ax.plot(x_plot, y_plot, label=label, marker=marker) 60 | 61 | if self.groupby: 62 | ticklabels = x if self.orientation == 'v' else y 63 | self.add_ticklabels(ticklabels, ax) 64 | 65 | self.add_legend(label) 66 | if x.dtype == 'O' or y.dtype == 'O': 67 | self.update_fig_size(len(x), 1) 68 | return self.clean_up() 69 | 70 | 71 | def scatter(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 72 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 73 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True, 74 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', 75 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, 76 | regression=False): 77 | 78 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 79 | x_order, y_order, split_order, row_order, col_order, 80 | orientation, sort_values, wrap, figsize, title, sharex, 81 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 82 | x_textwrap, y_textwrap, x_rot, y_rot) 83 | 84 | alpha = 1 if self.groupby else .7 85 | 86 | for ax, info in self.final_data.items(): 87 | for x, y, label, col_name, row_label, col_label in info: 88 | x_plot, y_plot = self.get_x_y_plot(x, y) 89 | ax.scatter(x_plot, y_plot, label=label, alpha=alpha) 90 | if regression: 91 | slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) 92 | x_line = np.array([x.min(), x.max()]) 93 | y_line = x_line * slope + intercept 94 | ax.plot(x_line, y_line) 95 | if self.groupby: 96 | ticklabels = x if self.orientation == 'v' else y 97 | self.add_ticklabels(ticklabels, ax) 98 | 99 | self.add_legend(label) 100 | if x.dtype == 'O' or y.dtype == 'O': 101 | self.update_fig_size(len(x), 1) 102 | return self.clean_up() 103 | 104 | 105 | def bar(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None, 106 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 107 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, 108 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, 109 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, 110 | y_textwrap=None, x_rot=None, y_rot=None, mode='group', 111 | gap=.2, groupgap=0, bar_kwargs=None): 112 | 113 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 114 | x_order, y_order, split_order, row_order, col_order, 115 | orientation, sort_values, wrap, figsize, title, sharex, 116 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 117 | x_textwrap, y_textwrap, x_rot, y_rot) 118 | 119 | bar_kwargs = get_bar_kwargs(bar_kwargs) 120 | verify_gap_args(mode, gap, groupgap) 121 | for ax, info in self.final_data.items(): 122 | jump, size = get_jump_size(len(info), mode, gap, groupgap) 123 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info): 124 | x_plot, y_plot = self.get_x_y_plot(x, y) 125 | if i == 0: 126 | base = np.zeros(len(x_plot)) 127 | if len(x) > 200: 128 | warnings.warn('You are plotting more than 200 bars. ' 129 | 'Did you forget to provide an `aggfunc`?') 130 | 131 | if self.orientation == 'v': 132 | x_plot = x_plot + jump * i 133 | ax.bar(x_plot, y_plot, label=label, width=size, 134 | bottom=base, align='edge', **bar_kwargs) 135 | if mode == 'stack': 136 | base += y_plot 137 | else: 138 | y_plot = y_plot - jump * (i + 1) 139 | ax.barh(y_plot, x_plot, label=label, height=size, 140 | left=base, align='edge', **bar_kwargs) 141 | if mode == 'stack': 142 | base += x_plot 143 | ticklabels = x if self.orientation == 'v' else y 144 | delta = jump * (i + 1) / 2 if mode == 'group' else size / 2 145 | self.add_ticklabels(ticklabels, ax, delta=delta) 146 | 147 | self.add_legend(label) 148 | self.update_fig_size(len(info), len(x)) 149 | return self.clean_up() 150 | 151 | 152 | def count(val, data=None, normalize=False, split=None, row=None, col=None, 153 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None, 154 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None, 155 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, 156 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, 157 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0, 158 | bar_kwargs=None): 159 | 160 | bar_kwargs = get_bar_kwargs(bar_kwargs) 161 | verify_gap_args(mode, gap, groupgap) 162 | x, y = (val, None) if orientation == 'v' else (None, val) 163 | aggfunc = '__distribution__' 164 | self = MPLCount(x, y, data, aggfunc, split, row, col, 165 | x_order, y_order, split_order, row_order, col_order, 166 | orientation, None, wrap, figsize, title, sharex, 167 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 168 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count') 169 | 170 | count_dict = self.get_count_dict(normalize) 171 | for ax, df in count_dict.items(): 172 | base = np.zeros(len(df)) 173 | position = np.arange(len(df)) 174 | if sort_values == 'asc' and not (self.split or self.row or self.col): 175 | df = df.iloc[::-1] 176 | 177 | ticklabels = df.index.values 178 | jump, size = get_jump_size(df.shape[1], mode, gap, groupgap) 179 | for col in df.columns: 180 | values = df[col].values 181 | 182 | if self.orientation == 'v': 183 | ax.bar(position, values, label=col, width=size, 184 | bottom=base, align='edge', **bar_kwargs) 185 | position = position + jump 186 | else: 187 | ax.barh(position - cur_size, values, label=col, height=size, 188 | left=base, align='edge', **bar_kwargs) 189 | position = position - jump 190 | 191 | if mode == 'stack': 192 | base += values 193 | 194 | delta = jump * df.shape[1] / 2 if mode == 'group' else size / 2 195 | self.add_ticklabels(ticklabels, ax, delta=delta) 196 | if self.split or len(df.columns) > 1: 197 | self.add_legend(col) 198 | self.update_fig_size(df.shape[1], df.shape[0]) 199 | return self.clean_up() 200 | 201 | 202 | def _common_dist(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None, 203 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 204 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 205 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 206 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, 207 | mode='group', gap=.2, groupgap=0, kind=None, **kwargs): 208 | 209 | aggfunc = '__distribution__' 210 | sort_values = None 211 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 212 | x_order, y_order, split_order, row_order, col_order, 213 | orientation, sort_values, wrap, figsize, title, sharex, 214 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 215 | x_textwrap, y_textwrap, x_rot, y_rot) 216 | 217 | key = 'bodies' if kind == 'violinplot' else 'boxes' 218 | vert = self.orientation == 'v' 219 | for ax, info in self.final_data.items(): 220 | plot_func = getattr(ax, kind) 221 | cur_data, cur_ticklabels = self.get_distribution_data(info) 222 | 223 | handles = [] 224 | split_labels = [] 225 | n_splits = len(cur_data) 226 | widths = min(.5 + .15 * n_splits, .9) / n_splits 227 | n_boxes = len(info) 228 | n = len(next(iter(cur_data.values()))) # number of groups 229 | markersize = max(6 - n_boxes // 5, 2) 230 | jump, size = get_jump_size(n, mode, gap, groupgap) 231 | for i, (split_label, data) in enumerate(cur_data.items()): 232 | filt = [len(arr) > 0 for arr in data] 233 | positions = np.array([i for (i, f) in enumerate(filt) if f]) 234 | data = [np.array(d) for (d, f) in zip(data, filt) if f] 235 | if self.orientation == 'h': 236 | positions = positions - i * widths 237 | else: 238 | positions = positions + i * widths 239 | 240 | if kind == 'boxplot': 241 | kwargs['boxprops'] = {'facecolor': self.colors[i % len(self.colors)] , 242 | 'edgecolor': 'black'} 243 | kwargs['flierprops'] = {'markersize': markersize} 244 | 245 | ret = plot_func(data, vert=vert, positions=positions, widths=widths, **kwargs) 246 | 247 | if kind == 'violinplot': 248 | for k in ['cmeans', 'cmins', 'cmaxes', 'cbars', 'cmedians', 'cquantiles']: 249 | if k in ret: 250 | ret[k].set_linewidth(1) 251 | for body in ret['bodies']: 252 | body.set_alpha(.8) 253 | 254 | handles.append(ret[key][0]) 255 | split_labels.append(split_label) 256 | 257 | delta = (n_splits / 2 - .5) * widths 258 | ticklabels = cur_ticklabels[split_label] 259 | self.add_ticklabels(ticklabels, ax, delta=delta) 260 | 261 | self.add_legend(self.split, handles, split_labels) 262 | self.update_fig_size(n_splits, n) 263 | return self.clean_up() 264 | 265 | # could add groupby to box 266 | def box(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None, 267 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 268 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 269 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 270 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2, 271 | groupgap=0, box_kwargs=None): 272 | 273 | kwargs = dict(notch=None, sym=None, whis=None, 274 | patch_artist=True, bootstrap=None, usermedians=None, conf_intervals=None, meanline=None, 275 | showmeans=None, showcaps=None, showbox=None, showfliers=None, boxprops=None, labels=None, 276 | flierprops=None, medianprops=None, meanprops=None, capprops=None, whiskerprops=None, 277 | manage_ticks=True, autorange=False, zorder=None) 278 | 279 | if kwargs['medianprops'] is None: 280 | kwargs['medianprops'] = {'color': '.2'} 281 | 282 | # kwargs = dict(notch=notch, sym=sym, whis=whis, patch_artist=patch_artist, 283 | # bootstrap=bootstrap, usermedians=usermedians, conf_intervals=conf_intervals, 284 | # meanline=meanline, showmeans=showmeans, showcaps=showcaps, showbox=showbox, 285 | # showfliers=showfliers, boxprops=boxprops, labels=labels, flierprops=flierprops, 286 | # medianprops=medianprops, meanprops=meanprops, capprops=capprops, 287 | # whiskerprops=whiskerprops, manage_ticks=manage_ticks, 288 | # autorange=autorange, zorder=zorder) 289 | 290 | return _common_dist(x, y, data, split, row, col, x_order, y_order, split_order, 291 | row_order, col_order, orientation, wrap, figsize, title, 292 | sharex, sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 293 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap, 294 | kind='boxplot', **kwargs) 295 | 296 | 297 | def violin(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None, 298 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h', 299 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None, 300 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None, 301 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2, 302 | groupgap=0, violin_kwargs=None): 303 | 304 | kwargs = dict(showmeans=False, showextrema=True, showmedians=True, 305 | quantiles=None, points=100, bw_method=None) 306 | 307 | # kwargs = dict(showmeans=showmeans, showextrema=showextrema, showmedians=showmedians, 308 | # quantiles=quantiles, points=points, bw_method=bw_method) 309 | 310 | return _common_dist(x, y, data, split, row, col, 311 | x_order, y_order, split_order, row_order, col_order, 312 | orientation, wrap, figsize, title, sharex, 313 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 314 | x_textwrap, y_textwrap, x_rot, y_rot, kind='violinplot', **kwargs) 315 | 316 | 317 | def hist(val, data=None, split=None, row=None, col=None, split_order=None, row_order=None, 318 | col_order=None, orientation='v', wrap=None, figsize=None, title=None, 319 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear', 320 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, 321 | mode='group', gap=.2, groupgap=0, hist_kwargs=None): 322 | 323 | hist_kwargs = dict(bins=None, range=None, density=False, weights=None, cumulative=False, 324 | bottom=None, histtype='bar', align='mid', rwidth=None, log=False) 325 | 326 | x_order = y_order = None 327 | x, y = (val, None) if orientation == 'v' else (None, val) 328 | bins = bins if bins else 20 329 | kwargs = dict(bins=bins, range=range, density=density, weights=weights, 330 | cumulative=cumulative, bottom=bottom, histtype=histtype, align=align, 331 | rwidth=rwidth, log=log) 332 | 333 | aggfunc = '__distribution__' 334 | sort_values = None 335 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 336 | x_order, y_order, split_order, row_order, col_order, 337 | orientation, sort_values, wrap, figsize, title, sharex, 338 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 339 | x_textwrap, y_textwrap, x_rot, y_rot) 340 | 341 | orientation = 'vertical' if self.orientation == 'v' else 'horizontal' 342 | for ax, info in self.final_data.items(): 343 | cur_data, cur_ticklabels = self.get_distribution_data(info) 344 | 345 | handles = [] 346 | split_labels = [] 347 | n_splits = len(cur_data) 348 | n = len(next(iter(cur_data.values()))) # number of groups 349 | for split_label, data in cur_data.items(): 350 | filt = [len(arr) > 0 for arr in data] 351 | vals = [d for (d, f) in zip(data, filt) if f] 352 | ret = ax.hist(vals, orientation=orientation, alpha=.8, **kwargs) 353 | handles.append(ret[-1][0]) 354 | split_labels.append(split_label) 355 | 356 | self.add_legend(self.split, handles, split_labels) 357 | # self.update_fig_size(n_splits, n) 358 | return self.clean_up() 359 | 360 | 361 | def kde(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None, 362 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None, 363 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, 364 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, 365 | x_rot=None, y_rot=None, range=None, cumulative=False): 366 | 367 | from ._utils import calculate_density_1d, calculate_density_2d 368 | 369 | x_order = y_order = None 370 | # x, y = (x, None) if orientation == 'v' else (None, x) 371 | kwargs = dict(range=range, cumulative=cumulative) 372 | 373 | if x is not None and y is not None and split is not None: 374 | raise ValueError('Cannot use `split` for 2-dimensional KDE plots') 375 | 376 | aggfunc = '__distribution__' if y is None else None 377 | sort_values = None 378 | self = MPLCommon(x, y, data, aggfunc, split, row, col, 379 | x_order, y_order, split_order, row_order, col_order, 380 | orientation, sort_values, wrap, figsize, title, sharex, 381 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 382 | x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True) 383 | 384 | for ax, info in self.final_data.items(): 385 | for vals in info: 386 | if aggfunc == '__distribution__': 387 | x, split_label = vals[:2] 388 | x, y = calculate_density_1d(x, cumulative=cumulative) 389 | x, y = (x, y) if self.orientation == 'v' else (y, x) 390 | ax.plot(x, y, label=split_label) 391 | else: 392 | x, y, split_label = vals[:3] 393 | xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y) 394 | ax.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto') 395 | 396 | self.add_legend(self.split) 397 | # self.update_fig_size(n_splits, n) 398 | return self.clean_up() 399 | 400 | xy_doc = """ 401 | x : str, default None 402 | Column name of DataFrame whose values will go along the x-axis 403 | 404 | y : str, default None 405 | Column name of DataFrame whose values will go along the y-axis 406 | """ 407 | 408 | val_doc = """ 409 | val : str, default None 410 | Column name of DataFrame whose values will be used for distribution 411 | """ 412 | 413 | aggfunc_doc = """ 414 | aggfunc : str or function, default None 415 | Kind of aggregation to perform. Use a string that the DataFrame `agg` 416 | method understands. If providing a function, it will also be passed to 417 | the `agg` method. 418 | 419 | The strings 'countna' and 'percna' are also available to find the 420 | number and percentage of missing values. 421 | """ 422 | 423 | xy_order = """ 424 | x_order : str or list, default None 425 | Used as both a way to order and filter the x-values. Use the strings 426 | 'asc'/'desc' to order ascending or descending. 427 | 428 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']` 429 | 430 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will 431 | filter for the most/least frequent groups. 432 | 433 | By default, sorting happens in ascending order. 434 | 435 | y_order : str or list, default None 436 | See x_order 437 | 438 | split_order : str or list, default None 439 | See x_order 440 | 441 | row_order : str or list, default None 442 | See x_order 443 | 444 | col_order : str or list, default None 445 | See x_order 446 | """ 447 | 448 | split_order = """ 449 | split_order : str or list, default None 450 | Used as both a way to order and filter the x-values. Use the strings 451 | 'asc'/'desc' to order ascending or descending. 452 | 453 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']` 454 | 455 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will 456 | filter for the most/least frequent groups. 457 | 458 | By default, sorting happens in ascending order. 459 | 460 | row_order : str or list, default None 461 | See split_order 462 | 463 | col_order : str or list, default None 464 | See split_order 465 | """ 466 | 467 | sort_values_doc = """ 468 | sort_values : str - 'asc' or 'desc', default None 469 | Sort the values ascending or descending. If this is given, then 470 | x/y_order is ignored. 471 | """ 472 | 473 | doc = \ 474 | """ 475 | {plot_doc} 476 | 477 | Parameters 478 | ---------- 479 | {xy} 480 | data : DataFrame or Series, default None 481 | A pandas DataFrame with long or wide data. If provided a Series, do not 482 | supply x or y. 483 | {aggfunc} 484 | split : str, default None 485 | Column name that will be used in the DataFrame `groupby` method to 486 | split the data into independent groups within a single plot 487 | 488 | row : str 489 | Column name that will be used in the DataFrame `groupby` method to 490 | split the data into independent groups to form new plots. Each unique value 491 | in the `row` column forms a new row of plots. 492 | 493 | col : str 494 | Column name that will be used in the DataFrame `groupby` method to 495 | split the data into independent groups to form new plots. Each unique value 496 | in the `row` column forms a new row of plots. 497 | {order} 498 | orientation : str 'v' or 'h' 499 | Choose the orientation of the plots. By default, they are vertical 500 | ('v'), except for box and violin plots, which are horizontal. 501 | {sort_values} 502 | wrap : int, default None 503 | When using either `row` or either `col`, but not both, determines the 504 | maximum number of rows/cols before a new row/col is used. 505 | 506 | figsize : tuple, default None 507 | A tuple of numbers used passed to the `figsize` matplotlib parameter. 508 | By default, the figure size will be determined based on the kind of 509 | plot produced. 510 | 511 | title : str 512 | Sets the figure title NOT the Axes title 513 | 514 | sharex : bool 515 | Whether all plots should share the x-axis or not. Default is True 516 | 517 | sharey : bool 518 | Whether all plots should share the y-axis or not. Default is True 519 | 520 | xlabel : str 521 | Label used for x-axis on figures with a single plot 522 | 523 | ylabel : str 524 | Label used for y-axis on figures with a single plot 525 | 526 | xlim : 2-item tuple of numbers 527 | Determines x-axis limits for figures with a single plot 528 | 529 | ylim : 2-item tuple of numbers 530 | Determines y-axis limits for figures with a single plot 531 | 532 | xscale : 'linear', 'log', 'symlog', 'logit' 533 | Sets the scale of the x-axis. 534 | 535 | yscale : 'linear', 'log', 'symlog', 'logit' 536 | Sets the scale of the y-axis 537 | 538 | cmap : str or matplotlib colormap instance, default None 539 | 540 | x_textwrap : int, default 10 541 | Number of characters before wrapping text for x-labels 542 | 543 | y_textwrap : int, default None 544 | Number of characters before wrapping text for y-labels 545 | 546 | x_rot : int or float, default None 547 | Degree of rotation of x-tick labels. If between 0 and 180 548 | horizontal_alignment is set to 'right', otherwise 'left' 549 | 550 | y_rot : int or float, default None 551 | Degree of rotation of y-tick labels. If between 0 and 180 552 | vertical_alignment is set to 'top', otherwise 'bottom' 553 | 554 | mode : str 555 | 556 | gap : float 557 | 558 | groupgap : float 559 | 560 | Returns 561 | ------- 562 | A Matplotlib Figure instance 563 | """ 564 | 565 | 566 | # line doc 567 | line_doc = """\ 568 | Create line plots 569 | """ 570 | 571 | scatter_doc = """\ 572 | Create scatter plots 573 | """ 574 | 575 | bar_doc = """\ 576 | Create bar plots 577 | """ 578 | 579 | count_doc = """\ 580 | Create count plots 581 | """ 582 | 583 | box_doc = """\ 584 | Create box plots 585 | """ 586 | 587 | violin_doc = """\ 588 | Create violin plots 589 | """ 590 | 591 | hist_doc = """\ 592 | Create histograms 593 | """ 594 | 595 | kde_doc = """\ 596 | Create kernel density estimate plots 597 | """ 598 | 599 | line.__doc__ = doc.format(plot_doc=line_doc, xy=xy_doc, aggfunc=aggfunc_doc, 600 | order=xy_order, sort_values=sort_values_doc) 601 | 602 | scatter.__doc__ = doc.format(plot_doc=scatter_doc, xy=xy_doc, aggfunc=aggfunc_doc, 603 | order=xy_order, sort_values=sort_values_doc) 604 | 605 | bar.__doc__ = doc.format(plot_doc=bar_doc, xy=xy_doc, aggfunc=aggfunc_doc, 606 | order=xy_order, sort_values=sort_values_doc) 607 | 608 | count.__doc__ = doc.format(plot_doc=count_doc, xy=val_doc, aggfunc='', 609 | order=split_order, sort_values=sort_values_doc) 610 | 611 | box.__doc__ = doc.format(plot_doc=box_doc, xy=xy_doc, aggfunc='', 612 | order=xy_order, sort_values='') 613 | 614 | violin.__doc__ = doc.format(plot_doc=violin_doc, xy=xy_doc, aggfunc='', 615 | order=xy_order, sort_values='') 616 | 617 | hist.__doc__ = doc.format(plot_doc=hist_doc, xy=val_doc, aggfunc='', 618 | order=split_order, sort_values='') 619 | 620 | kde.__doc__ = doc.format(plot_doc=kde_doc, xy=val_doc, aggfunc='', 621 | order=split_order, sort_values='') 622 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Dexplot 2 | 3 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience. 4 | 5 | ## Goals 6 | 7 | The primary goals for dexplot are: 8 | 9 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots 10 | * Allow the user tremendous power without using matplotlib 11 | 12 | 13 | ## Installation 14 | 15 | `pip install dexplot` 16 | 17 | ## Built for long and wide data 18 | 19 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data. 20 | 21 | ![](images/long.png) 22 | 23 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price). 24 | 25 | ![](images/wide.png) 26 | 27 | ## Usage 28 | 29 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature: 30 | 31 | ```python 32 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...) 33 | ``` 34 | 35 | * `x` - Column name along the x-axis 36 | * `y` - Column name the y-axis 37 | * `data` - Pandas DataFrame 38 | * `aggfunc` - String of pandas aggregation function, 'min', 'max', 'mean', etc... 39 | * `split` - Column name to split data into distinct groups 40 | * `row` - Column name to split data into distinct subplots row-wise 41 | * `col` - Column name to split data into distinct subplots column-wise 42 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical. 43 | 44 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below. 45 | 46 | ## Families of plots 47 | 48 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner. 49 | 50 | * Aggregation 51 | * bar 52 | * line 53 | * scatter 54 | * count 55 | * Distribution 56 | * box 57 | * violin 58 | * hist 59 | * kde 60 | 61 | ## Comparison with Seaborn 62 | 63 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by Seaborn. Below is a list of the extra features in dexplot not found in seaborn 64 | 65 | * Ability to graph relative frequency and normalize over any number of variables 66 | * No need for multiple functions to do the same thing (far fewer public functions) 67 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot` 68 | * Pandas `groupby` methods available as strings 69 | * Ability to sort by values 70 | * Ability to sort x/y labels lexicographically 71 | * Ability to select most/least frequent groups 72 | * x/y labels are wrapped so that they don't overlap 73 | * Figure size (plus several other options) and available to change without using matplotlib 74 | * A matplotlib figure object is returned 75 | 76 | ## Examples 77 | 78 | Most of the examples below use long data. 79 | 80 | ## Aggregating plots - bar, line and scatter 81 | 82 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area. 83 | 84 | 85 | ```python 86 | import dexplot as dxp 87 | import pandas as pd 88 | airbnb = dxp.load_dataset('airbnb') 89 | airbnb.head() 90 | ``` 91 | 92 | 93 | 94 | 95 |
96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 |
neighborhoodproperty_typeaccommodatesbathroomsbedroomspricecleaning_feeratingsuperhostresponse_timelatitudelongitude
0ShawTownhouse163.5443325095.0Nowithin an hour38.90982-77.02016
1Brightwood ParkTownhouse43.541545097.0NoNaN38.95888-77.02554
2Capitol HillHouse21.51833597.0Yeswithin an hour38.88791-76.99668
3ShawHouse22.51475098.0NoNaN38.91331-77.02436
4Kalorama HeightsApartment31.011181591.0Nowithin an hour38.91933-77.04124
192 |
193 | 194 | 195 | 196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data. 197 | 198 | 199 | ```python 200 | airbnb.shape 201 | ``` 202 | 203 | 204 | 205 | 206 | (4581, 12) 207 | 208 | 209 | 210 | ### Vertical bar charts 211 | 212 | In order to performa an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the column names automatically wrap. 213 | 214 | 215 | ```python 216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median') 217 | ``` 218 | 219 | 220 | 221 | 222 | ![png](images/output_7_0.png) 223 | 224 | 225 | 226 | Line and scatter plots can be created with the same command, just substituting the name of the function. They both are not good choices for the visualization since the grouping variable (neighborhood) has no meaningful order. 227 | 228 | 229 | ```python 230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median') 231 | ``` 232 | 233 | 234 | 235 | 236 | ![png](images/output_9_0.png) 237 | 238 | 239 | 240 | 241 | ```python 242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median') 243 | ``` 244 | 245 | 246 | 247 | 248 | ![png](images/output_10_0.png) 249 | 250 | 251 | 252 | ### Components of the groupby aggregation 253 | 254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components: 255 | 256 | * Grouping column - unique values of this column form independent groups (neighborhood) 257 | * Aggregating column - the column that will get summarized with a single value (price) 258 | * Aggregating function - a function that returns a single value (median) 259 | 260 | The general format for doing this in pandas is: 261 | 262 | ```python 263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'}) 264 | ``` 265 | 266 | Specifically, the following code is executed within dexplot. 267 | 268 | 269 | ```python 270 | airbnb.groupby('neighborhood').agg({'price': 'median'}) 271 | ``` 272 | 273 | 274 | 275 | 276 |
277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 |
price
neighborhood
Brightwood Park87.0
Capitol Hill129.5
Columbia Heights95.0
Dupont Circle125.0
Edgewood100.0
Kalorama Heights118.0
Shaw133.5
Union Station120.0
323 |
324 | 325 | 326 | 327 | ### Number and percent of missing values with `'countna'` and `'percna'` 328 | 329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group. 330 | 331 | 332 | ```python 333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna') 334 | ``` 335 | 336 | 337 | 338 | 339 | ![png](images/output_14_0.png) 340 | 341 | 342 | 343 | ### Sorting the bars by values 344 | 345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value. 346 | 347 | * None - sort x/y axis labels alphabetically (default) 348 | * `asc` - sort values from least to greatest 349 | * `desc` - sort values from greatest to least 350 | 351 | 352 | ```python 353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc') 354 | ``` 355 | 356 | 357 | 358 | 359 | ![png](images/output_16_0.png) 360 | 361 | 362 | 363 | Here, we sort the values from greatest to least. 364 | 365 | 366 | ```python 367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc') 368 | ``` 369 | 370 | 371 | 372 | 373 | ![png](images/output_18_0.png) 374 | 375 | 376 | 377 | ### Specify order with `x_order` 378 | 379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars. 380 | 381 | 382 | ```python 383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station']) 385 | ``` 386 | 387 | 388 | 389 | 390 | ![png](images/output_20_0.png) 391 | 392 | 393 | 394 | By default, `x_order` and all of the `_order` parameters are set to `'asc'` by default, which will order them alphabetically. Use the string `'desc'` to sort in the opposite direction. 395 | 396 | 397 | ```python 398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc') 399 | ``` 400 | 401 | 402 | 403 | 404 | ![png](images/output_22_0.png) 405 | 406 | 407 | 408 | ### Filter for the neighborhoods with most/least frequency of occurrence 409 | 410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column. 411 | 412 | 413 | ```python 414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 415 | x_order='top 4') 416 | ``` 417 | 418 | 419 | 420 | 421 | ![png](images/output_24_0.png) 422 | 423 | 424 | 425 | We can verify that the four neighborhoods are the most common. 426 | 427 | 428 | ```python 429 | airbnb['neighborhood'].value_counts() 430 | ``` 431 | 432 | 433 | 434 | 435 | Columbia Heights 773 436 | Union Station 713 437 | Capitol Hill 654 438 | Edgewood 610 439 | Dupont Circle 549 440 | Shaw 514 441 | Brightwood Park 406 442 | Kalorama Heights 362 443 | Name: neighborhood, dtype: int64 444 | 445 | 446 | 447 | ### Horizontal bars 448 | 449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis. 450 | 451 | 452 | ```python 453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', 454 | orientation='h', sort_values='desc') 455 | ``` 456 | 457 | 458 | 459 | 460 | ![png](images/output_28_0.png) 461 | 462 | 463 | 464 | Switching orientation is possible for most other plots. 465 | 466 | 467 | ```python 468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h') 469 | ``` 470 | 471 | 472 | 473 | 474 | ![png](images/output_30_0.png) 475 | 476 | 477 | 478 | ### Split bars into groups 479 | 480 | You can split each bar into further groups by setting the `split` parameter to another column. 481 | 482 | 483 | ```python 484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost') 485 | ``` 486 | 487 | 488 | 489 | 490 | ![png](images/output_32_0.png) 491 | 492 | 493 | 494 | We can use the `pivot_table` method to verify the results in pandas. 495 | 496 | 497 | ```python 498 | airbnb.pivot_table(index='superhost', columns='neighborhood', 499 | values='price', aggfunc='median') 500 | ``` 501 | 502 | 503 | 504 | 505 |
506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
neighborhoodBrightwood ParkCapitol HillColumbia HeightsDupont CircleEdgewoodKalorama HeightsShawUnion Station
superhost
No85.0129.090.5120.0100.0110.0130.0120.0
Yes90.0130.0103.0135.0100.0124.0135.0125.0
556 |
557 | 558 | 559 | 560 | Set the order of the unique split values with `split_order`, which can also act as a filter. 561 | 562 | 563 | ```python 564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 565 | split='superhost', split_order=['Yes', 'No']) 566 | ``` 567 | 568 | 569 | 570 | 571 | ![png](images/output_36_0.png) 572 | 573 | 574 | 575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite. 576 | 577 | 578 | ```python 579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 580 | split='property_type', split_order='desc') 581 | ``` 582 | 583 | 584 | 585 | 586 | ![png](images/output_38_0.png) 587 | 588 | 589 | 590 | Filtering for the most/least frequent split categories is possible. 591 | 592 | 593 | ```python 594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 595 | split='property_type', split_order='bottom 2') 596 | ``` 597 | 598 | 599 | 600 | 601 | ![png](images/output_40_0.png) 602 | 603 | 604 | 605 | Verifying that the least frequent property types are Townhouse and Condominium. 606 | 607 | 608 | ```python 609 | airbnb['property_type'].value_counts() 610 | ``` 611 | 612 | 613 | 614 | 615 | Apartment 2403 616 | House 877 617 | Townhouse 824 618 | Condominium 477 619 | Name: property_type, dtype: int64 620 | 621 | 622 | 623 | ### Stacked bar charts 624 | 625 | Stack all the split groups one on top of the other by setting `stacked` to `True`. 626 | 627 | 628 | ```python 629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 630 | split='superhost', split_order=['Yes', 'No'], stacked=True) 631 | ``` 632 | 633 | 634 | 635 | 636 | ![png](images/output_44_0.png) 637 | 638 | 639 | 640 | ### Split into multiple plots 641 | 642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot. 643 | 644 | 645 | ```python 646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 647 | split='superhost', col='property_type') 648 | ``` 649 | 650 | 651 | 652 | 653 | ![png](images/output_46_0.png) 654 | 655 | 656 | 657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer to set the maximum number of plots per row/col. We also specify the `col_order` to be descending alphabetically. 658 | 659 | 660 | ```python 661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 662 | split='superhost', col='property_type', wrap=2, col_order='desc') 663 | ``` 664 | 665 | 666 | 667 | 668 | ![png](images/output_48_0.png) 669 | 670 | 671 | 672 | Use `col_order` to both filter and set a specific order for the plots. 673 | 674 | 675 | ```python 676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 677 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 678 | ``` 679 | 680 | 681 | 682 | 683 | ![png](images/output_50_0.png) 684 | 685 | 686 | 687 | Splits can be made simultaneously along row and columns. 688 | 689 | 690 | ```python 691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost', 692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 693 | row='bedrooms', row_order=[1, 2, 3]) 694 | ``` 695 | 696 | 697 | 698 | 699 | ![png](images/output_52_0.png) 700 | 701 | 702 | 703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`. 704 | 705 | 706 | ```python 707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost', 708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 709 | row='bedrooms', row_order=[1, 2, 3], sharey=False) 710 | ``` 711 | 712 | 713 | 714 | 715 | ![png](images/output_54_0.png) 716 | 717 | 718 | 719 | ### Set the width of each bar with `size` 720 | 721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value. 722 | 723 | 724 | ```python 725 | dxp.bar(x='neighborhood', y='price', data=airbnb, 726 | aggfunc='median', split='property_type', 727 | split_order=['Apartment', 'House'], 728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5) 729 | ``` 730 | 731 | 732 | 733 | 734 | ![png](images/output_56_0.png) 735 | 736 | 737 | 738 | ### Splitting line plots 739 | 740 | All the other aggregating plots work similarly. 741 | 742 | 743 | ```python 744 | dxp.line(x='neighborhood', y='price', data=airbnb, 745 | aggfunc='median', split='property_type', 746 | split_order=['Apartment', 'House'], 747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station']) 748 | ``` 749 | 750 | 751 | 752 | 753 | ![png](images/output_58_0.png) 754 | 755 | 756 | 757 | ## Distribution plots - box, violin, histogram, kde 758 | 759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed. 760 | 761 | ### Box plots 762 | 763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers are placed at 1.5 times the difference between the third and first quartiles (Interquartile range (IQR)). Fliers are the points outside this range and plotted individually. By default, both box and violin plots are plotted horizontally. 764 | 765 | 766 | ```python 767 | dxp.box(x='price', y='neighborhood', data=airbnb) 768 | ``` 769 | 770 | 771 | 772 | 773 | ![png](images/output_60_0.png) 774 | 775 | 776 | 777 | Split the groups in the same manner as with the aggregation plots. 778 | 779 | 780 | ```python 781 | dxp.box(x='price', y='neighborhood', data=airbnb, 782 | split='superhost', split_order=['Yes', 'No']) 783 | ``` 784 | 785 | 786 | 787 | 788 | ![png](images/output_62_0.png) 789 | 790 | 791 | 792 | Order the appearance of the splits alphabetically (in descending order here). 793 | 794 | 795 | ```python 796 | dxp.box(x='price', y='neighborhood', data=airbnb, 797 | split='property_type', split_order='desc') 798 | ``` 799 | 800 | 801 | 802 | 803 | ![png](images/output_64_0.png) 804 | 805 | 806 | 807 | ### Filter range of values with `x_order` 808 | 809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`. 810 | 811 | 812 | ```python 813 | dxp.box(x='price', y='neighborhood', data=airbnb, 814 | split='superhost', x_order=[50, 250]) 815 | ``` 816 | 817 | 818 | 819 | 820 | ![png](images/output_66_0.png) 821 | 822 | 823 | 824 | Change the `x` and `y` while setting `orientation` to make vertical bar plots. 825 | 826 | 827 | ```python 828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v', 829 | split='property_type', split_order='top 2') 830 | ``` 831 | 832 | 833 | 834 | 835 | ![png](images/output_68_0.png) 836 | 837 | 838 | 839 | Violin plots work identically to box plots, but show "violins", kernel density plots duplicated on both sides of a line. 840 | 841 | 842 | ```python 843 | dxp.violin(x='price', y='neighborhood', data=airbnb, 844 | split='superhost', split_order=['Yes', 'No']) 845 | ``` 846 | 847 | 848 | 849 | 850 | ![png](images/output_70_0.png) 851 | 852 | 853 | 854 | Splitting by rows and columns is possible as well with distribution plots. 855 | 856 | 857 | ```python 858 | dxp.box(x='price', y='neighborhood', data=airbnb,split='superhost', 859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 860 | row='bedrooms', row_order=[1, 2]) 861 | ``` 862 | 863 | 864 | 865 | 866 | ![png](images/output_72_0.png) 867 | 868 | 869 | 870 | ### Histograms 871 | 872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you give it a single numeric column. A vertical histogram with 20 bins of the counts is created by default. 873 | 874 | 875 | ```python 876 | dxp.hist(val='price', data=airbnb) 877 | ``` 878 | 879 | 880 | 881 | 882 | ![png](images/output_74_0.png) 883 | 884 | 885 | 886 | We can use `split` just like we did above and also create horizontal histograms. 887 | 888 | 889 | ```python 890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15) 891 | ``` 892 | 893 | 894 | 895 | 896 | ![png](images/output_76_0.png) 897 | 898 | 899 | 900 | Here, we customize our histogram by plotting the cumulative density as opposed to the raw frequency count using the outline of the bars ('step'). 901 | 902 | 903 | ```python 904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3], 905 | bins=30, density=True, histtype='step', cumulative=True) 906 | ``` 907 | 908 | 909 | 910 | 911 | ![png](images/output_78_0.png) 912 | 913 | 914 | 915 | ### KDE Plots 916 | 917 | Kernel density estimates provide an estimate for the probability distribution of a continuous variable. Here, we examine how price is distributed by bedroom. 918 | 919 | 920 | ```python 921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3]) 922 | ``` 923 | 924 | 925 | 926 | 927 | ![png](images/output_80_0.png) 928 | 929 | 930 | 931 | Graph the cumulative distribution instead on multiple plots. 932 | 933 | 934 | ```python 935 | dxp.kde(x='price', data=airbnb, split='bedrooms', 936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2) 937 | ``` 938 | 939 | 940 | 941 | 942 | ![png](images/output_82_0.png) 943 | 944 | 945 | 946 | ### Two-dimensional KDE's 947 | 948 | Provide two numeric columns to `x` and `y` to get a two dimensional KDE. 949 | 950 | 951 | ```python 952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb) 953 | ``` 954 | 955 | 956 | 957 | 958 | ![png](images/output_84_0.png) 959 | 960 | 961 | 962 | Create a grid of two-dimensional KDE's. 963 | 964 | 965 | ```python 966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3) 967 | ``` 968 | 969 | 970 | 971 | 972 | ![png](images/output_86_0.png) 973 | 974 | 975 | 976 | ## Count plots 977 | 978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order. 979 | 980 | 981 | ```python 982 | dxp.count(val='neighborhood', data=airbnb) 983 | ``` 984 | 985 | 986 | 987 | 988 | ![png](images/output_88_0.png) 989 | 990 | 991 | 992 | In pandas, this is a straightforward call to the `value_counts` method. 993 | 994 | 995 | ```python 996 | airbnb['neighborhood'].value_counts() 997 | ``` 998 | 999 | 1000 | 1001 | 1002 | Columbia Heights 773 1003 | Union Station 713 1004 | Capitol Hill 654 1005 | Edgewood 610 1006 | Dupont Circle 549 1007 | Shaw 514 1008 | Brightwood Park 406 1009 | Kalorama Heights 362 1010 | Name: neighborhood, dtype: int64 1011 | 1012 | 1013 | 1014 | ### Relative frequency with `normalize` 1015 | 1016 | Instead of the raw counts, get the relative frequency by setting normalize to `True`. 1017 | 1018 | 1019 | ```python 1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True) 1021 | ``` 1022 | 1023 | 1024 | 1025 | 1026 | ![png](images/output_92_0.png) 1027 | 1028 | 1029 | 1030 | Here, we split by property type. 1031 | 1032 | 1033 | ```python 1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type') 1035 | ``` 1036 | 1037 | 1038 | 1039 | 1040 | ![png](images/output_94_0.png) 1041 | 1042 | 1043 | 1044 | In pandas, this is done with the `crosstab` function. 1045 | 1046 | 1047 | ```python 1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood']) 1049 | ``` 1050 | 1051 | 1052 | 1053 | 1054 |
1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 | 1106 | 1107 | 1108 | 1109 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | 1116 | 1117 | 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1126 |
neighborhoodBrightwood ParkCapitol HillColumbia HeightsDupont CircleEdgewoodKalorama HeightsShawUnion Station
property_type
Apartment167299374397244284315323
Condominium3570976265425254
House131137157471462361175
Townhouse73148145431551386161
1127 |
1128 | 1129 | 1130 | 1131 | Horizontal stacked count plots. 1132 | 1133 | 1134 | ```python 1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type', 1136 | orientation='h', stacked=True, col='superhost') 1137 | ``` 1138 | 1139 | 1140 | 1141 | 1142 | ![png](images/output_98_0.png) 1143 | 1144 | 1145 | 1146 | ### Normalize over different variables 1147 | 1148 | Setting `normalize` to `True`, returns the relative frequency with respect to all of the data. You can normalize over any of the variables provided. 1149 | 1150 | 1151 | ```python 1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood', 1153 | title='Relative Frequency by Neighborhood') 1154 | ``` 1155 | 1156 | 1157 | 1158 | 1159 | ![png](images/output_100_0.png) 1160 | 1161 | 1162 | 1163 | Normalize over several variables at once with a list. 1164 | 1165 | 1166 | ```python 1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost', 1168 | row='property_type', col='bedrooms', col_order=[1, 2], 1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True) 1170 | ``` 1171 | 1172 | 1173 | 1174 | 1175 | ![png](images/output_102_0.png) 1176 | 1177 | 1178 | 1179 | ## Wide data 1180 | 1181 | Dexplot can also plot wide data, or data where no aggregation happens. Here is a scatter plot of the location of each listing. 1182 | 1183 | 1184 | ```python 1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb, 1186 | split='neighborhood', col='bedrooms', col_order=[2, 3]) 1187 | ``` 1188 | 1189 | 1190 | 1191 | 1192 | ![png](images/output_104_0.png) 1193 | 1194 | 1195 | 1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`. 1197 | 1198 | 1199 | ```python 1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type', 1201 | values='price', aggfunc='mean') 1202 | df 1203 | ``` 1204 | 1205 | 1206 | 1207 | 1208 |
1209 | 1210 | 1211 | 1212 | 1213 | 1214 | 1215 | 1216 | 1217 | 1218 | 1219 | 1220 | 1221 | 1222 | 1223 | 1224 | 1225 | 1226 | 1227 | 1228 | 1229 | 1230 | 1231 | 1232 | 1233 | 1234 | 1235 | 1236 | 1237 | 1238 | 1239 | 1240 | 1241 | 1242 | 1243 | 1244 | 1245 | 1246 | 1247 | 1248 | 1249 | 1250 | 1251 | 1252 | 1253 | 1254 | 1255 | 1256 | 1257 | 1258 | 1259 | 1260 | 1261 | 1262 | 1263 | 1264 | 1265 | 1266 | 1267 | 1268 | 1269 | 1270 | 1271 | 1272 | 1273 | 1274 | 1275 | 1276 | 1277 | 1278 | 1279 | 1280 | 1281 | 1282 | 1283 | 1284 | 1285 |
property_typeApartmentCondominiumHouseTownhouse
neighborhood
Brightwood Park96.119760105.000000121.671756133.479452
Capitol Hill141.210702104.200000170.153285184.459459
Columbia Heights114.676471126.773196135.292994124.358621
Dupont Circle146.858942130.709677179.574468139.348837
Edgewood108.508197112.846154156.335616147.503226
Kalorama Heights122.542254155.92857192.695652158.230769
Shaw153.888889158.500000202.114754173.279070
Union Station128.458204133.833333162.748571162.167702
1286 |
1287 | 1288 | 1289 | 1290 | 1291 | ```python 1292 | dxp.bar(data=df, orientation='h') 1293 | ``` 1294 | 1295 | 1296 | 1297 | 1298 | ![png](images/output_107_0.png) 1299 | 1300 | 1301 | 1302 | ### Time series 1303 | 1304 | 1305 | ```python 1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date') 1307 | stocks.head() 1308 | ``` 1309 | 1310 | 1311 | 1312 | 1313 |
1314 | 1315 | 1316 | 1317 | 1318 | 1319 | 1320 | 1321 | 1322 | 1323 | 1324 | 1325 | 1326 | 1327 | 1328 | 1329 | 1330 | 1331 | 1332 | 1333 | 1334 | 1335 | 1336 | 1337 | 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 1344 | 1345 | 1346 | 1347 | 1348 | 1349 | 1350 | 1351 | 1352 | 1353 | 1354 | 1355 | 1356 | 1357 | 1358 | 1359 | 1360 | 1361 | 1362 | 1363 | 1364 | 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | 1371 | 1372 | 1373 | 1374 | 1375 | 1376 | 1377 | 1378 | 1379 | 1380 | 1381 | 1382 | 1383 | 1384 | 1385 | 1386 | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | 1399 | 1400 | 1401 | 1402 | 1403 | 1404 | 1405 | 1406 | 1407 | 1408 | 1409 | 1410 |
MSFTAAPLSLBAMZNTSLAXOMWMTTFBV
date
1999-10-2529.842.3217.0282.75NaN21.4538.9916.78NaNNaN
1999-10-2629.822.3416.6581.25NaN20.8937.1117.28NaNNaN
1999-10-2729.332.3816.5275.94NaN20.8036.9418.27NaNNaN
1999-10-2829.012.4316.5971.00NaN21.1938.8519.79NaNNaN
1999-10-2929.882.5017.2170.62NaN21.4739.2520.00NaNNaN
1411 |
1412 | 1413 | 1414 | 1415 | 1416 | ```python 1417 | dxp.line(data=stocks.head(500)) 1418 | ``` 1419 | 1420 | 1421 | 1422 | 1423 | ![png](images/output_110_0.png) 1424 | 1425 | 1426 | -------------------------------------------------------------------------------- /dexplot/_common_plot.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | import warnings 3 | from collections import defaultdict 4 | import io 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | from matplotlib import ticker 10 | from matplotlib.colors import Colormap 11 | 12 | 13 | NONETYPE = type(None) 14 | 15 | class CommonPlot: 16 | 17 | 18 | def __init__(self, x, y, data, aggfunc, split, row, col, 19 | x_order, y_order, split_order, row_order, col_order, 20 | orientation, sort_values, wrap, figsize, title, sharex, sharey, 21 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 22 | x_textwrap, y_textwrap, x_rot, y_rot, 23 | check_numeric=False, kind=None): 24 | 25 | self.used_columns = set() 26 | self.data = self.get_data(data) 27 | self.x = self.get_col(x) 28 | self.y = self.get_col(y) 29 | self.validate_x_y() 30 | self.orientation = orientation 31 | self.aggfunc = self.get_aggfunc(aggfunc) 32 | self.groupby = self.get_groupby() 33 | self.split = self.get_col(split) 34 | self.row = self.get_col(row) 35 | self.col = self.get_col(col) 36 | 37 | self.agg = self.set_agg() 38 | self.make_groups_categorical(kind) 39 | self.validate_numeric(check_numeric) 40 | 41 | self.x_order = self.validate_order(x_order, 'x') 42 | self.y_order = self.validate_order(y_order, 'y') 43 | self.split_order = self.validate_order(split_order, 'split') 44 | self.row_order = self.validate_order(row_order, 'row') 45 | self.col_order = self.validate_order(col_order, 'col') 46 | self.filter_data() 47 | self.groupby_order = self.get_groupby_order() 48 | 49 | self.sort_values = sort_values 50 | self.groupby_sort = True 51 | self.wrap = wrap 52 | self.figsize = figsize 53 | self.title = title 54 | self.sharex = sharex 55 | self.sharey = sharey 56 | self.xlabel = xlabel 57 | self.ylabel = ylabel 58 | self.xlim = xlim 59 | self.ylim = ylim 60 | self.xscale = xscale 61 | self.yscale = yscale 62 | self.colors = self.get_colors(cmap) 63 | self.x_textwrap = x_textwrap 64 | self.y_textwrap = y_textwrap 65 | self.x_rot = x_rot 66 | self.y_rot = y_rot 67 | 68 | self.validate_args() 69 | self.plot_type = self.get_plot_type() 70 | self.agg_kind = self.get_agg_kind() 71 | self.data = self.set_index() 72 | self.rows, self.cols = self.get_uniques() 73 | self.rows, self.cols = self.get_row_col_order() 74 | self.fig_shape = self.get_fig_shape() 75 | 76 | 77 | def get_data(self, data): 78 | if isinstance(data, pd.Series): 79 | return data.to_frame() 80 | 81 | if not isinstance(data, pd.DataFrame): 82 | raise TypeError('`data` must be a pandas DataFrame or Series') 83 | elif len(data) == 0: 84 | raise ValueError('DataFrame contains no data') 85 | return data.copy() 86 | 87 | def get_col(self, col, group=False): 88 | if col: 89 | try: 90 | col in self.data.columns 91 | except KeyError: 92 | raise KeyError(f'{col} is not a column in the DataFrame') 93 | 94 | if col in self.used_columns: 95 | raise ValueError(f'Column {col} has already been chosen. ' 96 | '`x`, `y`, `split`, `row`, and `col` must all be unique.') 97 | self.used_columns.add(col) 98 | return col 99 | 100 | def validate_x_y(self): 101 | if self.x == self.y and self.x is not None and self.y is not None: 102 | raise ValueError('`x` and `y` cannot be the same column name') 103 | 104 | def get_aggfunc(self, aggfunc): 105 | if aggfunc == 'countna': 106 | return lambda x: x.isna().sum() 107 | if aggfunc == 'percna': 108 | return lambda x: x.isna().mean() 109 | return aggfunc 110 | 111 | def get_groupby(self): 112 | if self.x is None or self.y is None or self.aggfunc is None: 113 | return 114 | return self.x if self.orientation == 'v' else self.y 115 | 116 | def set_agg(self): 117 | return self.y if self.orientation == 'v' else self.x 118 | 119 | def filter_data(self): 120 | params = 'x', 'y', 'split', 'row', 'col' 121 | for param in params: 122 | name, order = getattr(self, param), getattr(self, param + '_order') 123 | if name and order: 124 | s = self.data[name] 125 | if isinstance(order, list): 126 | if s.dtype.kind == 'O': 127 | for val in order: 128 | if not (s == val).any(): 129 | raise ValueError(f'Value {val} is not in column {name}') 130 | self.data = self.data[s.isin(order)] 131 | else: 132 | # allow datetimes? 133 | if len(order) != 2: 134 | raise ValueError(f'You are filtering {name}. Provide a two-item list ' 135 | 'of the min and max values') 136 | self.data = self.data[s.between(*order)] 137 | elif isinstance(order, int): 138 | vc = s.value_counts() 139 | if order > 0: 140 | idx = vc.index[:order] 141 | else: 142 | idx = vc.index[order:] 143 | self.data = self.data[s.isin(idx)] 144 | setattr(self, param +'_order', idx.tolist()) 145 | 146 | if name and self.data[name].dtype.name == 'category': 147 | self.data[name].cat.remove_unused_categories(inplace=True) 148 | 149 | def make_groups_categorical(self, kind): 150 | category_cols = [self.groupby, self.split, self.row, self.col] 151 | for col in category_cols: 152 | if col: 153 | if self.data[col].dtype.name != 'category': 154 | self.data[col] = self.data[col].astype('category') 155 | if kind == 'count': 156 | col = self.x or self.y 157 | if self.data[col].dtype.name != 'category': 158 | self.data[col] = self.data[col].astype('category') 159 | 160 | def validate_numeric(self, check_numeric): 161 | if check_numeric: 162 | for val in (self.x, self.y): 163 | if val and self.data[val].dtype.kind not in ('i', 'f', 'b'): 164 | raise TypeError(f'Column {val} must be numeric (integer or float)') 165 | 166 | def validate_order(self, order, kind): 167 | if isinstance(order, str): 168 | order = order.strip().lower() 169 | if order in ('asc', 'desc'): 170 | return order 171 | command = order.split() 172 | if len(command) != 2 or command[0] not in ('top', 'bottom'): 173 | raise ValueError(f'{kind}_order string must begin with either "asc"/"desc" OR ' 174 | ' "top" or "bottom" followed by a space and then an integer.') 175 | mult = int(command[0] == "top") * 2 - 1 176 | try: 177 | num = int(command[1]) 178 | except ValueError: 179 | raise ValueError(f'{command[1]} is not a valid integer') 180 | if num == 0: 181 | raise ValueError('Number cannot be 0') 182 | return num * mult 183 | elif isinstance(order, (tuple, list)): 184 | return list(order) 185 | elif hasattr(order, 'tolist'): 186 | return order.tolist() 187 | elif order is not None: 188 | raise TypeError(f'{kind}_order must be a str or tuple/list/array/series.') 189 | 190 | def get_groupby_order(self): 191 | if self.x == self.groupby: 192 | return self.x_order 193 | if self.y == self.groupby: 194 | return self.y_order 195 | 196 | def get_colors(self, cmap): 197 | if cmap is None: 198 | cmap = 't10' 199 | 200 | if isinstance(cmap, str): 201 | from .colors._colormaps import colormaps 202 | try: 203 | return colormaps[cmap.lower()] 204 | except KeyError: 205 | raise KeyError(f'Colormap {cmap} does not exist. Here are the ' 206 | f'possible colormaps: {colormaps.keys()}') 207 | elif isinstance(cmap, Colormap): 208 | return cmap(range(cmap.N)).tolist() 209 | elif isinstance(cmap, list): 210 | return cmap 211 | elif isinstance(cmap, tuple): 212 | return list(cmap) 213 | elif hasattr(cmap, 'tolist'): 214 | return cmap.tolist() 215 | else: 216 | raise TypeError('`cmap` must be a string name of a colormap, a matplotlib colormap ' 217 | 'instance, list, or tuple of colors') 218 | 219 | def validate_args(self): 220 | self.validate_plot_args() 221 | self.validate_mpl_args() 222 | self.validate_sort_values() 223 | 224 | def validate_plot_args(self): 225 | if self.orientation not in ('v', 'h'): 226 | raise ValueError('`orientation` must be either "v" or "h".') 227 | 228 | if not isinstance(self.wrap, (np.integer, int, NONETYPE)): 229 | raise TypeError(f'`wrap` must either be None or an integer, not {type(wrap)}') 230 | 231 | if self.row and self.col and self.wrap is not None: 232 | raise ValueError('You cannot provide a value for `wrap` if `row` ' 233 | 'and `col` are also provided') 234 | 235 | def validate_mpl_args(self): 236 | if not isinstance(self.title, (NONETYPE, str)): 237 | raise TypeError('`title` must be either None or a str') 238 | if self.sharex not in (False, True, None, 'row', 'col'): 239 | raise ValueError('`sharex` must be one of `False`, `True`, `None`, "row", or "col"') 240 | if self.sharey not in (False, True, None, 'row', 'col'): 241 | raise ValueError('`sharex` must be one of `False`, `True`, `None`, "row", or "col"') 242 | 243 | if not isinstance(self.xlabel, (NONETYPE, str)): 244 | raise TypeError('`xlabel` must be either None or a str') 245 | if not isinstance(self.ylabel, (NONETYPE, str)): 246 | raise TypeError('`ylabel` must be either None or a str') 247 | 248 | if not isinstance(self.xlim, (NONETYPE, tuple)): 249 | raise TypeError('`xlim` must be a two-item tuple of numerics or `None`') 250 | if not isinstance(self.ylim, (NONETYPE, tuple)): 251 | raise TypeError('`xlim` must be a two-item tuple of numerics or `None`') 252 | if self.xscale not in ('linear', 'log', 'symlog', 'logit'): 253 | raise ValueError("`xscale must be one of 'linear', 'log', 'symlog', 'logit'") 254 | if self.yscale not in ('linear', 'log', 'symlog', 'logit'): 255 | raise ValueError("`xscale must be one of 'linear', 'log', 'symlog', 'logit'") 256 | 257 | def validate_sort_values(self): 258 | if self.sort_values not in ['asc', 'desc', None]: 259 | raise ValueError('`sort_values` must be one of "asc", "desc", or `None`') 260 | if self.sort_values and (self.split or self.row or self.col): 261 | raise ValueError('Can only use `sort_values` if `split`, `row`, and `col` are `None`.') 262 | 263 | def get_plot_type(self): 264 | if self.row and self.col: 265 | return 'square' 266 | if self.row: 267 | return 'row_only' 268 | if self.col: 269 | return 'col_only' 270 | return 'single' 271 | 272 | def get_agg_kind(self): 273 | if self.agg: 274 | # string and category use 'O' 275 | agg_kind = self.data[self.agg].dtype.kind 276 | return agg_kind 277 | 278 | def set_index(self): 279 | data = self.data 280 | rc = [] 281 | if self.row: 282 | rc.append(self.row) 283 | if self.col: 284 | rc.append(self.col) 285 | if rc: 286 | data = data.set_index(rc) 287 | return data 288 | 289 | def get_uniques(self): 290 | if self.plot_type == 'single': 291 | return None, None 292 | elif self.plot_type == 'row_only': 293 | return self.data.index.unique(), None 294 | elif self.plot_type == 'col_only': 295 | return None, self.data.index.unique() 296 | else: 297 | return self.data.index.levels 298 | 299 | def get_row_col_order(self): 300 | rows, cols = self.rows, self.cols 301 | if rows is not None: 302 | if self.row_order == 'desc': 303 | rows = sorted(rows, reverse=True) 304 | else: 305 | rows = sorted(rows) 306 | if cols is not None: 307 | if self.col_order == 'desc': 308 | cols = sorted(cols, reverse=True) 309 | else: 310 | cols = sorted(cols) 311 | 312 | if isinstance(self.row_order, list): 313 | new_rows = [] 314 | for row in self.row_order: 315 | if row not in rows: 316 | raise ValueError(f'Row value {row} does not exist') 317 | new_rows.append(row) 318 | rows = new_rows 319 | if isinstance(self.col_order, list): 320 | new_cols = [] 321 | for col in self.col_order: 322 | if col not in cols: 323 | raise ValueError(f'Column value {col} does not exist') 324 | new_cols.append(col) 325 | cols = new_cols 326 | return rows, cols 327 | 328 | def get_fig_shape(self): 329 | if self.plot_type == 'single': 330 | return 1, 1 331 | 332 | nrows = ncols = 1 333 | if self.rows is not None: 334 | nrows = len(self.rows) 335 | if self.cols is not None: 336 | ncols = len(self.cols) 337 | 338 | if self.wrap: 339 | if self.plot_type == 'row_only': 340 | ncols = (nrows - 1) // self.wrap + 1 341 | nrows = min(nrows, self.wrap) 342 | elif self.plot_type == 'col_only': 343 | nrows = (ncols - 1) // self.wrap + 1 344 | ncols = min(ncols, self.wrap) 345 | return nrows, ncols 346 | 347 | def get_data_for_every_plot(self): 348 | # TODO: catch keyerror for groups that dont exist 349 | rows, cols = self.get_row_col_order() 350 | if self.plot_type == 'row_only': 351 | return [(row, self.data.loc[row]) for row in rows] 352 | if self.plot_type in ('row_only', 'col_only'): 353 | return [(col, self.data.loc[col]) for col in cols] 354 | elif self.plot_type == 'square': 355 | groups = [] 356 | for col in cols: 357 | for row in rows: 358 | group = row, col 359 | try: 360 | with warnings.catch_warnings(): 361 | warnings.simplefilter("ignore") 362 | data = self.data.loc[group] 363 | except (KeyError, TypeError): 364 | data = self.data.iloc[:0] 365 | groups.append((group, data)) 366 | return groups 367 | else: 368 | return [(None, self.data)] 369 | 370 | def get_labels(self, labels): 371 | # this won't work for wrapping 372 | if self.plot_type == 'square': 373 | return str(labels[0]), str(labels[1]) 374 | elif self.plot_type == 'row_only': 375 | return str(labels), None 376 | elif self.plot_type == 'col_only': 377 | return None, str(labels) 378 | return None, None 379 | 380 | def sort_values_xy(self, x, y): 381 | grp, num = (x, y) if self.orientation == 'v' else (y, x) 382 | if self.sort_values is None: 383 | return x, y 384 | elif self.sort_values == 'asc': 385 | order = np.lexsort([grp, num]) 386 | else: 387 | order = np.lexsort([grp, -num]) 388 | if self.orientation == 'h': 389 | order = order[::-1] 390 | return x[order], y[order] 391 | 392 | def get_order(self, arr, vals): 393 | arr = arr.tolist() 394 | order = [] 395 | for val in vals: 396 | try: 397 | idx = arr.index(val) 398 | except ValueError: 399 | raise ValueError(f'{val} is not a valid column value') 400 | order.append(idx) 401 | return order 402 | 403 | def reverse_order(self, order): 404 | cond1 = order == 'desc' and self.orientation == 'v' 405 | cond2 = order in ('asc', None) and self.orientation == 'h' 406 | return cond1 or cond2 407 | 408 | def order_xy(self, x, y): 409 | if self.x_order and self.x != self.agg: 410 | if isinstance(self.x_order, list): 411 | order = self.get_order(x, self.x_order) 412 | elif self.reverse_order(self.x_order): 413 | order = np.lexsort([x])[::-1] 414 | else: 415 | return x, y 416 | elif self.y_order and self.y != self.agg: 417 | if isinstance(self.y_order, list): 418 | order = self.get_order(y, self.y_order) 419 | elif self.reverse_order(self.y_order): 420 | order = np.lexsort([y])[::-1] 421 | else: 422 | return x, y 423 | else: 424 | return x, y 425 | return x[order], y[order] 426 | 427 | def get_correct_data_order(self, x, y): 428 | x, y = self.sort_values_xy(x, y) 429 | if self.sort_values is None: 430 | x, y = self.order_xy(x, y) 431 | return x, y 432 | 433 | def get_wide_data(self, data): 434 | x = data.index.values 435 | y = {col: data[col].values for col in data.columns} 436 | if self.orientation == 'h': 437 | x, y = y, x 438 | return x, y 439 | 440 | def get_wide_columns(self, data): 441 | cols = [] 442 | used_cols = [self.groupby, self.split, self.row, self.col] 443 | for col in data.columns: 444 | if col not in used_cols: 445 | cols.append(col) 446 | return cols 447 | 448 | def get_ordered_groups(self, data, specific_order, kind): 449 | # used for split and groupby groups 450 | order = [] 451 | groups = [] 452 | sort = specific_order is not None 453 | # TODO: Need to decide defaults for x_order, y_order etc... either None or 'asc' 454 | for grp, data_grp in data.groupby(getattr(self, kind), sort=True): 455 | order.append((grp, data_grp)) 456 | groups.append(grp) 457 | 458 | if isinstance(specific_order, list): 459 | new_order = [] 460 | for grp in specific_order: 461 | try: 462 | idx = groups.index(grp) 463 | except ValueError: 464 | col = getattr(self, kind) 465 | raise ValueError(f'Value "{grp}" from `{kind}_order` is ' 466 | f'not in column {col}') 467 | 468 | new_order.append(idx) 469 | order = [order[i] for i in new_order] 470 | elif specific_order == 'desc': 471 | new_order = np.lexsort([groups])[::-1] 472 | order = [order[i] for i in new_order] 473 | 474 | return order 475 | 476 | def get_final_groups(self, data, split_label, row_label, col_label): 477 | groups = [] 478 | if self.aggfunc == '__distribution__': 479 | if self.groupby is not None: 480 | for grp, data_grp in self.get_ordered_groups(data, self.groupby_order, 'groupby'): 481 | vals = data_grp[self.agg] 482 | groups.append((vals, split_label, grp, row_label, col_label)) 483 | else: 484 | col = self.x or self.y 485 | vals = data[col] 486 | groups.append((vals, split_label, self.col, row_label, col_label)) 487 | elif self.groupby is not None: 488 | try: 489 | s = data.groupby(self.groupby, sort=self.groupby_sort)[self.agg].agg(self.aggfunc) 490 | except Exception as e: 491 | if type(e).__name__ == 'DataError': 492 | raise ValueError(f'The aggregating column {self.agg} is not numeric and ' 493 | f'cannot be aggregated with {self.aggfunc}. You might need ' 494 | 'to switch x and y') 495 | else: 496 | raise e 497 | x, y = s.index.values, s.values 498 | x, y = (x, y) if self.orientation == 'v' else (y, x) 499 | x, y = self.get_correct_data_order(x, y) 500 | groups.append((x, y, split_label, self.groupby, row_label, col_label)) 501 | elif self.x is None or self.y is None: 502 | if self.x: 503 | s = data[self.x] 504 | x, y = s.values, s.index.values 505 | x, y = self.get_correct_data_order(x, y) 506 | groups.append((x, y, split_label, self.x, row_label, col_label)) 507 | elif self.y: 508 | s = data[self.y] 509 | x, y = s.index.values, s.values 510 | x, y = self.get_correct_data_order(x, y) 511 | groups.append((x, y, split_label, self.y, row_label, col_label)) 512 | else: 513 | # wide data 514 | for col in self.get_wide_columns(data): 515 | s = data[col] 516 | x, y = s.index.values, s.values 517 | x, y = self.get_correct_data_order(x, y) 518 | x, y = (x, y) if self.orientation == 'v' else (y, x) 519 | groups.append((x, y, col, None, row_label, col_label)) 520 | else: 521 | # simple raw plot - make sure to warn when lots of data for bar/box/hist 522 | # one graph per row - OK for scatterplots and line plots 523 | x, y = self.get_correct_data_order(data[self.x], data[self.y]) 524 | groups.append((x, y, split_label, None, row_label, col_label)) 525 | return groups 526 | 527 | def get_x_y_plot(self, x, y): 528 | x_plot, y_plot = x, y 529 | if x_plot.dtype.kind == 'O': 530 | x_plot = np.arange(len(x_plot)) 531 | if y_plot.dtype.kind == 'O': 532 | y_plot = np.arange(len(y_plot)) 533 | return x_plot, y_plot 534 | 535 | def get_distribution_data(self, info): 536 | cur_data = defaultdict(list) 537 | cur_ticklabels = defaultdict(list) 538 | for vals, split_label, col_name, row_label, col_label in info: 539 | cur_data[split_label].append(vals) 540 | cur_ticklabels[split_label].append(col_name) 541 | return cur_data, cur_ticklabels 542 | 543 | 544 | class MPLCommon(CommonPlot): 545 | 546 | def __init__(self, x, y, data, aggfunc, split, row, col, 547 | x_order, y_order, split_order, row_order, col_order, 548 | orientation, sort_values, wrap, figsize, title, sharex, sharey, 549 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 550 | x_textwrap, y_textwrap, x_rot, y_rot, 551 | check_numeric=False, kind=None): 552 | super().__init__(x, y, data, aggfunc, split, row, col, 553 | x_order, y_order, split_order, row_order, col_order, 554 | orientation, sort_values, wrap, figsize, title, sharex, sharey, 555 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 556 | x_textwrap, y_textwrap, x_rot, y_rot, 557 | check_numeric=False, kind=None) 558 | self.figsize = self.get_figsize() 559 | self.user_figsize = self.figsize is not None 560 | self.original_rcParams = plt.rcParams.copy() 561 | self.set_rcParams() 562 | self.fig, self.axs = self.create_figure() 563 | self.set_color_cycle() 564 | self.data_for_plots = self.get_data_for_every_plot() 565 | self.final_data = self.get_final_data() 566 | self.style_fig() 567 | self.add_ax_titles() 568 | self.add_fig_title() 569 | 570 | def get_figsize(self): 571 | if self.figsize is None: 572 | return 573 | elif isinstance(self.figsize, (list, tuple)): 574 | if len(self.figsize) != 2: 575 | raise ValueError('figsize must be a two-item tuple/list') 576 | for val in self.figsize: 577 | if not isinstance(val, (int, float)): 578 | raise ValueError('Each item in figsize must be an integer or a float') 579 | else: 580 | raise TypeError('figsize must be a two-item tuple') 581 | 582 | return self.fig_shape[1] * 4, self.fig_shape[0] * 3 583 | 584 | def create_figure(self): 585 | fig = plt.Figure(tight_layout=True, dpi=144, figsize=self.figsize) 586 | axs = fig.subplots(*self.fig_shape, sharex=self.sharex, sharey=self.sharey) 587 | if self.fig_shape != (1, 1): 588 | axs = axs.flatten(order='F') 589 | else: 590 | axs = [axs] 591 | return fig, axs 592 | 593 | def set_color_cycle(self): 594 | for ax in self.axs: 595 | ax.set_prop_cycle(color=self.colors) 596 | 597 | def get_final_data(self): 598 | # create list of data for each call to plotting method 599 | final_data = defaultdict(list) 600 | for (labels, data), ax in zip(self.data_for_plots, self.axs): 601 | row_label, col_label = self.get_labels(labels) 602 | if self.split: 603 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'): 604 | final_data[ax].extend(self.get_final_groups(data_grp, grp, row_label, col_label)) 605 | else: 606 | final_data[ax].extend(self.get_final_groups(data, None, row_label, col_label)) 607 | return final_data 608 | 609 | def style_fig(self): 610 | for ax in self.axs: 611 | ax.tick_params(length=0) 612 | ax.set_facecolor('.9') 613 | ax.grid(True) 614 | ax.set_axisbelow(True) 615 | for spine in ax.spines.values(): 616 | spine.set_visible(False) 617 | 618 | def add_x_y_labels(self): 619 | if self.plot_type == 'single': 620 | self.axs[0].set_xlabel(self.x) 621 | self.axs[0].set_ylabel(self.y) 622 | return 623 | 624 | # need to eliminate next line to save lots of time 625 | self.fig.canvas.print_figure(io.BytesIO()) 626 | rows, cols = self.fig_shape 627 | top_left_ax, bottom_right_ax = self.axs[0], self.axs[rows * cols - 1] 628 | top_left_points = top_left_ax.get_position().get_points() 629 | bottom_right_points = bottom_right_ax.get_position().get_points() 630 | 631 | left = top_left_points[0][0] 632 | right = bottom_right_points[1][0] 633 | x = (right + left) / 2 634 | 635 | top = top_left_points[1][1] 636 | bottom = bottom_right_points[0][1] 637 | y = (top + bottom) / 2 638 | self.fig.text(0, y, self.y, rotation=90, ha='center', va='center', size='larger') 639 | self.fig.text(x, 0, self.x, ha='center', va='center', size='larger') 640 | 641 | def add_ax_titles(self): 642 | for ax, info in self.final_data.items(): 643 | row_label, col_label = info[0][-2:] 644 | if row_label is not None: 645 | row_label = str(row_label) 646 | if col_label is not None: 647 | col_label = str(col_label) 648 | row_label = row_label or '' 649 | col_label = col_label or '' 650 | if row_label and col_label: 651 | title = row_label + ' - ' + col_label 652 | else: 653 | title = row_label or col_label 654 | title = textwrap.fill(str(title), 30) 655 | ax.set_title(title) 656 | 657 | def set_rcParams(self): 658 | plt.rcParams['font.size'] = 6 659 | plt.rcParams['font.family'] = 'Helvetica' 660 | 661 | def add_ticklabels(self, labels, ax, delta=0): 662 | ticks = np.arange(len(labels)) 663 | ha, va = 'center', 'center' 664 | if self.orientation == 'v': 665 | if self.x_textwrap: 666 | labels = [textwrap.fill(str(label), self.x_textwrap) for label in labels] 667 | ax.set_xticks(ticks + delta) 668 | if self.x_rot is not None: 669 | if 0 <= self.x_rot <= 180: 670 | ha = 'right' 671 | else: 672 | ha = 'left' 673 | ax.set_xticklabels(labels, rotation=self.x_rot, ha=ha) 674 | else: 675 | if self.y_textwrap: 676 | labels = [textwrap.fill(str(label), self.y_textwrap) for label in labels] 677 | ax.set_yticks(ticks - delta) 678 | if self.y_rot is not None: 679 | if 0 <= self.y_rot <= 180: 680 | va = 'top' 681 | else: 682 | va = 'bottom' 683 | ax.set_yticklabels(labels, rotation=self.y_rot, va=va) 684 | 685 | def add_legend(self, label=None, handles=None, labels=None): 686 | if label is not None: 687 | if handles is None: 688 | handles, labels = self.axs[0].get_legend_handles_labels() 689 | ncol = len(labels) // 8 + 1 690 | self.fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.01, .8), 691 | title=self.split, ncol=ncol) 692 | 693 | def clean_up(self): 694 | self.add_x_y_labels() 695 | plt.rcParams = self.original_rcParams 696 | return self.fig 697 | 698 | def update_fig_size(self, n_splits, n_groups_per_split): 699 | if self.user_figsize: 700 | return 701 | c1 = .3 if self.orientation == 'v' else .2 702 | c2 = .06 if self.orientation == 'v' else .04 703 | new_size = 1.8 + (c1 + c2 * n_splits) * n_groups_per_split 704 | if self.orientation == 'v': 705 | height = max(2.5 - .3 * self.fig_shape[0], 1.2) 706 | shrink = max(.9 - .1 * self.fig_shape[1], .5) 707 | width = new_size * shrink * self.fig_shape[1] 708 | height = height * self.fig_shape[0] 709 | else: 710 | width = max(3 - .3 * self.fig_shape[1], 1.5) 711 | height = new_size * .8 * self.fig_shape[0] 712 | width = width * self.fig_shape[1] 713 | width, height = min(width, 25), min(height, 25) 714 | self.fig.set_size_inches(width, height) 715 | 716 | def add_fig_title(self): 717 | self.fig.suptitle(self.title, y=1.02) 718 | 719 | 720 | import plotly.graph_objects as go 721 | from plotly.subplots import make_subplots 722 | 723 | 724 | class PlotlyCommon(CommonPlot): 725 | 726 | def __init__(self, x, y, data, aggfunc, split, row, col, 727 | x_order, y_order, split_order, row_order, col_order, 728 | orientation, sort_values, wrap, figsize, title, sharex, sharey, 729 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 730 | x_textwrap, y_textwrap, x_rot, y_rot, 731 | check_numeric=False, kind=None): 732 | super().__init__(x, y, data, aggfunc, split, row, col, 733 | x_order, y_order, split_order, row_order, col_order, 734 | orientation, sort_values, wrap, figsize, title, sharex, sharey, 735 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap, 736 | x_textwrap, y_textwrap, x_rot, y_rot, 737 | check_numeric=False, kind=None) 738 | 739 | self.data_for_plots = self.get_data_for_every_plot() 740 | self.final_data = self.get_final_data() 741 | self.fig = self.create_figure() 742 | 743 | def create_figure(self): 744 | titles = self.get_subplot_titles() 745 | fig = make_subplots(rows=self.fig_shape[0], cols=self.fig_shape[1], subplot_titles=titles, 746 | shared_xaxes=self.sharex, shared_yaxes=self.sharey, 747 | horizontal_spacing=.03) 748 | fig.update_layout(title_text=self.title, legend_title_text=self.split) 749 | return fig 750 | 751 | def get_final_data(self): 752 | # create list of data for each call to plotting method 753 | final_data = defaultdict(list) 754 | locs = [] 755 | for i in range(self.fig_shape[0]): 756 | for j in range(self.fig_shape[1]): 757 | locs.append((i + 1, j + 1)) 758 | 759 | for (labels, data), loc in zip(self.data_for_plots, locs): 760 | row_label, col_label = self.get_labels(labels) 761 | if self.split: 762 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'): 763 | final_data[loc].extend(self.get_final_groups(data_grp, grp, row_label, col_label)) 764 | else: 765 | final_data[loc].extend(self.get_final_groups(data, None, row_label, col_label)) 766 | return final_data 767 | 768 | def get_subplot_titles(self): 769 | titles = [] 770 | for (i, j), info in self.final_data.items(): 771 | row_label, col_label = info[0][-2:] 772 | if row_label is not None: 773 | row_label = str(row_label) 774 | if col_label is not None: 775 | col_label = str(col_label) 776 | row_label = row_label or '' 777 | col_label = col_label or '' 778 | if row_label and col_label: 779 | title = row_label + ' - ' + col_label 780 | else: 781 | title = row_label or col_label 782 | title = textwrap.fill(str(title), 30) 783 | titles.append(title) 784 | return titles 785 | 786 | 787 | class CountCommon(CommonPlot): 788 | 789 | def get_count_dict(self, normalize): 790 | count_dict = {} 791 | 792 | if isinstance(normalize, str): 793 | if normalize in (val, self.split, self.row, self.col): 794 | normalize = [normalize] 795 | 796 | if isinstance(normalize, tuple): 797 | normalize = list(normalize) 798 | elif hasattr(normalize, 'tolist'): 799 | normalize = normalize.tolist() 800 | elif not isinstance(normalize, (bool, list)): 801 | raise ValueError('`normalize` must either be `True`/`False`, one of the columns passed ' 802 | 'to `val`, `split`, `row` or `col`, or a list of ' 803 | 'those columns') 804 | normalize_kind = None 805 | if isinstance(normalize, list): 806 | row_col = [] 807 | val_split = [] 808 | for col in normalize: 809 | if col in (self.row, self.col): 810 | row_col.append(col) 811 | elif col in (val, self.split): 812 | val_split.append(col) 813 | else: 814 | raise ValueError('Columns passed to `normalize` must be the same as ' 815 | ' `val`, `split`, `row` or `col`.') 816 | 817 | if row_col: 818 | all_counts = {} 819 | for grp, data in self.data.groupby(row_col): 820 | if len(row_col) == 1: 821 | grp = str(grp) 822 | else: 823 | grp = tuple(str(g) for g in grp) 824 | 825 | if val_split: 826 | normalize_kind = 'all' 827 | all_counts[grp] = data.groupby(val_split).size() 828 | else: 829 | normalize_kind = 'grid' 830 | all_counts[grp] = len(data) 831 | else: 832 | normalize_kind = 'single' 833 | all_counts = self.data.groupby(val_split).size() 834 | 835 | n = 0 836 | for key, info in self.final_data.items(): 837 | columns = [] 838 | vcs = [] 839 | for vals, split_label, col_name, row_label, col_label in info: 840 | vcs.append(vals.value_counts()) 841 | columns.append(split_label) 842 | 843 | df = pd.concat(vcs, axis=1) 844 | df.columns = columns 845 | df.index.name = vals.name 846 | if normalize_kind == 'single': 847 | if len(val_split) == 2: 848 | df = df / all_counts.unstack(self.split) 849 | elif df.index.name == all_counts.index.name: 850 | df = df.div(all_counts, axis=0) 851 | else: 852 | df = df / all_counts 853 | elif normalize_kind in ('grid', 'all'): 854 | grp = [] 855 | for col in normalize: 856 | if col == self.row: 857 | grp.append(row_label) 858 | if col == self.col: 859 | grp.append(col_label) 860 | 861 | if len(grp) == 1: 862 | grp = grp[0] 863 | else: 864 | grp = tuple(grp) 865 | grp_val = all_counts[grp] 866 | 867 | if normalize_kind == 'grid': 868 | df = df / grp_val 869 | elif len(val_split) == 2: 870 | df = df / grp_val.unstack(self.split) 871 | elif df.index.name == grp_val.index.name: 872 | df = df.div(grp_val, axis=0) 873 | else: 874 | df = df / grp_val 875 | 876 | else: 877 | n += df.sum().sum() 878 | count_dict[key] = df 879 | 880 | if normalize is True: 881 | count_dict = {key: df / n for key, df in count_dict.items()} 882 | 883 | return count_dict 884 | 885 | 886 | class MPLCount(CountCommon, MPLCommon): 887 | pass 888 | 889 | 890 | class PlotlyCount(CountCommon, PlotlyCommon): 891 | pass 892 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dexplot 2 | 3 | [![](https://img.shields.io/pypi/v/dexplot)](https://pypi.org/project/dexplot) 4 | [![PyPI - License](https://img.shields.io/pypi/l/dexplot)](LICENSE) 5 | 6 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience. 7 | 8 | ## Goals 9 | 10 | The primary goals for dexplot are: 11 | 12 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots 13 | * Allow the user tremendous power without using matplotlib 14 | 15 | 16 | ## Installation 17 | 18 | `pip install dexplot` 19 | 20 | ## Built for long and wide data 21 | 22 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data. 23 | 24 | ![](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/long.png) 25 | 26 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price). 27 | 28 | ![](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/wide.png) 29 | 30 | ## Usage 31 | 32 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature: 33 | 34 | ```python 35 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...) 36 | ``` 37 | 38 | * `x` - Column name along the x-axis 39 | * `y` - Column name the y-axis 40 | * `data` - Pandas DataFrame 41 | * `aggfunc` - String of pandas aggregation function, 'min', 'max', 'mean', etc... 42 | * `split` - Column name to split data into distinct groups 43 | * `row` - Column name to split data into distinct subplots row-wise 44 | * `col` - Column name to split data into distinct subplots column-wise 45 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical. 46 | 47 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below. 48 | 49 | ## Families of plots 50 | 51 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner. 52 | 53 | * Aggregation 54 | * bar 55 | * line 56 | * scatter 57 | * count 58 | * Distribution 59 | * box 60 | * violin 61 | * hist 62 | * kde 63 | 64 | ## Comparison with Seaborn 65 | 66 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by Seaborn. Below is a list of the extra features in dexplot not found in seaborn 67 | 68 | * Ability to graph relative frequency and normalize over any number of variables 69 | * No need for multiple functions to do the same thing (far fewer public functions) 70 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot` 71 | * Pandas `groupby` methods available as strings 72 | * Ability to sort by values 73 | * Ability to sort x/y labels lexicographically 74 | * Ability to select most/least frequent groups 75 | * x/y labels are wrapped so that they don't overlap 76 | * Figure size (plus several other options) and available to change without using matplotlib 77 | * A matplotlib figure object is returned 78 | 79 | ## Examples 80 | 81 | Most of the examples below use long data. 82 | 83 | ## Aggregating plots - bar, line and scatter 84 | 85 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area. 86 | 87 | 88 | ```python 89 | import dexplot as dxp 90 | import pandas as pd 91 | airbnb = dxp.load_dataset('airbnb') 92 | airbnb.head() 93 | ``` 94 | 95 |
96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 |
neighborhoodproperty_typeaccommodatesbathroomsbedroomspricecleaning_feeratingsuperhostresponse_timelatitudelongitude
0ShawTownhouse163.5443325095.0Nowithin an hour38.90982-77.02016
1Brightwood ParkTownhouse43.541545097.0NoNaN38.95888-77.02554
2Capitol HillHouse21.51833597.0Yeswithin an hour38.88791-76.99668
3ShawHouse22.51475098.0NoNaN38.91331-77.02436
4Kalorama HeightsApartment31.011181591.0Nowithin an hour38.91933-77.04124
192 |
193 | 194 | 195 | 196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data. 197 | 198 | 199 | ```python 200 | airbnb.shape 201 | ``` 202 | 203 | 204 | 205 | 206 | (4581, 12) 207 | 208 | 209 | 210 | ### Vertical bar charts 211 | 212 | In order to performa an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the column names automatically wrap. 213 | 214 | 215 | ```python 216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median') 217 | ``` 218 | 219 | 220 | 221 | 222 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_7_0.png) 223 | 224 | 225 | 226 | Line and scatter plots can be created with the same command, just substituting the name of the function. They both are not good choices for the visualization since the grouping variable (neighborhood) has no meaningful order. 227 | 228 | 229 | ```python 230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median') 231 | ``` 232 | 233 | 234 | 235 | 236 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_9_0.png) 237 | 238 | 239 | 240 | 241 | ```python 242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median') 243 | ``` 244 | 245 | 246 | 247 | 248 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_10_0.png) 249 | 250 | 251 | 252 | ### Components of the groupby aggregation 253 | 254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components: 255 | 256 | * Grouping column - unique values of this column form independent groups (neighborhood) 257 | * Aggregating column - the column that will get summarized with a single value (price) 258 | * Aggregating function - a function that returns a single value (median) 259 | 260 | The general format for doing this in pandas is: 261 | 262 | ```python 263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'}) 264 | ``` 265 | 266 | Specifically, the following code is executed within dexplot. 267 | 268 | 269 | ```python 270 | airbnb.groupby('neighborhood').agg({'price': 'median'}) 271 | ``` 272 | 273 | 274 | 275 | 276 |
277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 |
price
neighborhood
Brightwood Park87.0
Capitol Hill129.5
Columbia Heights95.0
Dupont Circle125.0
Edgewood100.0
Kalorama Heights118.0
Shaw133.5
Union Station120.0
323 |
324 | 325 | 326 | 327 | ### Number and percent of missing values with `'countna'` and `'percna'` 328 | 329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group. 330 | 331 | 332 | ```python 333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna') 334 | ``` 335 | 336 | 337 | 338 | 339 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_14_0.png) 340 | 341 | 342 | 343 | ### Sorting the bars by values 344 | 345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value. 346 | 347 | * None - sort x/y axis labels alphabetically (default) 348 | * `asc` - sort values from least to greatest 349 | * `desc` - sort values from greatest to least 350 | 351 | 352 | ```python 353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc') 354 | ``` 355 | 356 | 357 | 358 | 359 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_16_0.png) 360 | 361 | 362 | 363 | Here, we sort the values from greatest to least. 364 | 365 | 366 | ```python 367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc') 368 | ``` 369 | 370 | 371 | 372 | 373 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_18_0.png) 374 | 375 | 376 | 377 | ### Specify order with `x_order` 378 | 379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars. 380 | 381 | 382 | ```python 383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station']) 385 | ``` 386 | 387 | 388 | 389 | 390 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_20_0.png) 391 | 392 | 393 | 394 | By default, `x_order` and all of the `_order` parameters are set to `'asc'` by default, which will order them alphabetically. Use the string `'desc'` to sort in the opposite direction. 395 | 396 | 397 | ```python 398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc') 399 | ``` 400 | 401 | 402 | 403 | 404 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_22_0.png) 405 | 406 | 407 | 408 | ### Filter for the neighborhoods with most/least frequency of occurrence 409 | 410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column. 411 | 412 | 413 | ```python 414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 415 | x_order='top 4') 416 | ``` 417 | 418 | 419 | 420 | 421 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_24_0.png) 422 | 423 | 424 | 425 | We can verify that the four neighborhoods are the most common. 426 | 427 | 428 | ```python 429 | airbnb['neighborhood'].value_counts() 430 | ``` 431 | 432 | 433 | 434 | 435 | Columbia Heights 773 436 | Union Station 713 437 | Capitol Hill 654 438 | Edgewood 610 439 | Dupont Circle 549 440 | Shaw 514 441 | Brightwood Park 406 442 | Kalorama Heights 362 443 | Name: neighborhood, dtype: int64 444 | 445 | 446 | 447 | ### Horizontal bars 448 | 449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis. 450 | 451 | 452 | ```python 453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', 454 | orientation='h', sort_values='desc') 455 | ``` 456 | 457 | 458 | 459 | 460 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_28_0.png) 461 | 462 | 463 | 464 | Switching orientation is possible for most other plots. 465 | 466 | 467 | ```python 468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h') 469 | ``` 470 | 471 | 472 | 473 | 474 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_30_0.png) 475 | 476 | 477 | 478 | ### Split bars into groups 479 | 480 | You can split each bar into further groups by setting the `split` parameter to another column. 481 | 482 | 483 | ```python 484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost') 485 | ``` 486 | 487 | 488 | 489 | 490 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_32_0.png) 491 | 492 | 493 | 494 | We can use the `pivot_table` method to verify the results in pandas. 495 | 496 | 497 | ```python 498 | airbnb.pivot_table(index='superhost', columns='neighborhood', 499 | values='price', aggfunc='median') 500 | ``` 501 | 502 | 503 | 504 | 505 |
506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
neighborhoodBrightwood ParkCapitol HillColumbia HeightsDupont CircleEdgewoodKalorama HeightsShawUnion Station
superhost
No85.0129.090.5120.0100.0110.0130.0120.0
Yes90.0130.0103.0135.0100.0124.0135.0125.0
556 |
557 | 558 | 559 | 560 | Set the order of the unique split values with `split_order`, which can also act as a filter. 561 | 562 | 563 | ```python 564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 565 | split='superhost', split_order=['Yes', 'No']) 566 | ``` 567 | 568 | 569 | 570 | 571 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_36_0.png) 572 | 573 | 574 | 575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite. 576 | 577 | 578 | ```python 579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 580 | split='property_type', split_order='desc') 581 | ``` 582 | 583 | 584 | 585 | 586 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_38_0.png) 587 | 588 | 589 | 590 | Filtering for the most/least frequent split categories is possible. 591 | 592 | 593 | ```python 594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 595 | split='property_type', split_order='bottom 2') 596 | ``` 597 | 598 | 599 | 600 | 601 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_40_0.png) 602 | 603 | 604 | 605 | Verifying that the least frequent property types are Townhouse and Condominium. 606 | 607 | 608 | ```python 609 | airbnb['property_type'].value_counts() 610 | ``` 611 | 612 | 613 | 614 | 615 | Apartment 2403 616 | House 877 617 | Townhouse 824 618 | Condominium 477 619 | Name: property_type, dtype: int64 620 | 621 | 622 | 623 | ### Stacked bar charts 624 | 625 | Stack all the split groups one on top of the other by setting `stacked` to `True`. 626 | 627 | 628 | ```python 629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 630 | split='superhost', split_order=['Yes', 'No'], stacked=True) 631 | ``` 632 | 633 | 634 | 635 | 636 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_44_0.png) 637 | 638 | 639 | 640 | ### Split into multiple plots 641 | 642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot. 643 | 644 | 645 | ```python 646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 647 | split='superhost', col='property_type') 648 | ``` 649 | 650 | 651 | 652 | 653 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_46_0.png) 654 | 655 | 656 | 657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer to set the maximum number of plots per row/col. We also specify the `col_order` to be descending alphabetically. 658 | 659 | 660 | ```python 661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 662 | split='superhost', col='property_type', wrap=2, col_order='desc') 663 | ``` 664 | 665 | 666 | 667 | 668 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_48_0.png) 669 | 670 | 671 | 672 | Use `col_order` to both filter and set a specific order for the plots. 673 | 674 | 675 | ```python 676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', 677 | split='superhost', col='property_type', col_order=['House', 'Condominium']) 678 | ``` 679 | 680 | 681 | 682 | 683 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_50_0.png) 684 | 685 | 686 | 687 | Splits can be made simultaneously along row and columns. 688 | 689 | 690 | ```python 691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost', 692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 693 | row='bedrooms', row_order=[1, 2, 3]) 694 | ``` 695 | 696 | 697 | 698 | 699 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_52_0.png) 700 | 701 | 702 | 703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`. 704 | 705 | 706 | ```python 707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost', 708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 709 | row='bedrooms', row_order=[1, 2, 3], sharey=False) 710 | ``` 711 | 712 | 713 | 714 | 715 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_54_0.png) 716 | 717 | 718 | 719 | ### Set the width of each bar with `size` 720 | 721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value. 722 | 723 | 724 | ```python 725 | dxp.bar(x='neighborhood', y='price', data=airbnb, 726 | aggfunc='median', split='property_type', 727 | split_order=['Apartment', 'House'], 728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5) 729 | ``` 730 | 731 | 732 | 733 | 734 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_56_0.png) 735 | 736 | 737 | 738 | ### Splitting line plots 739 | 740 | All the other aggregating plots work similarly. 741 | 742 | 743 | ```python 744 | dxp.line(x='neighborhood', y='price', data=airbnb, 745 | aggfunc='median', split='property_type', 746 | split_order=['Apartment', 'House'], 747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station']) 748 | ``` 749 | 750 | 751 | 752 | 753 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_58_0.png) 754 | 755 | 756 | 757 | ## Distribution plots - box, violin, histogram, kde 758 | 759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed. 760 | 761 | ### Box plots 762 | 763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers are placed at 1.5 times the difference between the third and first quartiles (Interquartile range (IQR)). Fliers are the points outside this range and plotted individually. By default, both box and violin plots are plotted horizontally. 764 | 765 | 766 | ```python 767 | dxp.box(x='price', y='neighborhood', data=airbnb) 768 | ``` 769 | 770 | 771 | 772 | 773 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_60_0.png) 774 | 775 | 776 | 777 | Split the groups in the same manner as with the aggregation plots. 778 | 779 | 780 | ```python 781 | dxp.box(x='price', y='neighborhood', data=airbnb, 782 | split='superhost', split_order=['Yes', 'No']) 783 | ``` 784 | 785 | 786 | 787 | 788 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_62_0.png) 789 | 790 | 791 | 792 | Order the appearance of the splits alphabetically (in descending order here). 793 | 794 | 795 | ```python 796 | dxp.box(x='price', y='neighborhood', data=airbnb, 797 | split='property_type', split_order='desc') 798 | ``` 799 | 800 | 801 | 802 | 803 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_64_0.png) 804 | 805 | 806 | 807 | ### Filter range of values with `x_order` 808 | 809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`. 810 | 811 | 812 | ```python 813 | dxp.box(x='price', y='neighborhood', data=airbnb, 814 | split='superhost', x_order=[50, 250]) 815 | ``` 816 | 817 | 818 | 819 | 820 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_66_0.png) 821 | 822 | 823 | 824 | Change the `x` and `y` while setting `orientation` to make vertical bar plots. 825 | 826 | 827 | ```python 828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v', 829 | split='property_type', split_order='top 2') 830 | ``` 831 | 832 | 833 | 834 | 835 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_68_0.png) 836 | 837 | 838 | 839 | Violin plots work identically to box plots, but show "violins", kernel density plots duplicated on both sides of a line. 840 | 841 | 842 | ```python 843 | dxp.violin(x='price', y='neighborhood', data=airbnb, 844 | split='superhost', split_order=['Yes', 'No']) 845 | ``` 846 | 847 | 848 | 849 | 850 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_70_0.png) 851 | 852 | 853 | 854 | Splitting by rows and columns is possible as well with distribution plots. 855 | 856 | 857 | ```python 858 | dxp.box(x='price', y='neighborhood', data=airbnb,split='superhost', 859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'], 860 | row='bedrooms', row_order=[1, 2]) 861 | ``` 862 | 863 | 864 | 865 | 866 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_72_0.png) 867 | 868 | 869 | 870 | ### Histograms 871 | 872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you give it a single numeric column. A vertical histogram with 20 bins of the counts is created by default. 873 | 874 | 875 | ```python 876 | dxp.hist(val='price', data=airbnb) 877 | ``` 878 | 879 | 880 | 881 | 882 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_74_0.png) 883 | 884 | 885 | 886 | We can use `split` just like we did above and also create horizontal histograms. 887 | 888 | 889 | ```python 890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15) 891 | ``` 892 | 893 | 894 | 895 | 896 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_76_0.png) 897 | 898 | 899 | 900 | Here, we customize our histogram by plotting the cumulative density as opposed to the raw frequency count using the outline of the bars ('step'). 901 | 902 | 903 | ```python 904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3], 905 | bins=30, density=True, histtype='step', cumulative=True) 906 | ``` 907 | 908 | 909 | 910 | 911 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_78_0.png) 912 | 913 | 914 | 915 | ### KDE Plots 916 | 917 | Kernel density estimates provide an estimate for the probability distribution of a continuous variable. Here, we examine how price is distributed by bedroom. 918 | 919 | 920 | ```python 921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3]) 922 | ``` 923 | 924 | 925 | 926 | 927 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_80_0.png) 928 | 929 | 930 | 931 | Graph the cumulative distribution instead on multiple plots. 932 | 933 | 934 | ```python 935 | dxp.kde(x='price', data=airbnb, split='bedrooms', 936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2) 937 | ``` 938 | 939 | 940 | 941 | 942 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_82_0.png) 943 | 944 | 945 | 946 | ### Two-dimensional KDE's 947 | 948 | Provide two numeric columns to `x` and `y` to get a two dimensional KDE. 949 | 950 | 951 | ```python 952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb) 953 | ``` 954 | 955 | 956 | 957 | 958 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_84_0.png) 959 | 960 | 961 | 962 | Create a grid of two-dimensional KDE's. 963 | 964 | 965 | ```python 966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3) 967 | ``` 968 | 969 | 970 | 971 | 972 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_86_0.png) 973 | 974 | 975 | 976 | ## Count plots 977 | 978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order. 979 | 980 | 981 | ```python 982 | dxp.count(val='neighborhood', data=airbnb) 983 | ``` 984 | 985 | 986 | 987 | 988 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_88_0.png) 989 | 990 | 991 | 992 | In pandas, this is a straightforward call to the `value_counts` method. 993 | 994 | 995 | ```python 996 | airbnb['neighborhood'].value_counts() 997 | ``` 998 | 999 | 1000 | 1001 | 1002 | Columbia Heights 773 1003 | Union Station 713 1004 | Capitol Hill 654 1005 | Edgewood 610 1006 | Dupont Circle 549 1007 | Shaw 514 1008 | Brightwood Park 406 1009 | Kalorama Heights 362 1010 | Name: neighborhood, dtype: int64 1011 | 1012 | 1013 | 1014 | ### Relative frequency with `normalize` 1015 | 1016 | Instead of the raw counts, get the relative frequency by setting normalize to `True`. 1017 | 1018 | 1019 | ```python 1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True) 1021 | ``` 1022 | 1023 | 1024 | 1025 | 1026 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_92_0.png) 1027 | 1028 | 1029 | 1030 | Here, we split by property type. 1031 | 1032 | 1033 | ```python 1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type') 1035 | ``` 1036 | 1037 | 1038 | 1039 | 1040 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_94_0.png) 1041 | 1042 | 1043 | 1044 | In pandas, this is done with the `crosstab` function. 1045 | 1046 | 1047 | ```python 1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood']) 1049 | ``` 1050 | 1051 | 1052 | 1053 | 1054 |
1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 | 1106 | 1107 | 1108 | 1109 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | 1116 | 1117 | 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1126 |
neighborhoodBrightwood ParkCapitol HillColumbia HeightsDupont CircleEdgewoodKalorama HeightsShawUnion Station
property_type
Apartment167299374397244284315323
Condominium3570976265425254
House131137157471462361175
Townhouse73148145431551386161
1127 |
1128 | 1129 | 1130 | 1131 | Horizontal stacked count plots. 1132 | 1133 | 1134 | ```python 1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type', 1136 | orientation='h', stacked=True, col='superhost') 1137 | ``` 1138 | 1139 | 1140 | 1141 | 1142 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_98_0.png) 1143 | 1144 | 1145 | 1146 | ### Normalize over different variables 1147 | 1148 | Setting `normalize` to `True`, returns the relative frequency with respect to all of the data. You can normalize over any of the variables provided. 1149 | 1150 | 1151 | ```python 1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood', 1153 | title='Relative Frequency by Neighborhood') 1154 | ``` 1155 | 1156 | 1157 | 1158 | 1159 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_100_0.png) 1160 | 1161 | 1162 | 1163 | Normalize over several variables at once with a list. 1164 | 1165 | 1166 | ```python 1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost', 1168 | row='property_type', col='bedrooms', col_order=[1, 2], 1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True) 1170 | ``` 1171 | 1172 | 1173 | 1174 | 1175 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_102_0.png) 1176 | 1177 | 1178 | 1179 | ## Wide data 1180 | 1181 | Dexplot can also plot wide data, or data where no aggregation happens. Here is a scatter plot of the location of each listing. 1182 | 1183 | 1184 | ```python 1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb, 1186 | split='neighborhood', col='bedrooms', col_order=[2, 3]) 1187 | ``` 1188 | 1189 | 1190 | 1191 | 1192 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_104_0.png) 1193 | 1194 | 1195 | 1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`. 1197 | 1198 | 1199 | ```python 1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type', 1201 | values='price', aggfunc='mean') 1202 | df 1203 | ``` 1204 | 1205 | 1206 | 1207 | 1208 |
1209 | 1210 | 1211 | 1212 | 1213 | 1214 | 1215 | 1216 | 1217 | 1218 | 1219 | 1220 | 1221 | 1222 | 1223 | 1224 | 1225 | 1226 | 1227 | 1228 | 1229 | 1230 | 1231 | 1232 | 1233 | 1234 | 1235 | 1236 | 1237 | 1238 | 1239 | 1240 | 1241 | 1242 | 1243 | 1244 | 1245 | 1246 | 1247 | 1248 | 1249 | 1250 | 1251 | 1252 | 1253 | 1254 | 1255 | 1256 | 1257 | 1258 | 1259 | 1260 | 1261 | 1262 | 1263 | 1264 | 1265 | 1266 | 1267 | 1268 | 1269 | 1270 | 1271 | 1272 | 1273 | 1274 | 1275 | 1276 | 1277 | 1278 | 1279 | 1280 | 1281 | 1282 | 1283 | 1284 | 1285 |
property_typeApartmentCondominiumHouseTownhouse
neighborhood
Brightwood Park96.119760105.000000121.671756133.479452
Capitol Hill141.210702104.200000170.153285184.459459
Columbia Heights114.676471126.773196135.292994124.358621
Dupont Circle146.858942130.709677179.574468139.348837
Edgewood108.508197112.846154156.335616147.503226
Kalorama Heights122.542254155.92857192.695652158.230769
Shaw153.888889158.500000202.114754173.279070
Union Station128.458204133.833333162.748571162.167702
1286 |
1287 | 1288 | 1289 | 1290 | 1291 | ```python 1292 | dxp.bar(data=df, orientation='h') 1293 | ``` 1294 | 1295 | 1296 | 1297 | 1298 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_107_0.png) 1299 | 1300 | 1301 | 1302 | ### Time series 1303 | 1304 | 1305 | ```python 1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date') 1307 | stocks.head() 1308 | ``` 1309 | 1310 | 1311 | 1312 | 1313 |
1314 | 1315 | 1316 | 1317 | 1318 | 1319 | 1320 | 1321 | 1322 | 1323 | 1324 | 1325 | 1326 | 1327 | 1328 | 1329 | 1330 | 1331 | 1332 | 1333 | 1334 | 1335 | 1336 | 1337 | 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 1344 | 1345 | 1346 | 1347 | 1348 | 1349 | 1350 | 1351 | 1352 | 1353 | 1354 | 1355 | 1356 | 1357 | 1358 | 1359 | 1360 | 1361 | 1362 | 1363 | 1364 | 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | 1371 | 1372 | 1373 | 1374 | 1375 | 1376 | 1377 | 1378 | 1379 | 1380 | 1381 | 1382 | 1383 | 1384 | 1385 | 1386 | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | 1399 | 1400 | 1401 | 1402 | 1403 | 1404 | 1405 | 1406 | 1407 | 1408 | 1409 | 1410 |
MSFTAAPLSLBAMZNTSLAXOMWMTTFBV
date
1999-10-2529.842.3217.0282.75NaN21.4538.9916.78NaNNaN
1999-10-2629.822.3416.6581.25NaN20.8937.1117.28NaNNaN
1999-10-2729.332.3816.5275.94NaN20.8036.9418.27NaNNaN
1999-10-2829.012.4316.5971.00NaN21.1938.8519.79NaNNaN
1999-10-2929.882.5017.2170.62NaN21.4739.2520.00NaNNaN
1411 |
1412 | 1413 | 1414 | 1415 | 1416 | ```python 1417 | dxp.line(data=stocks.head(500)) 1418 | ``` 1419 | 1420 | 1421 | 1422 | 1423 | ![png](https://raw.githubusercontent.com/dexplo/dexplot/gh-pages/images/output_110_0.png) 1424 | 1425 | 1426 | --------------------------------------------------------------------------------