├── tests
│   ├── __init__.py
│   ├── test_box.py
│   ├── test_line.py
│   ├── test_bar.py
│   └── test_scatter.py
├── MANIFEST.in
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── python-package.yml
├── dexplot
│   ├── colors
│   │   ├── __init__.py
│   │   ├── _categories.py
│   │   └── _app.py
│   ├── __init__.py
│   ├── _pandas_accessor.py
│   ├── _utils.py
│   ├── _heat.py
│   ├── _plotly.py
│   ├── _plots.py
│   └── _common_plot.py
├── Upcoming Features.md
├── docs
│   ├── css
│   │   └── style.css
│   ├── overrides
│   │   └── main.html
│   └── index.md
├── setup.py
├── mkdocs.yml
├── LICENSE
├── .gitignore
├── notebooks
│   └── colormaps.ipynb
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | custom: ['https://dunderdata.com']
2 |
--------------------------------------------------------------------------------
/dexplot/colors/__init__.py:
--------------------------------------------------------------------------------
1 | from ._categories import sequential, diverging, cyclic, qualitative, misc, all_cmaps
2 |
3 | import importlib.util
4 | if importlib.util.find_spec('ipywidgets') and importlib.util.find_spec('IPython'):
5 | from ._app import color_viewer
--------------------------------------------------------------------------------
/dexplot/__init__.py:
--------------------------------------------------------------------------------
1 | from ._plots import line, bar, box, scatter, violin, hist, count, kde
2 | from ._utils import load_dataset
3 | from ._plotly import bar_plotly, line_plotly, scatter_plotly, count_plotly, box_plotly, violin_plotly
4 | from . import colors
5 | from ._pandas_accessor import _DexplotAccessor
6 |
7 | __version__ = '0.1.4'
8 |
--------------------------------------------------------------------------------
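
The imports above define the package's entire public surface. A minimal usage sketch, assuming the bundled `airbnb` dataset (the one exercised throughout the test suite) can be downloaded:

    import dexplot as dxp

    # load_dataset fetches a CSV from the dexplot GitHub repository
    airbnb = dxp.load_dataset('airbnb')

    # the tests treat the return value of each plotting function as a matplotlib Figure
    fig = dxp.box(x='price', y='neighborhood', data=airbnb)
    fig.savefig('price_by_neighborhood.png')
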
/Upcoming Features.md:
--------------------------------------------------------------------------------
1 | ## Upcoming Features
2 |
3 | * allow user access to entire dataframe in custom aggfunc
4 | * templates for x and y labels and titles
5 | * color picker with ipywidgets
6 | * ipywidgets full app integration
7 | * add other generic kwargs, ec, lw, alpha, etc...
8 | * [ ] kde with annotations, allow for binning
9 | * [ ] scatter with kde
10 | * [ ] allow kde and histograms to be grouped
11 | * [ ] use a categorical variable to size scatter plot
12 | * [ ] allow user to specify a specific matplotlib axes
13 | * [ ] add interaction with ipywidgets
14 | * [ ] stacked area plot
15 | * [ ] rolling averages for line plots
16 | * [ ] add parameter `bins` to bin numeric x
17 | * [ ] option to add counts to all aggregate plots
18 |
19 | ## Other plots
20 |
21 | * heat
22 | * hexplot
23 | * mosaic
--------------------------------------------------------------------------------
/docs/css/style.css:
--------------------------------------------------------------------------------
1 | table {
2 | background-color: transparent;
3 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
4 | margin-left:0;
5 | margin-right:0;
6 | border:none;
7 | border-collapse: collapse;
8 | border-spacing:0;
9 | color:black;
10 | font-size:13px;
11 | table-layout:fixed;
12 | overflow: scroll;
13 | }
14 | thead {
15 | border-bottom:1px solid black;vertical-align:bottom;
16 | }
17 | tr, th, td {
18 | text-align:right;
19 | vertical-align: middle;
20 | padding:0.5em 0.5em;
21 | line-height:normal;
22 | white-space:normal;
23 | max-width:none;
24 | border:none;
25 | }
26 | th {
27 | font-weight:bold;
28 | text-align:left;
29 | }
30 | tbody tr:nth-child(odd){
31 | background:#f5f5f5;
32 | }
33 | :link{
34 | text-decoration:underline;
35 | }
36 |
37 | .vid {
38 | display: flex;
39 | justify-content: center;
40 | }
41 | .vid video {
42 | width: 85%;
43 | }
44 |
45 | .dataframe {
46 | overflow: scroll;
47 | }
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | matrix:
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 | python-version: [3.6, 3.7, 3.8]
20 |
21 | steps:
22 | - uses: actions/checkout@v2
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install pytest matplotlib pandas scipy plotly
32 | - name: Test with pytest
33 | run: pytest
34 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open('dexplot/__init__.py', 'r') as f:
4 | for line in f:
5 | if line.startswith('__version__'):
6 | version = line.split("'")[1]
7 |
8 | with open("README.md", "r") as fh:
9 | long_description = fh.read()
10 |
11 | setuptools.setup(
12 | name="dexplot",
13 | version=version,
14 | author="Ted Petrou",
15 | author_email="petrou.theodore@gmail.com",
16 | description="Powerful and intuitive data visualization library using matplotlib for both long and wide data",
17 | long_description=long_description,
18 | long_description_content_type="text/markdown",
19 | keywords="data visualization matplotlib pandas",
20 | url="https://github.com/dexplo/dexplot",
21 | packages=setuptools.find_packages(),
22 | classifiers=[
23 | "Programming Language :: Python :: 3",
24 | "License :: OSI Approved :: BSD License",
25 | "Operating System :: OS Independent",
26 | "Framework :: Matplotlib"
27 | ],
28 | install_requires=['numpy>=1.15',
29 |                       'scipy>=1.0',
30 | 'matplotlib>=3.1',
31 | 'pandas>=0.24'],
32 | extras_require={
33 | "apps": ["ipywidgets"],
34 | },
35 | python_requires='>=3.6'
36 | )
--------------------------------------------------------------------------------
/dexplot/_pandas_accessor.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pandas as pd
4 |
5 | from . import _plots as plots
6 |
7 | def get_doc(func):
8 | doc = func.__doc__
9 | return re.sub('data :.*(?=split :)', '', doc, count=1, flags=re.S)
10 |
11 |
12 | @pd.api.extensions.register_dataframe_accessor("dexplot")
13 | class _DexplotAccessor:
14 | def __init__(self, pandas_obj):
15 | self._obj = pandas_obj
16 |
17 | def box(self, x=None, y=None, split=None, row=None, col=None, x_order=None,
18 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
19 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
20 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
21 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
22 | groupgap=0, box_kwargs=None):
23 | return plots.box(x, y, self._obj, split, row, col, x_order, y_order, split_order,
24 | row_order, col_order, orientation, wrap, figsize, title, sharex,
25 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
26 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap, box_kwargs)
27 |
28 | _DexplotAccessor.box.__doc__ = get_doc(plots.box)
--------------------------------------------------------------------------------
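
Because the accessor is registered under the name "dexplot", importing the package gives every DataFrame a `.dexplot` namespace whose `box` method forwards to `dexplot._plots.box` with the DataFrame passed as `data`. A short sketch, reusing column names from the airbnb dataset in the tests:

    import dexplot as dxp

    airbnb = dxp.load_dataset('airbnb')

    # equivalent to dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost')
    fig = airbnb.dexplot.box(x='price', y='neighborhood', split='superhost')
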
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Dexplot
2 | site_description: Dexplot is a powerful and intuitive Python data visualization library using matplotlib for both long and wide data
3 | site_author: Ted Petrou
4 | site_url: https://www.dexplo.org/dexplot
5 | repo_url: https://github.com/dexplo/dexplot
6 | copyright: Copyright © 2020 Ted Petrou
7 | google_analytics:
8 | - UA-119777567-7
9 | - dexplo.org
10 | theme:
11 | name: material
12 | custom_dir: docs/overrides
13 | features:
14 | - tabs
15 |
16 | nav:
17 | - Home: index.md
18 | - More Dexplo Libraries:
19 | - Dexplo: https://www.dexplo.org
20 |
21 | extra_css:
22 | - css/style.css
23 |
24 | extra:
25 | social:
26 | - icon: fontawesome/brands/github-alt
27 | link: https://github.com/dexplo
28 | - icon: fontawesome/brands/twitter
29 | link: https://twitter.com/TedPetrou
30 | - icon: fontawesome/brands/linkedin
31 | link: https://linkedin.com/in/TedPetrou
32 | - icon: fontawesome/brands/youtube
33 | link: https://www.youtube.com/c/dunderdata
34 | - icon: fontawesome/brands/facebook
35 | link: https://www.facebook.com/dunderdata
36 |
37 | markdown_extensions:
38 | - admonition
39 | - toc:
40 | permalink: True
41 | - codehilite:
42 | guess_lang: false
43 | - pymdownx.superfences
44 |
45 | plugins:
46 | - search
47 | - macros
48 | - minify:
49 | minify_html: true
50 |
51 | extra_javascript:
52 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, dexplo
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/dexplot/_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from scipy.stats import gaussian_kde
4 |
5 | RAW_URL = 'https://raw.githubusercontent.com/dexplo/dexplot/master/data/{name}.csv'
6 | DATASETS = ['airbnb']
7 |
8 | def load_dataset(name):
9 | """
10 | Load a dataset. Must be connected to the internet
11 |
12 | Datasets
13 | --------
14 | airbnb
15 | """
16 | if name not in DATASETS:
17 | raise KeyError(f'Dataset {name} does not exist. Choose one of the following: {DATASETS}')
18 |
19 | url = RAW_URL.format(name=name)
20 | return pd.read_csv(url)
21 |
22 |
23 | def calculate_density_1d(data, cumulative=False):
24 | density_func = gaussian_kde(data)
25 | min_x, max_x = data.min(), data.max()
26 | range_x = max_x - min_x
27 | min_x = min_x - 2 * range_x
28 | max_x = max_x + 2 * range_x
29 | x = np.linspace(min_x, max_x, 400)
30 | density = density_func(x)
31 | max_density = density.max()
32 | filt = density > max_density / 1000
33 | x = x[filt]
34 | density = density[filt]
35 | if cumulative:
36 | density = np.cumsum(density)
37 | density = 1 / density.max() * density
38 | return x, density
39 |
40 | def calculate_density_2d(x, y):
41 | xmin, xmax = x.min(), x.max()
42 | ymin, ymax = y.min(), y.max()
43 | X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
44 | positions = np.vstack([X.ravel(), Y.ravel()])
45 | values = np.vstack([x, y])
46 | kernel = gaussian_kde(values)
47 | Z = np.reshape(kernel(positions).T, X.shape)
48 | return xmin, xmax, ymin, ymax, np.rot90(Z)
49 |
50 |
51 |
--------------------------------------------------------------------------------
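
A sketch of how the two density helpers can be exercised on their own; `_utils` is a private module, so the direct import below is for illustration only and uses synthetic data rather than any dexplot API:

    import numpy as np
    import matplotlib.pyplot as plt
    from dexplot._utils import calculate_density_1d, calculate_density_2d

    rng = np.random.default_rng(0)
    a = rng.normal(size=500)
    b = a + rng.normal(scale=0.5, size=500)

    # 1D KDE: an x grid and the (optionally cumulative) density estimate
    x, density = calculate_density_1d(a)
    plt.plot(x, density)

    # 2D KDE: bounding box plus a rotated 100x100 density grid ready for imshow
    xmin, xmax, ymin, ymax, Z = calculate_density_2d(a, b)
    plt.figure()
    plt.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto')
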
/docs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block extrahead %}
4 |
5 | {% if page and page.meta and page.meta.title %}
6 |
7 | {% elif page and page.title and not page.is_homepage %}
8 |
9 | {% else %}
10 |
11 | {% endif %}
12 |
13 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | {% if page and page.meta and page.meta.title %}
23 |
24 | {% elif page and page.title and not page.is_homepage %}
25 |
26 | {% else %}
27 |
28 | {% endif %}
29 |
30 |
32 | {% endblock %}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | .DS_Store
10 | .idea
11 | docs/images
12 | notebooks/
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .env
91 | .venv
92 | env/
93 | venv/
94 | ENV/
95 | env.bak/
96 | venv.bak/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 |
--------------------------------------------------------------------------------
/notebooks/colormaps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import plotly"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "p_sequential = [k.lower() for k, v in vars(plotly.colors.sequential).items() if isinstance(v, list) \n",
19 | " and not k.startswith('_')]\n",
20 | "p_diverging = [k.lower() for k, v in vars(plotly.colors.diverging).items() if isinstance(v, list) \n",
21 | " and not k.startswith('_')]\n",
22 | "p_cyclic = [k.lower() for k, v in vars(plotly.colors.cyclical).items() if isinstance(v, list) \n",
23 | " and not k.startswith('_')]\n",
24 | "p_qual = [k.lower() for k, v in vars(plotly.colors.qualitative).items() if isinstance(v, list) \n",
25 | " and not k.startswith('_')]\n",
26 | "p_qual += ['dark12', 'dark12_r']"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "mpl_sequential = ['viridis', 'plasma', 'inferno', 'magma', 'cividis',\n",
36 | " 'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',\n",
37 | " 'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',\n",
38 | " 'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',\n",
39 | " 'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',\n",
40 | " 'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',\n",
41 | " 'hot', 'afmhot', 'gist_heat', 'copper'] \n",
42 | "mpl_diverging = ['PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',\n",
43 | " 'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']\n",
44 | "mpl_cyclic = ['twilight', 'twilight_shifted', 'hsv']\n",
45 | "mpl_qual = ['Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3',\n",
46 | " 'tab10', 'tab20', 'tab20b', 'tab20c']\n",
47 | "mpl_misc = ['flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',\n",
48 | " 'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',\n",
49 | " 'gist_rainbow', 'rainbow', 'jet', 'nipy_spectral', 'gist_ncar']\n",
50 | "\n",
51 | "def double(colors):\n",
52 | " a = []\n",
53 | " for color in colors:\n",
54 | " c = color.lower()\n",
55 | " a.append(c)\n",
56 | " a.append(c + '_r')\n",
57 | " return a\n",
58 | " \n",
59 | "mpl_sequential = double(mpl_sequential)\n",
60 | "mpl_diverging = double(mpl_diverging)\n",
61 | "mpl_cyclic = double(mpl_cyclic)\n",
62 | "mpl_qual = double(mpl_qual)\n",
63 | "mpl_misc = double(mpl_misc)\n",
64 | "\n",
65 | "seq = sorted(set(mpl_sequential + p_sequential))\n",
66 | "diverging = sorted(set(mpl_diverging + p_diverging))\n",
67 | "cyclic = sorted(set(mpl_cyclic + p_cyclic))\n",
68 | "qual = sorted(set(mpl_qual + p_qual))\n",
69 | "misc = sorted(set(mpl_misc))"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "Python 3",
83 | "language": "python",
84 | "name": "python3"
85 | },
86 | "language_info": {
87 | "codemirror_mode": {
88 | "name": "ipython",
89 | "version": 3
90 | },
91 | "file_extension": ".py",
92 | "mimetype": "text/x-python",
93 | "name": "python",
94 | "nbconvert_exporter": "python",
95 | "pygments_lexer": "ipython3",
96 | "version": "3.8.3"
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 4
101 | }
102 |
--------------------------------------------------------------------------------
/tests/test_box.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 |
9 | class TestSort:
10 |
11 | def test_lex_asc(self):
12 | fig = dxp.box(x='price', y='neighborhood', data=airbnb)
13 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
14 | correct = sorted(ticklabels)
15 | assert ticklabels == correct
16 |
17 | def test_lex_desc(self):
18 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, y_order='desc')
19 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
20 | correct = sorted(ticklabels, reverse=True)
21 | assert ticklabels == correct
22 |
23 | def test_asc_values(self):
24 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='asc')
25 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
26 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
27 | values = [p.get_height() for p in fig.axes[0].patches]
28 |
29 |
30 | def test_desc_values(self):
31 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='desc')
32 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
33 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
34 |
35 |
36 | class TestOrder:
37 |
38 | def test_x_order(self):
39 | dxp.box(x='price', y='neighborhood', data=airbnb,
40 | y_order=['Dupont Circle', 'Edgewood', 'Union Station'])
41 |
42 | with pytest.raises(ValueError):
43 | dxp.box(x='price', y='neighborhood', data=airbnb,
44 | y_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
45 |
46 |
47 | class TestVertical:
48 |
49 | def test_vert(self):
50 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v')
51 |
52 |
53 | class TestSplit:
54 |
55 | def test_split(self):
56 | dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost')
57 |
58 | def test_split_order(self):
59 | dxp.box(x='price', y='neighborhood', data=airbnb,
60 | split='superhost', split_order=['Yes', 'No'])
61 |
62 | def test_stacked(self):
63 | dxp.box(x='price', y='neighborhood', data=airbnb,
64 | split='superhost', split_order=['Yes', 'No'])
65 |
66 |
67 | class TestRowCol:
68 |
69 | def test_col(self):
70 | dxp.box(x='price', y='neighborhood', data=airbnb,
71 | split='superhost', col='property_type')
72 |
73 | def test_col_wrap(self):
74 | dxp.box(x='price', y='neighborhood', data=airbnb,
75 | split='superhost', col='property_type', wrap=2)
76 |
77 | def test_col_order(self):
78 | dxp.box(x='price', y='neighborhood', data=airbnb,
79 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
80 |
81 | def test_row(self):
82 | dxp.box(x='price', y='neighborhood', data=airbnb,
83 | split='superhost', row='property_type')
84 |
85 | def test_row_order(self):
86 | dxp.box(x='price', y='neighborhood', data=airbnb,
87 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
88 |
89 | def test_row_wrap(self):
90 | dxp.box(x='price', y='neighborhood', data=airbnb,
91 | split='superhost', row='property_type', wrap=2)
92 |
93 | def test_row_col(self):
94 | dxp.box(x='price', y='neighborhood', data=airbnb,
95 | split='superhost', col='property_type',
96 | col_order=['House', 'Condominium', 'Apartment'],
97 | row='bedrooms', row_order=[0, 1, 2, 3])
98 |
99 | def test_sharex(self):
100 | dxp.box(x='price', y='neighborhood', data=airbnb,
101 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
102 | row='bedrooms', row_order=[1, 2, 3], sharex=False)
103 |
--------------------------------------------------------------------------------
/dexplot/colors/_categories.py:
--------------------------------------------------------------------------------
1 | from ._colormaps import colormaps
2 |
3 | def set_attrs(obj, cmaps):
4 | for cmap in cmaps:
5 | setattr(obj, cmap, colormaps[cmap])
6 |
7 | sequential_colormaps = [
8 | 'afmhot', 'afmhot_r', 'aggrnyl', 'aggrnyl_r', 'agsunset', 'agsunset_r', 'algae', 'algae_r',
9 | 'amp', 'amp_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'blackbody', 'blackbody_r',
10 | 'bluered', 'bluered_r', 'blues', 'blues_r', 'blugrn', 'blugrn_r', 'bluyl', 'bluyl_r',
11 | 'bone', 'bone_r', 'brwnyl', 'brwnyl_r', 'bugn', 'bugn_r', 'bupu', 'bupu_r', 'burg', 'burg_r',
12 | 'burgyl', 'burgyl_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'copper', 'copper_r',
13 | 'darkmint', 'darkmint_r', 'deep', 'deep_r', 'dense', 'dense_r', 'electric', 'electric_r',
14 | 'emrld', 'emrld_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_yarg',
15 | 'gist_yarg_r', 'gnbu', 'gnbu_r', 'gray', 'gray_r', 'greens', 'greens_r', 'greys', 'greys_r',
16 | 'haline', 'haline_r', 'hot', 'hot_r', 'ice', 'ice_r', 'inferno', 'inferno_r', 'jet', 'jet_r',
17 | 'magenta', 'magenta_r', 'magma', 'magma_r', 'matter', 'matter_r', 'mint', 'mint_r', 'oranges',
18 | 'oranges_r', 'orrd', 'orrd_r', 'oryel', 'oryel_r', 'peach', 'peach_r', 'pink', 'pink_r',
19 | 'pinkyl', 'pinkyl_r', 'plasma', 'plasma_r', 'plotly3', 'plotly3_r', 'pubu', 'pubu_r', 'pubugn',
20 | 'pubugn_r', 'purd', 'purd_r', 'purp', 'purp_r', 'purples', 'purples_r', 'purpor', 'purpor_r',
21 | 'rainbow', 'rainbow_r', 'rdbu', 'rdbu_r', 'rdpu', 'rdpu_r', 'redor', 'redor_r', 'reds',
22 | 'reds_r', 'solar', 'solar_r', 'speed', 'speed_r', 'spring', 'spring_r', 'summer', 'summer_r',
23 | 'sunset', 'sunset_r', 'sunsetdark', 'sunsetdark_r', 'teal', 'teal_r', 'tealgrn', 'tealgrn_r',
24 | 'tempo', 'tempo_r', 'thermal', 'thermal_r', 'turbid', 'turbid_r', 'viridis', 'viridis_r',
25 | 'winter', 'winter_r', 'wistia', 'wistia_r', 'ylgn', 'ylgn_r', 'ylgnbu', 'ylgnbu_r', 'ylorbr',
26 | 'ylorbr_r', 'ylorrd', 'ylorrd_r'
27 | ]
28 |
29 | diverging_colormaps = [
30 | 'armyrose', 'armyrose_r', 'balance', 'balance_r', 'brbg', 'brbg_r', 'bwr', 'bwr_r', 'coolwarm',
31 | 'coolwarm_r', 'curl', 'curl_r', 'delta', 'delta_r', 'earth', 'earth_r', 'fall', 'fall_r',
32 | 'geyser', 'geyser_r', 'picnic', 'picnic_r', 'piyg', 'piyg_r', 'portland', 'portland_r', 'prgn',
33 | 'prgn_r', 'puor', 'puor_r', 'rdbu', 'rdbu_r', 'rdgy', 'rdgy_r', 'rdylbu', 'rdylbu_r', 'rdylgn',
34 | 'rdylgn_r', 'seismic', 'seismic_r', 'spectral', 'spectral_r', 'tealrose', 'tealrose_r', 'temps',
35 | 'temps_r', 'tropic', 'tropic_r'
36 | ]
37 |
38 | cyclic_colormaps = [
39 | 'edge', 'edge_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'mrybm', 'mrybm_r', 'mygbm',
40 | 'mygbm_r', 'phase', 'phase_r', 'twilight', 'twilight_r', 'twilight_shifted',
41 | 'twilight_shifted_r'
42 | ]
43 |
44 | qualitative_colormaps = [
45 | 'accent', 'accent_r', 'alphabet', 'alphabet_r', 'antique', 'antique_r', 'bold', 'bold_r',
46 | 'd3', 'd3_r', 'dark12', 'dark12_r', 'dark2', 'dark24', 'dark24_r', 'dark2_r', 'g10', 'g10_r',
47 | 'light24', 'light24_r', 'paired', 'paired_r', 'pastel', 'pastel1', 'pastel1_r', 'pastel2',
48 | 'pastel2_r', 'pastel_r', 'plotly', 'plotly_r', 'prism', 'prism_r', 'safe', 'safe_r', 'set1',
49 | 'set1_r', 'set2', 'set2_r', 'set3', 'set3_r', 't10', 't10_r', 'tab10', 'tab10_r', 'tab20',
50 | 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'vivid', 'vivid_r'
51 | ]
52 |
53 | misc_colormaps = [
54 | 'brg', 'brg_r', 'cmrmap', 'cmrmap_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r',
55 | 'gist_earth', 'gist_earth_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r',
56 | 'gist_stern', 'gist_stern_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'jet', 'jet_r',
57 | 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'prism', 'prism_r', 'rainbow',
58 | 'rainbow_r', 'terrain', 'terrain_r'
59 | ]
60 |
61 | all_colormaps = (
62 | sequential_colormaps + diverging_colormaps + cyclic_colormaps +
63 | qualitative_colormaps + misc_colormaps
64 | )
65 |
66 | class ColorMaps:
67 | pass
68 |
69 | sequential = ColorMaps()
70 | diverging = ColorMaps()
71 | cyclic = ColorMaps()
72 | qualitative = ColorMaps()
73 | misc = ColorMaps()
74 | all_cmaps = ColorMaps()
75 |
76 | set_attrs(sequential, sequential_colormaps)
77 | set_attrs(diverging, diverging_colormaps)
78 | set_attrs(cyclic, cyclic_colormaps)
79 | set_attrs(qualitative, qualitative_colormaps)
80 | set_attrs(misc, misc_colormaps)
81 | set_attrs(all_cmaps, all_colormaps)
--------------------------------------------------------------------------------
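
set_attrs turns every colormap name into an attribute on the corresponding ColorMaps instance, so the category objects exported from dexplot.colors can be explored with plain attribute access. A small sketch (the color values themselves come from the private _colormaps module):

    import dexplot as dxp

    # each attribute holds the colors registered under that name in _colormaps
    dxp.colors.qualitative.tab10
    dxp.colors.sequential.viridis_r
    dxp.colors.diverging.coolwarm
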
/dexplot/colors/_app.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from ipywidgets import Dropdown, Image, HBox, HTML, Checkbox, interactive_output, VBox
4 | from IPython.display import display
5 | import matplotlib.pyplot as plt
6 | from matplotlib.colors import ListedColormap
7 | import numpy as np
8 |
9 | from dexplot.colors._colormaps import colormaps
10 | from dexplot.colors._categories import (qualitative_colormaps, sequential_colormaps,
11 | diverging_colormaps, cyclic_colormaps, misc_colormaps,
12 | all_colormaps)
13 |
14 | ARR = np.linspace(0, 1, 256).reshape((1, -1)).repeat(20, 0)
15 |
16 | cmap_dict = {'qualitative': qualitative_colormaps,
17 | 'sequential': sequential_colormaps,
18 | 'diverging': diverging_colormaps,
19 | 'cyclic': cyclic_colormaps,
20 | 'misc': misc_colormaps,
21 | 'all': all_colormaps}
22 |
23 | cmap_default = {'qualitative': 't10',
24 | 'sequential': 'viridis',
25 | 'diverging': 'coolwarm',
26 | 'cyclic': 'edge',
27 | 'misc': 'ocean',
28 | 'all': 'tab10'}
29 |
30 | cmap_dropdown = Dropdown(options=[('Qualitative', 'qualitative'),
31 | ('Sequential', 'sequential'),
32 | ('Diverging', 'diverging'),
33 | ('Cyclic', 'cyclic'),
34 | ('Misc', 'misc'),
35 | ('All Colormaps', 'all')],
36 | value=None,
37 | description='Colormap Category: ',
38 | style = {'description_width': 'initial'})
39 |
40 | def remove_ticks_spines(ax):
41 | ax.set_xticks([])
42 | ax.set_yticks([])
43 | for spine in ax.spines.values():
44 | spine.set_visible(False)
45 |
46 | class ColorViewer:
47 |
48 | def __init__(self):
49 | self.checked_colors = []
50 | self.test_list = []
51 | self.cbox_dict = {cat: self.cmap_checkboxes(cat) for cat in cmap_default}
52 | self.layout = self.create_layout()
53 | self.fig, self.ax = self.create_figure()
54 | self.add_interaction()
55 |
56 | def checkbox_maker(self, name, default):
57 | value = name == default
58 | c = Checkbox(value=value, description=name, disabled=False,
59 | indent=False, style={'color': 'blue'})
60 | c.observe(self.cb_handler, 'value')
61 | return c
62 |
63 | def cmap_checkboxes(self, category):
64 | rows = []
65 | row = []
66 | layout = {'justify_content': 'flex-end', 'margin': '0px'}
67 | cmaps = cmap_dict[category]
68 | default = cmap_default[category]
69 | for name in cmaps:
70 | row.append(self.checkbox_maker(name, default))
71 | if len(row) == 10:
72 | rows.append(HBox(row, layout=layout))
73 | row = []
74 | if row:
75 | rows.append(HBox(row, layout=layout))
76 | return rows
77 |
78 | def create_image(self):
79 | for image in self.ax.images:
80 | image.remove()
81 |
82 | ticks = []
83 | ticklabels = []
84 | i = 0
85 |
86 | for i, name in enumerate(self.checked_colors):
87 | cmap = ListedColormap(colormaps[name])
88 | self.ax.imshow(ARR, cmap=cmap, extent=[0, 10, i + .2, i + .8], aspect='auto')
89 | ticks.append(i + .5)
90 | ticklabels.append(name)
91 |
92 | self.ax.set_ylim(0, i + 1)
93 | self.ax.set_yticks(ticks)
94 | self.ax.set_yticklabels(ticklabels)
95 | img_bytes = io.BytesIO()
96 | self.fig.canvas.print_figure(img_bytes)
97 | img_bytes.seek(0)
98 |
99 | self.img.layout.visibility = 'visible'
100 | self.img.value = img_bytes.read()
101 |
102 | def get_checkboxes(self, category):
103 | if category is None:
104 | return
105 | self.checked_colors.clear()
106 | self.checked_colors.append(cmap_default[category])
107 | self.layout.children = list(self.layout.children[:2]) + self.cbox_dict[category]
108 | self.test_list.append('end of get_checkboxes')
109 | self.create_image()
110 |
111 | def cb_handler(self, change):
112 | name = change['owner'].description
113 | if change['new']:
114 | self.checked_colors.append(name)
115 | else:
116 | self.checked_colors.remove(name)
117 | self.create_image()
118 |
119 | def create_layout(self):
120 |         title = HTML('Color Viewer')
121 | self.img = Image(width=700, height=600)
122 | self.img.layout.visibility = 'hidden'
123 |
124 | rows = []
125 | row1 = HBox([title], layout={'justify_content': 'flex-start'})
126 | row2 = HBox([cmap_dropdown, self.img], layout={'align_items': 'center'})
127 | rows = [row1, row2]
128 |
129 | return VBox(rows)
130 |
131 | def create_figure(self):
132 | fig = plt.Figure(dpi=144, tight_layout=True, figsize=(6, 3))
133 | ax = fig.add_subplot()
134 | remove_ticks_spines(ax)
135 | return fig, ax
136 |
137 | def add_interaction(self):
138 | interactive_output(self.get_checkboxes, {'category': cmap_dropdown})
139 | cmap_dropdown.value = 'qualitative'
140 |
141 |
142 | def run(self):
143 | display(self.layout)
144 |
145 |
146 | def color_viewer():
147 | ColorViewer().run()
--------------------------------------------------------------------------------
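
color_viewer is only re-exported from dexplot.colors when ipywidgets and IPython are installed (see colors/__init__.py), and the widgets render inside a Jupyter notebook. A minimal launch sketch under those assumptions:

    import dexplot as dxp

    # builds a ColorViewer and displays the dropdown/checkbox colormap browser
    dxp.colors.color_viewer()
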
/tests/test_line.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 | class TestAgg:
9 |
10 | def test_string_name(self):
11 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
12 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='mean')
13 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='min')
14 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='max')
15 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='size')
16 |
17 | def test_function(self):
18 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.median)
19 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean)
20 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.min)
21 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.max)
22 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.size)
23 |
24 |
25 | class TestSort:
26 |
27 | def test_lex_asc(self):
28 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
30 | correct = sorted(ticklabels)
31 | assert ticklabels == correct
32 |
33 | def test_lex_desc(self):
34 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
36 | correct = sorted(ticklabels, reverse=True)
37 | assert ticklabels == correct
38 |
39 | def test_asc_values(self):
40 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
43 | values = [p.get_height() for p in fig.axes[0].patches]
44 |
45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
46 | correct_labels = s.index.tolist()
47 | correct_values = s.values.tolist()
48 | assert ticklabels == correct_labels
49 |
50 | def test_desc_values(self):
51 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
54 | values = [p.get_height() for p in fig.axes[0].patches]
55 |
56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
57 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
58 | s = df.set_index('neighborhood').squeeze()
59 | correct_labels = s.index.tolist()
60 | correct_values = s.values.tolist()
61 | assert ticklabels == correct_labels
62 |
63 |
64 | class TestOrder:
65 |
66 | def test_x_order(self):
67 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
69 |
70 | with pytest.raises(ValueError):
71 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
73 |
74 |
75 | class TestHorizontal:
76 |
77 | def test_horiz(self):
78 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
79 |
80 |
81 | class TestSplit:
82 |
83 | def test_split(self):
84 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
85 |
86 | def test_split_order(self):
87 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
88 | split='superhost', split_order=['Yes', 'No'])
89 |
90 | def test_stacked(self):
91 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
92 | split='superhost', split_order=['Yes', 'No'])
93 |
94 |
95 | class TestRowCol:
96 |
97 | def test_col(self):
98 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
99 | split='superhost', col='property_type')
100 |
101 | def test_col_wrap(self):
102 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
103 | split='superhost', col='property_type', wrap=2)
104 |
105 | def test_col_order(self):
106 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
107 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
108 |
109 | def test_row(self):
110 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
111 | split='superhost', row='property_type')
112 |
113 | def test_row_order(self):
114 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
115 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
116 |
117 | def test_row_wrap(self):
118 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
119 | split='superhost', row='property_type', wrap=2)
120 |
121 | def test_row_col(self):
122 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
123 | split='superhost', col='property_type',
124 | col_order=['House', 'Condominium', 'Apartment'],
125 | row='bedrooms', row_order=[0, 1, 2, 3])
126 |
127 | def test_sharey(self):
128 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
131 |
--------------------------------------------------------------------------------
/tests/test_bar.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 | aggfunc = ['median', 'mean', 'min', 'max', 'size', np.median, np.mean, np.min, np.max, np.size]
8 |
9 |
10 | class TestAgg:
11 |
12 | @pytest.mark.parametrize('aggfunc', aggfunc)
13 | def test_string_name(self, aggfunc):
14 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc=aggfunc)
15 |
16 |
17 | class TestSort:
18 |
19 | def test_lex_asc(self):
20 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
21 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
22 | correct = sorted(ticklabels)
23 | assert ticklabels == correct
24 |
25 | def test_lex_desc(self):
26 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
27 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
28 | correct = sorted(ticklabels, reverse=True)
29 | assert ticklabels == correct
30 |
31 | def test_asc_values(self):
32 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
33 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
34 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
35 | values = [p.get_height() for p in fig.axes[0].patches]
36 |
37 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
38 | correct_labels = s.index.tolist()
39 | correct_values = s.values.tolist()
40 | assert ticklabels == correct_labels
41 | assert values == correct_values
42 |
43 | def test_desc_values(self):
44 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
45 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
46 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
47 | values = [p.get_height() for p in fig.axes[0].patches]
48 |
49 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
50 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
51 | s = df.set_index('neighborhood').squeeze()
52 | correct_labels = s.index.tolist()
53 | correct_values = s.values.tolist()
54 | assert ticklabels == correct_labels
55 | assert values == correct_values
56 |
57 |
58 | class TestOrder:
59 |
60 | def test_x_order(self):
61 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
62 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
63 |
64 | with pytest.raises(ValueError):
65 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
66 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
67 |
68 | class TestHorizontal:
69 |
70 | def test_horiz(self):
71 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
72 |
73 |
74 | class TestSplit:
75 |
76 | def test_split(self):
77 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
78 |
79 | def test_split_order(self):
80 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
81 | split='superhost', split_order=['Yes', 'No'])
82 |
83 | def test_stacked(self):
84 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
85 | split='superhost', split_order=['Yes', 'No'], stacked=True)
86 |
87 | def test_errors(self):
88 | with pytest.raises(ValueError):
89 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
90 | split='property_type', split_order=['Yes', 'No'])
91 |
92 |
93 | class TestRowCol:
94 |
95 | def test_col(self):
96 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
97 | split='superhost', col='property_type')
98 |
99 | def test_col_wrap(self):
100 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
101 | split='superhost', col='property_type', wrap=2)
102 |
103 | def test_col_order(self):
104 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
105 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
106 |
107 | def test_row(self):
108 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
109 | split='superhost', row='property_type')
110 |
111 | def test_row_order(self):
112 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
113 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
114 |
115 | def test_row_wrap(self):
116 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
117 | split='superhost', row='property_type', wrap=2)
118 |
119 | def test_row_col(self):
120 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
121 | split='superhost', col='property_type',
122 | col_order=['House', 'Condominium', 'Apartment'],
123 | row='bedrooms', row_order=[0, 1, 2, 3])
124 |
125 | def test_sharey(self):
126 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
127 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
128 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
129 |
130 | class TestBarProps:
131 |
132 | def test_bar_size(self):
133 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='property_type',
134 | split_order=['Apartment', 'House'],
135 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
136 |
--------------------------------------------------------------------------------
/tests/test_scatter.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 | class TestAgg:
9 |
10 | def test_string_name(self):
11 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
12 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='mean')
13 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='min')
14 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='max')
15 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='size')
16 |
17 | def test_function(self):
18 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.median)
19 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean)
20 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.min)
21 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.max)
22 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.size)
23 |
24 |
25 | class TestSort:
26 |
27 | def test_lex_asc(self):
28 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
30 | correct = sorted(ticklabels)
31 | assert ticklabels == correct
32 |
33 | def test_lex_desc(self):
34 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='lex_desc')
35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
36 | correct = sorted(ticklabels, reverse=True)
37 | assert ticklabels == correct
38 |
39 | def test_asc_values(self):
40 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
43 | values = [p.get_height() for p in fig.axes[0].patches]
44 |
45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
46 | correct_labels = s.index.tolist()
47 | correct_values = s.values.tolist()
48 | assert ticklabels == correct_labels
49 |
50 | def test_desc_values(self):
51 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
54 | values = [p.get_height() for p in fig.axes[0].patches]
55 |
56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
57 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
58 | s = df.set_index('neighborhood').squeeze()
59 | correct_labels = s.index.tolist()
60 | correct_values = s.values.tolist()
61 | assert ticklabels == correct_labels
62 |
63 |
64 | class TestOrder:
65 |
66 | def test_x_order(self):
67 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
69 |
70 | with pytest.raises(ValueError):
71 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
73 |
74 |
75 | class TestHorizontal:
76 |
77 | def test_horiz(self):
78 | dxp.scatter(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
79 |
80 |
81 | class TestSplit:
82 |
83 | def test_split(self):
84 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
85 |
86 | def test_split_order(self):
87 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
88 | split='superhost', split_order=['Yes', 'No'])
89 |
90 | def test_stacked(self):
91 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
92 | split='superhost', split_order=['Yes', 'No'])
93 |
94 |
95 | class TestRowCol:
96 |
97 | def test_col(self):
98 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
99 | split='superhost', col='property_type')
100 |
101 | def test_col_wrap(self):
102 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
103 | split='superhost', col='property_type', wrap=2)
104 |
105 | def test_col_order(self):
106 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
107 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
108 |
109 | def test_row(self):
110 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
111 | split='superhost', row='property_type')
112 |
113 | def test_row_order(self):
114 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
115 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
116 |
117 | def test_row_wrap(self):
118 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
119 | split='superhost', row='property_type', wrap=2)
120 |
121 | def test_row_col(self):
122 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
123 | split='superhost', col='property_type',
124 | col_order=['House', 'Condominium', 'Apartment'],
125 | row='bedrooms', row_order=[0, 1, 2, 3])
126 |
127 | def test_sharey(self):
128 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
131 |
--------------------------------------------------------------------------------
/dexplot/_heat.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
6 | def heatmap(x=None, y=None, agg=None, aggfunc=None, data=None, normalize=None, corr=False,
7 | annot=False, fmt='.2f', ax=None, figsize=None, title=None, cmap=None,
8 | cbarlabel="", cbar_kw={}, **kwargs):
9 | """
10 | Create a heatmap from a Pandas DataFrame. This function works with either
11 | tidy data or aggregated data.
12 |
13 |     If using tidy data, pass categorical/string variables to `x` and `y`
14 |     and a numeric variable to `agg`. Pass an aggregation function
15 |     as a string to `aggfunc`. You may also choose to leave `agg` as None,
16 |     which results in a raw frequency count of the co-occurrence of the `x`
17 |     and `y` variables. Set `normalize` to get relative percentages.
18 | 
19 |     If using aggregated data, only use the `data` parameter. The index and
20 |     columns will label the x and y axes. The values of the DataFrame
21 |     will be used for the heatmap.
22 |
23 | Parameters
24 | ----------
25 | x: str
26 |         Column name whose unique values will be used to form groups. Can
27 | only be used with tidy data and should be a categorical/string.
28 |
29 | y: str
30 |         Column name whose unique values will be used to form groups. Can
31 | only be used with tidy data and should be a categorical/string.
32 |
33 | agg: str
34 |         Column name whose values will be aggregated across the groups
35 | formed by `x` and `y`.
36 |
37 | aggfunc: str or function
38 | Used to aggregate `agg` variable. Use any of the strings that Pandas
39 | can understand. You can also use a custom function as long as it
40 | aggregates, i.e. returns a single value.
41 |
42 | data: DataFrame
43 | A Pandas DataFrame containing either tidy or aggregated data
44 |
45 | normalize: str
46 |         Must be one of three strings: "all", the column name provided to `x`,
47 |         or the column name provided to `y`.
48 |
49 | corr: bool - Default False
50 |         When set to True, will calculate the correlation of the co-occurrence
51 | between each of the unique values in `x` and `y`. Only works with
52 | tidy data.
53 |
54 | annot: bool - Default False
55 | Controls whether the aggregated values will be plotted as
56 | text in the heatmap.
57 |
58 | fmt: str
59 | Formatting style for annotations
60 |
61 | ax: Matplotlib Axes
62 | The Matplotlib Axes object to use for plotting. If not given, then
63 | create a new Figure and Axes
64 |
65 | figsize: tuple
66 | A two item tuple of ints used to control the figure size
67 |
68 | title: str
69 | Sets the title of the figure
70 |
71 | cmap: str
72 | Matplotlib colormap name
73 |
74 | cbarlabel: str
75 | Labels the colorbar
76 |
77 | cbar_kw: dict
78 | Keyword arguments passed to the `colorbar` Figure function
79 |
80 | kwargs: dict
81 | Keyword arguments passed to the `imshow` Axes function
82 |
83 | Returns
84 | -------
85 | A one-item tuple containing a Matplotlib Figure
86 |
87 | References
88 | ----------
89 | Code was inspired from Matplotlib page
90 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
91 | """
92 |
93 | if figsize is None:
94 | figsize = (10, 8)
95 |
96 | if not isinstance(data, pd.DataFrame):
97 | raise TypeError('`data` must be a DataFrame')
98 |
99 | if ax is None:
100 | fig, ax = plt.subplots(figsize=figsize)
101 | if title:
102 | fig.suptitle(title)
103 | else:
104 | fig = ax.figure
105 |
106 | if aggfunc:
107 | if not agg:
108 | raise ValueError('If you are setting `aggfunc`, you need to set `agg` as well.')
109 |
110 | if not normalize:
111 | normalize = False
112 |
113 | if cmap is None:
114 | cmap = 'RdYlBu_r'
115 |
116 | if x or y:
117 | if not (x and y):
118 |             raise ValueError('If you supply one of x or y, you must supply both of them')
119 |
120 | if normalize not in (False, 'all', x, y):
121 | raise ValueError('If you are setting `normalize`, it must be either '
122 | f'"all", "{x}" or "{y}"')
123 | elif normalize == x:
124 | normalize = 'columns'
125 | elif normalize == y:
126 | normalize = 'index'
127 |
128 | if agg:
129 | data_values = data[agg]
130 | if not aggfunc:
131 | aggfunc = 'mean'
132 | else:
133 | data_values = None
134 |
135 | agg_data = pd.crosstab(index=data[y], columns=data[x], values=data_values, aggfunc=aggfunc,
136 | normalize=normalize)
137 | else:
138 | agg_data = data
139 |
140 | if corr:
141 | agg_data = agg_data.corr()
142 |
143 | agg_values = agg_data.values
144 | col_labels = agg_data.columns.tolist()
145 | row_labels = agg_data.index.tolist()
146 |
147 | # Plot the heatmap
148 | im = ax.imshow(agg_values, cmap=cmap, **kwargs)
149 |
150 | # Create colorbar
151 | cbar = fig.colorbar(im, ax=ax, **cbar_kw)
152 | cbar.ax.set_ylabel(cbarlabel, rotation=-90, va='bottom')
153 |
154 | x_range, y_range = np.arange(agg_data.shape[1]), np.arange(agg_data.shape[0])
155 | ax.set_xticks(x_range)
156 | ax.set_yticks(y_range)
157 |
158 | ax.set_xticklabels(col_labels)
159 | ax.set_yticklabels(row_labels)
160 |
161 | # Let the horizontal axes labeling appear on top.
162 | ax.tick_params(top=True, bottom=False,
163 | labeltop=True, labelbottom=False)
164 |
165 | # Rotate the tick labels and set their alignment.
166 | plt.setp(ax.get_xticklabels(), rotation=-30, ha='right', rotation_mode='anchor')
167 |
168 | # Turn spines off and create white grid.
169 | for edge, spine in ax.spines.items():
170 | spine.set_visible(False)
171 |
172 | ax.set_xticks(x_range - .5, minor=True)
173 | ax.set_yticks(y_range - .5, minor=True)
174 | ax.grid(which='minor', color='w', linestyle='-', linewidth=3)
175 | ax.tick_params(which='minor', bottom=False, left=False)
176 |
177 | if annot:
178 | annotate_heatmap(im, agg_values, fmt='{0:' + fmt + '}')
179 |
180 | return fig,
181 |
182 |
183 | def annotate_heatmap(im, values, fmt="{0:.2f}", **textkw):
184 | """
185 | Annotates the heatmap
186 |
187 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
188 | """
189 |
190 | kw = dict(horizontalalignment="center",
191 | verticalalignment="center")
192 | kw.update(textkw)
193 | n_rows, n_cols = values.shape
194 |
195 | for i in range(n_rows):
196 | for j in range(n_cols):
197 | val = values[i, j]
198 | if not np.isnan(val):
199 | im.axes.text(j, i, fmt.format(val), **kw)
--------------------------------------------------------------------------------
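
A usage sketch of heatmap with tidy data, following the docstring above; heatmap is not re-exported from the top-level package, so it is imported from the private _heat module, and the column names come from the airbnb dataset used in the tests:

    import dexplot as dxp
    from dexplot._heat import heatmap

    airbnb = dxp.load_dataset('airbnb')

    # median price for every neighborhood / property_type combination,
    # annotated in each cell; the function returns a one-item tuple (fig,)
    fig, = heatmap(x='property_type', y='neighborhood', agg='price',
                   aggfunc='median', data=airbnb, annot=True)
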
/dexplot/_plotly.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import textwrap
3 |
4 | import numpy as np
5 | import plotly.graph_objects as go
6 |
7 | from ._common_plot import PlotlyCommon, PlotlyCount
8 |
9 |
10 | def wrap_labels(labels, wrap):
11 |     return [textwrap.fill(label, wrap).replace('\n', '<br>') for label in labels]
12 |
13 |
14 | def line_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
15 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
16 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
17 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
18 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
19 |
20 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
21 | x_order, y_order, split_order, row_order, col_order,
22 | orientation, sort_values, wrap, figsize, title, sharex,
23 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
24 | x_textwrap, y_textwrap, x_rot, y_rot)
25 |
26 | showlegend = True
27 | for (row, col), info in self.final_data.items():
28 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
29 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col,
30 | marker_color=self.colors[i % len(self.colors)],
31 | showlegend=showlegend)
32 | showlegend = False
33 | return self.fig
34 |
35 |
36 | def scatter_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
37 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
38 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
39 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
40 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
41 |
42 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
43 | x_order, y_order, split_order, row_order, col_order,
44 | orientation, sort_values, wrap, figsize, title, sharex,
45 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
46 | x_textwrap, y_textwrap, x_rot, y_rot)
47 |
48 | showlegend = True
49 | for (row, col), info in self.final_data.items():
50 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
51 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col,
52 | marker_color=self.colors[i % len(self.colors)],
53 | showlegend=showlegend, mode='markers')
54 | showlegend = False
55 | return self.fig
56 |
57 |
58 | def bar_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
59 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
60 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None,
61 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
62 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
63 | y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
64 | groupgap=0, bar_kwargs=None):
65 |
66 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
67 | x_order, y_order, split_order, row_order, col_order,
68 | orientation, sort_values, wrap, figsize, title, sharex,
69 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
70 | x_textwrap, y_textwrap, x_rot, y_rot)
71 |
72 | showlegend = self.split is not None
73 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap)
74 | for (row, col), info in self.final_data.items():
75 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
76 | if len(x) > 200:
77 | warnings.warn('You are plotting more than 200 bars. '
78 | 'Did you forget to provide an `aggfunc`?')
79 |
80 | self.fig.add_bar(x=x, y=y, orientation=self.orientation,
81 | name=label, row=row, col=col,
82 | marker_color=self.colors[i % len(self.colors)],
83 | showlegend=showlegend)
84 | showlegend = False
85 |
86 | return self.fig
87 |
88 |
89 | def count_plotly(val, data=None, normalize=False, split=None, row=None, col=None,
90 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
91 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None,
92 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None,
93 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
94 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0,
95 | bar_kwargs=None):
96 |
97 | x, y = (val, None) if orientation == 'v' else (None, val)
98 | aggfunc = '__distribution__'
99 | self = PlotlyCount(x, y, data, aggfunc, split, row, col,
100 | x_order, y_order, split_order, row_order, col_order,
101 | orientation, None, wrap, figsize, title, sharex,
102 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
103 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count')
104 |
105 | count_dict = self.get_count_dict(normalize)
106 | showlegend = self.split is not None
107 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap)
108 | for (row, col), df in count_dict.items():
109 | if sort_values == 'asc' and not (self.split or self.row or self.col):
110 | df = df.iloc[::-1]
111 |
112 | labels = df.index.values
113 | for i, column in enumerate(df.columns):
114 | values = df[column].values
115 | x, y = (labels, values) if self.orientation == 'v' else (values, labels)
116 | self.fig.add_bar(x=x, y=y, orientation=self.orientation, name=column,
117 | row=row, col=col, marker_color=self.colors[i % len(self.colors)],
118 | showlegend=showlegend)
119 | showlegend = False
120 | return self.fig
121 |
122 |
123 | def box_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
124 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
125 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
126 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
127 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group',
128 | gap=.2, groupgap=0, box_kwargs=None):
129 |
130 | aggfunc = None
131 | sort_values = None
132 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
133 | x_order, y_order, split_order, row_order, col_order,
134 | orientation, sort_values, wrap, figsize, title, sharex,
135 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
136 | x_textwrap, y_textwrap, x_rot, y_rot)
137 |
138 | showlegend = self.split is not None
139 | self.fig.update_layout(boxmode=mode, boxgap=gap, boxgroupgap=groupgap)
140 | for (row, col), info in self.final_data.items():
141 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
142 | self.fig.add_box(x=x, y=y, orientation=self.orientation,
143 | name=label, row=row, col=col,
144 | marker_color=self.colors[i % len(self.colors)],
145 | showlegend=showlegend)
146 | showlegend = False
147 |
148 | return self.fig
149 |
150 | def violin_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
151 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
152 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
153 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
154 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group',
155 | gap=.2, groupgap=0, box_kwargs=None):
156 |
157 | aggfunc = None
158 | sort_values = None
159 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
160 | x_order, y_order, split_order, row_order, col_order,
161 | orientation, sort_values, wrap, figsize, title, sharex,
162 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
163 | x_textwrap, y_textwrap, x_rot, y_rot)
164 |
165 | showlegend = self.split is not None
166 | self.fig.update_layout(violinmode=mode, violingap=gap, violingroupgap=groupgap)
167 | for (row, col), info in self.final_data.items():
168 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
169 | self.fig.add_violin(x=x, y=y, orientation=self.orientation,
170 | name=label, row=row, col=col,
171 | marker_color=self.colors[i % len(self.colors)],
172 | showlegend=showlegend)
173 | showlegend = False
174 |
175 | return self.fig
176 |
177 | def kde_plotly(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None,
178 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None,
179 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
180 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
181 | y_textwrap=None, x_rot=None, y_rot=None, range=None, cumulative=False):
182 |
183 |     from ._utils import calculate_density_1d, calculate_density_2d
184 | 
185 |     if x is not None and y is not None and split is not None:
186 |         raise ValueError('Cannot use `split` for 2-dimensional KDE plots')
187 | 
188 |     x_order = y_order = None
189 |     aggfunc = '__distribution__' if y is None else None
190 |     sort_values = None
191 |     self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
192 |                         x_order, y_order, split_order, row_order, col_order,
193 |                         orientation, sort_values, wrap, figsize, title, sharex,
194 |                         sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
195 |                         x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True)
196 | 
197 |     showlegend = self.split is not None
198 |     for (row, col), info in self.final_data.items():
199 |         for i, vals in enumerate(info):
200 |             if aggfunc == '__distribution__':
201 |                 x, split_label = vals[:2]
202 |                 x, y = calculate_density_1d(x, cumulative=cumulative)
203 |                 x, y = (x, y) if self.orientation == 'v' else (y, x)
204 |                 self.fig.add_scatter(x=x, y=y, name=split_label, row=row, col=col,
205 |                                      marker_color=self.colors[i % len(self.colors)],
206 |                                      showlegend=showlegend)
207 |             else:
208 |                 # 2-d KDE: plotly has no axes-level imshow, so draw the density as a heatmap trace
209 |                 x, y, split_label = vals[:3]
210 |                 xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y)
211 |                 self.fig.add_heatmap(z=Z, x=np.linspace(xmin, xmax, Z.shape[1]),
212 |                                      y=np.linspace(ymin, ymax, Z.shape[0]),
213 |                                      row=row, col=col, showscale=False)
214 | 
215 |         showlegend = False
216 | 
217 |     return self.fig
218 | 
--------------------------------------------------------------------------------
/dexplot/_plots.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from collections import defaultdict
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from scipy import stats
7 |
8 | from ._common_plot import MPLCommon, MPLCount
9 |
10 |
11 | def get_bar_kwargs(bar_kwargs):
12 | default_bar_kwargs = {'ec': 'white', 'alpha': .9}
13 | if bar_kwargs is None:
14 | bar_kwargs = default_bar_kwargs
15 | else:
16 | try:
17 | bar_kwargs = {**default_bar_kwargs, **bar_kwargs}
18 |         except TypeError:
19 | raise TypeError('`bar_kwargs` must be a dictionary')
20 | return bar_kwargs
21 |
22 |
23 | def verify_gap_args(mode, gap, groupgap):
24 | if mode not in ('group', 'stack', 'overlay', 'relative'):
25 |         raise ValueError("`mode` must be one of 'group', 'stack', 'overlay', 'relative'")
26 | if gap < 0 or gap >= 1:
27 | raise ValueError('`gap` must be greater than or equal to 0 and less than 1')
28 | if groupgap < 0 or groupgap >= 1:
29 | raise ValueError('`groupgap` must be greater than or equal to 0 and less than 1')
30 |
31 |
32 | def get_jump_size(n, mode, gap, groupgap):
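33 |     # `gap` is the fraction of each x/y slot left empty, so `total` is the width shared by
34 |     # the n traces at that slot. `jump` is the offset between adjacent traces and `size` is
35 |     # each trace's width; for non-'group' modes the traces are drawn on top of each other,
36 |     # so the offset is zero and each trace spans the full shared width (less `groupgap`).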
33 | total = 1 - gap
34 | jump = total / n
35 | size = jump * (1 - groupgap)
36 | if mode != 'group':
37 | jump = 0
38 | size *= n
39 | return jump, size
40 |
41 |
42 | def line(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
43 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
44 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
45 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
46 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
47 |
48 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
49 | x_order, y_order, split_order, row_order, col_order,
50 | orientation, sort_values, wrap, figsize, title, sharex,
51 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
52 | x_textwrap, y_textwrap, x_rot, y_rot)
53 |
54 | marker = 'o' if self.groupby else None
55 |
56 | for ax, info in self.final_data.items():
57 | for x, y, label, col_name, row_label, col_label in info:
58 | x_plot, y_plot = self.get_x_y_plot(x, y)
59 | ax.plot(x_plot, y_plot, label=label, marker=marker)
60 |
61 | if self.groupby:
62 | ticklabels = x if self.orientation == 'v' else y
63 | self.add_ticklabels(ticklabels, ax)
64 |
65 | self.add_legend(label)
66 | if x.dtype == 'O' or y.dtype == 'O':
67 | self.update_fig_size(len(x), 1)
68 | return self.clean_up()
69 |
70 |
71 | def scatter(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
72 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
73 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
74 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
75 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
76 | regression=False):
77 |
78 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
79 | x_order, y_order, split_order, row_order, col_order,
80 | orientation, sort_values, wrap, figsize, title, sharex,
81 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
82 | x_textwrap, y_textwrap, x_rot, y_rot)
83 |
84 | alpha = 1 if self.groupby else .7
85 |
86 | for ax, info in self.final_data.items():
87 | for x, y, label, col_name, row_label, col_label in info:
88 | x_plot, y_plot = self.get_x_y_plot(x, y)
89 | ax.scatter(x_plot, y_plot, label=label, alpha=alpha)
90 | if regression:
91 | slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
92 | x_line = np.array([x.min(), x.max()])
93 | y_line = x_line * slope + intercept
94 | ax.plot(x_line, y_line)
95 | if self.groupby:
96 | ticklabels = x if self.orientation == 'v' else y
97 | self.add_ticklabels(ticklabels, ax)
98 |
99 | self.add_legend(label)
100 | if x.dtype == 'O' or y.dtype == 'O':
101 | self.update_fig_size(len(x), 1)
102 | return self.clean_up()
103 |
104 |
105 | def bar(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
106 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
107 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None,
108 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
109 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
110 | y_textwrap=None, x_rot=None, y_rot=None, mode='group',
111 | gap=.2, groupgap=0, bar_kwargs=None):
112 |
113 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
114 | x_order, y_order, split_order, row_order, col_order,
115 | orientation, sort_values, wrap, figsize, title, sharex,
116 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
117 | x_textwrap, y_textwrap, x_rot, y_rot)
118 |
119 | bar_kwargs = get_bar_kwargs(bar_kwargs)
120 | verify_gap_args(mode, gap, groupgap)
121 | for ax, info in self.final_data.items():
122 | jump, size = get_jump_size(len(info), mode, gap, groupgap)
123 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
124 | x_plot, y_plot = self.get_x_y_plot(x, y)
125 | if i == 0:
126 | base = np.zeros(len(x_plot))
127 | if len(x) > 200:
128 | warnings.warn('You are plotting more than 200 bars. '
129 | 'Did you forget to provide an `aggfunc`?')
130 |
131 | if self.orientation == 'v':
132 | x_plot = x_plot + jump * i
133 | ax.bar(x_plot, y_plot, label=label, width=size,
134 | bottom=base, align='edge', **bar_kwargs)
135 | if mode == 'stack':
136 | base += y_plot
137 | else:
138 | y_plot = y_plot - jump * (i + 1)
139 | ax.barh(y_plot, x_plot, label=label, height=size,
140 | left=base, align='edge', **bar_kwargs)
141 | if mode == 'stack':
142 | base += x_plot
143 | ticklabels = x if self.orientation == 'v' else y
144 | delta = jump * (i + 1) / 2 if mode == 'group' else size / 2
145 | self.add_ticklabels(ticklabels, ax, delta=delta)
146 |
147 | self.add_legend(label)
148 | self.update_fig_size(len(info), len(x))
149 | return self.clean_up()
150 |
151 |
152 | def count(val, data=None, normalize=False, split=None, row=None, col=None,
153 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
154 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None,
155 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None,
156 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
157 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0,
158 | bar_kwargs=None):
159 |
160 | bar_kwargs = get_bar_kwargs(bar_kwargs)
161 | verify_gap_args(mode, gap, groupgap)
162 | x, y = (val, None) if orientation == 'v' else (None, val)
163 | aggfunc = '__distribution__'
164 | self = MPLCount(x, y, data, aggfunc, split, row, col,
165 | x_order, y_order, split_order, row_order, col_order,
166 | orientation, None, wrap, figsize, title, sharex,
167 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
168 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count')
169 |
170 | count_dict = self.get_count_dict(normalize)
171 | for ax, df in count_dict.items():
172 | base = np.zeros(len(df))
173 | position = np.arange(len(df))
174 | if sort_values == 'asc' and not (self.split or self.row or self.col):
175 | df = df.iloc[::-1]
176 |
177 | ticklabels = df.index.values
178 | jump, size = get_jump_size(df.shape[1], mode, gap, groupgap)
179 | for col in df.columns:
180 | values = df[col].values
181 |
182 | if self.orientation == 'v':
183 | ax.bar(position, values, label=col, width=size,
184 | bottom=base, align='edge', **bar_kwargs)
185 | position = position + jump
186 | else:
187 |                 ax.barh(position - size, values, label=col, height=size,
188 | left=base, align='edge', **bar_kwargs)
189 | position = position - jump
190 |
191 | if mode == 'stack':
192 | base += values
193 |
194 | delta = jump * df.shape[1] / 2 if mode == 'group' else size / 2
195 | self.add_ticklabels(ticklabels, ax, delta=delta)
196 | if self.split or len(df.columns) > 1:
197 | self.add_legend(col)
198 | self.update_fig_size(df.shape[1], df.shape[0])
199 | return self.clean_up()
200 |
201 |
202 | def _common_dist(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
203 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
204 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
205 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
206 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
207 | mode='group', gap=.2, groupgap=0, kind=None, **kwargs):
208 |
209 | aggfunc = '__distribution__'
210 | sort_values = None
211 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
212 | x_order, y_order, split_order, row_order, col_order,
213 | orientation, sort_values, wrap, figsize, title, sharex,
214 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
215 | x_textwrap, y_textwrap, x_rot, y_rot)
216 |
217 | key = 'bodies' if kind == 'violinplot' else 'boxes'
218 | vert = self.orientation == 'v'
219 | for ax, info in self.final_data.items():
220 | plot_func = getattr(ax, kind)
221 | cur_data, cur_ticklabels = self.get_distribution_data(info)
222 |
223 | handles = []
224 | split_labels = []
225 | n_splits = len(cur_data)
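226 |         # total width allotted to each group grows with the number of splits but is
227 |         # capped at .9 of the slot, then divided evenly among the splits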
226 | widths = min(.5 + .15 * n_splits, .9) / n_splits
227 | n_boxes = len(info)
228 | n = len(next(iter(cur_data.values()))) # number of groups
229 | markersize = max(6 - n_boxes // 5, 2)
230 | jump, size = get_jump_size(n, mode, gap, groupgap)
231 | for i, (split_label, data) in enumerate(cur_data.items()):
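232 |             # keep only non-empty groups so boxplot/violinplot never receive empty arrays,
233 |             # while `positions` preserves each group's original slot on the axis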
232 | filt = [len(arr) > 0 for arr in data]
233 | positions = np.array([i for (i, f) in enumerate(filt) if f])
234 | data = [np.array(d) for (d, f) in zip(data, filt) if f]
235 | if self.orientation == 'h':
236 | positions = positions - i * widths
237 | else:
238 | positions = positions + i * widths
239 |
240 | if kind == 'boxplot':
241 |                 kwargs['boxprops'] = {'facecolor': self.colors[i % len(self.colors)],
242 | 'edgecolor': 'black'}
243 | kwargs['flierprops'] = {'markersize': markersize}
244 |
245 | ret = plot_func(data, vert=vert, positions=positions, widths=widths, **kwargs)
246 |
247 | if kind == 'violinplot':
248 | for k in ['cmeans', 'cmins', 'cmaxes', 'cbars', 'cmedians', 'cquantiles']:
249 | if k in ret:
250 | ret[k].set_linewidth(1)
251 | for body in ret['bodies']:
252 | body.set_alpha(.8)
253 |
254 | handles.append(ret[key][0])
255 | split_labels.append(split_label)
256 |
257 | delta = (n_splits / 2 - .5) * widths
258 | ticklabels = cur_ticklabels[split_label]
259 | self.add_ticklabels(ticklabels, ax, delta=delta)
260 |
261 | self.add_legend(self.split, handles, split_labels)
262 | self.update_fig_size(n_splits, n)
263 | return self.clean_up()
264 |
265 | # could add groupby to box
266 | def box(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
267 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
268 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
269 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
270 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
271 | groupgap=0, box_kwargs=None):
272 |
273 | kwargs = dict(notch=None, sym=None, whis=None,
274 | patch_artist=True, bootstrap=None, usermedians=None, conf_intervals=None, meanline=None,
275 | showmeans=None, showcaps=None, showbox=None, showfliers=None, boxprops=None, labels=None,
276 | flierprops=None, medianprops=None, meanprops=None, capprops=None, whiskerprops=None,
277 | manage_ticks=True, autorange=False, zorder=None)
278 |
279 | if kwargs['medianprops'] is None:
280 | kwargs['medianprops'] = {'color': '.2'}
281 |
282 | # kwargs = dict(notch=notch, sym=sym, whis=whis, patch_artist=patch_artist,
283 | # bootstrap=bootstrap, usermedians=usermedians, conf_intervals=conf_intervals,
284 | # meanline=meanline, showmeans=showmeans, showcaps=showcaps, showbox=showbox,
285 | # showfliers=showfliers, boxprops=boxprops, labels=labels, flierprops=flierprops,
286 | # medianprops=medianprops, meanprops=meanprops, capprops=capprops,
287 | # whiskerprops=whiskerprops, manage_ticks=manage_ticks,
288 | # autorange=autorange, zorder=zorder)
289 |
290 | return _common_dist(x, y, data, split, row, col, x_order, y_order, split_order,
291 | row_order, col_order, orientation, wrap, figsize, title,
292 | sharex, sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
293 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap,
294 | kind='boxplot', **kwargs)
295 |
296 |
297 | def violin(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
298 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
299 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
300 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
301 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
302 | groupgap=0, violin_kwargs=None):
303 |
304 | kwargs = dict(showmeans=False, showextrema=True, showmedians=True,
305 | quantiles=None, points=100, bw_method=None)
306 |
307 | # kwargs = dict(showmeans=showmeans, showextrema=showextrema, showmedians=showmedians,
308 | # quantiles=quantiles, points=points, bw_method=bw_method)
309 |
310 | return _common_dist(x, y, data, split, row, col,
311 | x_order, y_order, split_order, row_order, col_order,
312 | orientation, wrap, figsize, title, sharex,
313 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
314 | x_textwrap, y_textwrap, x_rot, y_rot, kind='violinplot', **kwargs)
315 |
316 |
317 | def hist(val, data=None, split=None, row=None, col=None, split_order=None, row_order=None,
318 | col_order=None, orientation='v', wrap=None, figsize=None, title=None,
319 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
320 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
321 | mode='group', gap=.2, groupgap=0, hist_kwargs=None):
322 |
323 |     # merge user-supplied `hist_kwargs` with the defaults (20 bins, standard bar histogram)
324 |     default_hist_kwargs = dict(bins=20, range=None, density=False, weights=None,
325 |                                cumulative=False, bottom=None, histtype='bar', align='mid',
326 |                                rwidth=None, log=False)
327 |     if hist_kwargs is None:
328 |         kwargs = default_hist_kwargs
329 |     else:
330 |         try:
331 |             kwargs = {**default_hist_kwargs, **hist_kwargs}
332 |         except TypeError:
333 |             raise TypeError('`hist_kwargs` must be a dictionary')
334 | 
335 |     x_order = y_order = None
336 |     x, y = (val, None) if orientation == 'v' else (None, val)
332 |
333 | aggfunc = '__distribution__'
334 | sort_values = None
335 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
336 | x_order, y_order, split_order, row_order, col_order,
337 | orientation, sort_values, wrap, figsize, title, sharex,
338 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
339 | x_textwrap, y_textwrap, x_rot, y_rot)
340 |
341 | orientation = 'vertical' if self.orientation == 'v' else 'horizontal'
342 | for ax, info in self.final_data.items():
343 | cur_data, cur_ticklabels = self.get_distribution_data(info)
344 |
345 | handles = []
346 | split_labels = []
347 | n_splits = len(cur_data)
348 | n = len(next(iter(cur_data.values()))) # number of groups
349 | for split_label, data in cur_data.items():
350 | filt = [len(arr) > 0 for arr in data]
351 | vals = [d for (d, f) in zip(data, filt) if f]
352 | ret = ax.hist(vals, orientation=orientation, alpha=.8, **kwargs)
353 | handles.append(ret[-1][0])
354 | split_labels.append(split_label)
355 |
356 | self.add_legend(self.split, handles, split_labels)
357 | # self.update_fig_size(n_splits, n)
358 | return self.clean_up()
359 |
360 |
361 | def kde(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None,
362 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None,
363 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
364 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
365 | x_rot=None, y_rot=None, range=None, cumulative=False):
366 |
367 | from ._utils import calculate_density_1d, calculate_density_2d
368 |
369 | x_order = y_order = None
370 | # x, y = (x, None) if orientation == 'v' else (None, x)
371 | kwargs = dict(range=range, cumulative=cumulative)
372 |
373 | if x is not None and y is not None and split is not None:
374 | raise ValueError('Cannot use `split` for 2-dimensional KDE plots')
375 |
376 | aggfunc = '__distribution__' if y is None else None
377 | sort_values = None
378 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
379 | x_order, y_order, split_order, row_order, col_order,
380 | orientation, sort_values, wrap, figsize, title, sharex,
381 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
382 | x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True)
383 |
384 | for ax, info in self.final_data.items():
385 | for vals in info:
386 | if aggfunc == '__distribution__':
387 | x, split_label = vals[:2]
388 | x, y = calculate_density_1d(x, cumulative=cumulative)
389 | x, y = (x, y) if self.orientation == 'v' else (y, x)
390 | ax.plot(x, y, label=split_label)
391 | else:
392 | x, y, split_label = vals[:3]
393 | xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y)
394 | ax.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto')
395 |
396 | self.add_legend(self.split)
397 | # self.update_fig_size(n_splits, n)
398 | return self.clean_up()
399 |
400 | xy_doc = """
401 | x : str, default None
402 | Column name of DataFrame whose values will go along the x-axis
403 |
404 | y : str, default None
405 | Column name of DataFrame whose values will go along the y-axis
406 | """
407 |
408 | val_doc = """
409 | val : str, default None
410 | Column name of DataFrame whose values will be used for distribution
411 | """
412 |
413 | aggfunc_doc = """
414 | aggfunc : str or function, default None
415 | Kind of aggregation to perform. Use a string that the DataFrame `agg`
416 | method understands. If providing a function, it will also be passed to
417 | the `agg` method.
418 |
419 | The strings 'countna' and 'percna' are also available to find the
420 | number and percentage of missing values.
421 | """
422 |
423 | xy_order = """
424 | x_order : str or list, default None
425 | Used as both a way to order and filter the x-values. Use the strings
426 | 'asc'/'desc' to order ascending or descending.
427 |
428 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']`
429 |
430 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will
431 | filter for the most/least frequent groups.
432 |
433 | By default, sorting happens in ascending order.
434 |
435 | y_order : str or list, default None
436 | See x_order
437 |
438 | split_order : str or list, default None
439 | See x_order
440 |
441 | row_order : str or list, default None
442 | See x_order
443 |
444 | col_order : str or list, default None
445 | See x_order
446 | """
447 |
448 | split_order = """
449 | split_order : str or list, default None
450 |     Used as both a way to order and filter the split values. Use the strings
451 | 'asc'/'desc' to order ascending or descending.
452 |
453 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']`
454 |
455 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will
456 | filter for the most/least frequent groups.
457 |
458 | By default, sorting happens in ascending order.
459 |
460 | row_order : str or list, default None
461 | See split_order
462 |
463 | col_order : str or list, default None
464 | See split_order
465 | """
466 |
467 | sort_values_doc = """
468 | sort_values : str - 'asc' or 'desc', default None
469 | Sort the values ascending or descending. If this is given, then
470 | x/y_order is ignored.
471 | """
472 |
473 | doc = \
474 | """
475 | {plot_doc}
476 |
477 | Parameters
478 | ----------
479 | {xy}
480 | data : DataFrame or Series, default None
481 | A pandas DataFrame with long or wide data. If provided a Series, do not
482 | supply x or y.
483 | {aggfunc}
484 | split : str, default None
485 | Column name that will be used in the DataFrame `groupby` method to
486 | split the data into independent groups within a single plot
487 |
488 | row : str
489 | Column name that will be used in the DataFrame `groupby` method to
490 | split the data into independent groups to form new plots. Each unique value
491 | in the `row` column forms a new row of plots.
492 |
493 | col : str
494 | Column name that will be used in the DataFrame `groupby` method to
495 | split the data into independent groups to form new plots. Each unique value
496 |     in the `col` column forms a new column of plots.
497 | {order}
498 | orientation : str 'v' or 'h'
499 | Choose the orientation of the plots. By default, they are vertical
500 | ('v'), except for box and violin plots, which are horizontal.
501 | {sort_values}
502 | wrap : int, default None
503 |     When using either `row` or `col`, but not both, determines the maximum
504 |     number of plots per row/column before a new row/column is started.
505 |
506 | figsize : tuple, default None
507 |     A tuple of numbers passed to the matplotlib `figsize` parameter.
508 | By default, the figure size will be determined based on the kind of
509 | plot produced.
510 |
511 | title : str
512 | Sets the figure title NOT the Axes title
513 |
514 | sharex : bool
515 | Whether all plots should share the x-axis or not. Default is True
516 |
517 | sharey : bool
518 | Whether all plots should share the y-axis or not. Default is True
519 |
520 | xlabel : str
521 | Label used for x-axis on figures with a single plot
522 |
523 | ylabel : str
524 | Label used for y-axis on figures with a single plot
525 |
526 | xlim : 2-item tuple of numbers
527 | Determines x-axis limits for figures with a single plot
528 |
529 | ylim : 2-item tuple of numbers
530 | Determines y-axis limits for figures with a single plot
531 |
532 | xscale : 'linear', 'log', 'symlog', 'logit'
533 | Sets the scale of the x-axis.
534 |
535 | yscale : 'linear', 'log', 'symlog', 'logit'
536 | Sets the scale of the y-axis
537 |
538 | cmap : str or matplotlib colormap instance, default None
539 |     Colormap used to determine the colors of the plotting elements.
540 | 
540 | x_textwrap : int, default 10
541 | Number of characters before wrapping text for x-labels
542 |
543 | y_textwrap : int, default None
544 | Number of characters before wrapping text for y-labels
545 |
546 | x_rot : int or float, default None
547 | Degree of rotation of x-tick labels. If between 0 and 180
548 | horizontal_alignment is set to 'right', otherwise 'left'
549 |
550 | y_rot : int or float, default None
551 | Degree of rotation of y-tick labels. If between 0 and 180
552 | vertical_alignment is set to 'top', otherwise 'bottom'
553 |
554 | mode : str, default 'group'
555 |     One of 'group', 'stack', 'overlay', or 'relative'. Determines how the
556 |     split groups are positioned relative to each other.
557 | 
558 | gap : float, default .2
559 |     Fraction of space between each x/y location left empty. Must be at
560 |     least 0 and less than 1.
561 | 
562 | groupgap : float, default 0
563 |     Fraction of space within a group left empty between its bars. Must be
564 |     at least 0 and less than 1.
565 | 
560 | Returns
561 | -------
562 | A Matplotlib Figure instance
563 | """
564 |
565 |
566 | # line doc
567 | line_doc = """\
568 | Create line plots
569 | """
570 |
571 | scatter_doc = """\
572 | Create scatter plots
573 | """
574 |
575 | bar_doc = """\
576 | Create bar plots
577 | """
578 |
579 | count_doc = """\
580 | Create count plots
581 | """
582 |
583 | box_doc = """\
584 | Create box plots
585 | """
586 |
587 | violin_doc = """\
588 | Create violin plots
589 | """
590 |
591 | hist_doc = """\
592 | Create histograms
593 | """
594 |
595 | kde_doc = """\
596 | Create kernel density estimate plots
597 | """
598 |
599 | line.__doc__ = doc.format(plot_doc=line_doc, xy=xy_doc, aggfunc=aggfunc_doc,
600 | order=xy_order, sort_values=sort_values_doc)
601 |
602 | scatter.__doc__ = doc.format(plot_doc=scatter_doc, xy=xy_doc, aggfunc=aggfunc_doc,
603 | order=xy_order, sort_values=sort_values_doc)
604 |
605 | bar.__doc__ = doc.format(plot_doc=bar_doc, xy=xy_doc, aggfunc=aggfunc_doc,
606 | order=xy_order, sort_values=sort_values_doc)
607 |
608 | count.__doc__ = doc.format(plot_doc=count_doc, xy=val_doc, aggfunc='',
609 | order=split_order, sort_values=sort_values_doc)
610 |
611 | box.__doc__ = doc.format(plot_doc=box_doc, xy=xy_doc, aggfunc='',
612 | order=xy_order, sort_values='')
613 |
614 | violin.__doc__ = doc.format(plot_doc=violin_doc, xy=xy_doc, aggfunc='',
615 | order=xy_order, sort_values='')
616 |
617 | hist.__doc__ = doc.format(plot_doc=hist_doc, xy=val_doc, aggfunc='',
618 | order=split_order, sort_values='')
619 |
620 | kde.__doc__ = doc.format(plot_doc=kde_doc, xy=val_doc, aggfunc='',
621 | order=split_order, sort_values='')
622 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Dexplot
2 |
3 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience.
4 |
5 | ## Goals
6 |
7 | The primary goals for dexplot are:
8 |
9 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots
10 | * Allow the user tremendous power without using matplotlib
11 |
12 |
13 | ## Installation
14 |
15 | `pip install dexplot`
16 |
17 | ## Built for long and wide data
18 |
19 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data.
20 |
21 | 
22 |
23 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price).
24 |
25 | 
26 |
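27 | That aggregated table can be produced from the long data with a pandas pivot (using the airbnb dataset loaded in the examples below; the same call appears again in the Wide data section):
28 | 
29 | ```python
30 | airbnb.pivot_table(index='neighborhood', columns='property_type',
31 |                    values='price', aggfunc='mean')
32 | ```
33 | 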
27 | ## Usage
28 |
29 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature:
30 |
31 | ```python
32 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...)
33 | ```
34 |
35 | * `x` - Column name along the x-axis
36 | * `y` - Column name along the y-axis
37 | * `data` - Pandas DataFrame
38 | * `aggfunc` - String of pandas aggregation function, 'min', 'max', 'mean', etc...
39 | * `split` - Column name to split data into distinct groups
40 | * `row` - Column name to split data into distinct subplots row-wise
41 | * `col` - Column name to split data into distinct subplots column-wise
42 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical.
43 |
44 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below.
45 |
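46 | For instance, the same median-price aggregation can be drawn vertically or horizontally by swapping `x` and `y` (both calls appear again in the examples below):
47 | 
48 | ```python
49 | # vertical: group by neighborhood (x), aggregate price (y)
50 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
51 | 
52 | # horizontal: group by neighborhood (y), aggregate price (x)
53 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
54 | ```
55 | 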
46 | ## Families of plots
47 |
48 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner.
49 |
50 | * Aggregation
51 | * bar
52 | * line
53 | * scatter
54 | * count
55 | * Distribution
56 | * box
57 | * violin
58 | * hist
59 | * kde
60 |
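61 | As a concrete contrast (both calls are taken from the examples below), an aggregation plot reduces each group to a single number, while a distribution plot keeps all of the values and shows their spread:
62 | 
63 | ```python
64 | # aggregation: one bar per neighborhood, each showing the median price of that group
65 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
66 | 
67 | # distribution: one box per neighborhood, drawn from all prices in that group
68 | dxp.box(x='price', y='neighborhood', data=airbnb)
69 | ```
70 | 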
61 | ## Comparison with Seaborn
62 |
63 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by seaborn. Below is a list of the extra features in dexplot not found in seaborn:
64 |
65 | * Ability to graph relative frequency and normalize over any number of variables
66 | * No need for multiple functions to do the same thing (far fewer public functions)
67 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot`
68 | * Pandas `groupby` methods available as strings
69 | * Ability to sort by values
70 | * Ability to sort x/y labels lexicographically
71 | * Ability to select most/least frequent groups
72 | * x/y labels are wrapped so that they don't overlap
73 | * Figure size (plus several other options) are available to change without using matplotlib
74 | * A matplotlib figure object is returned
75 |
76 | ## Examples
77 |
78 | Most of the examples below use long data.
79 |
80 | ## Aggregating plots - bar, line and scatter
81 |
82 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area.
83 |
84 |
85 | ```python
86 | import dexplot as dxp
87 | import pandas as pd
88 | airbnb = dxp.load_dataset('airbnb')
89 | airbnb.head()
90 | ```
91 |
92 |
93 | |   | neighborhood | property_type | accommodates | bathrooms | bedrooms | price | cleaning_fee | rating | superhost | response_time | latitude | longitude |
94 | |---|---|---|---|---|---|---|---|---|---|---|---|---|
95 | | 0 | Shaw | Townhouse | 16 | 3.5 | 4 | 433 | 250 | 95.0 | No | within an hour | 38.90982 | -77.02016 |
96 | | 1 | Brightwood Park | Townhouse | 4 | 3.5 | 4 | 154 | 50 | 97.0 | No | NaN | 38.95888 | -77.02554 |
97 | | 2 | Capitol Hill | House | 2 | 1.5 | 1 | 83 | 35 | 97.0 | Yes | within an hour | 38.88791 | -76.99668 |
98 | | 3 | Shaw | House | 2 | 2.5 | 1 | 475 | 0 | 98.0 | No | NaN | 38.91331 | -77.02436 |
99 | | 4 | Kalorama Heights | Apartment | 3 | 1.0 | 1 | 118 | 15 | 91.0 | No | within an hour | 38.91933 | -77.04124 |
100 | 
196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data.
197 |
198 |
199 | ```python
200 | airbnb.shape
201 | ```
202 |
203 |
204 |
205 |
206 | (4581, 12)
207 |
208 |
209 |
210 | ### Vertical bar charts
211 |
212 | In order to perform an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the column names automatically wrap.
213 |
214 |
215 | ```python
216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
217 | ```
218 |
219 |
220 |
221 |
222 | 
223 |
224 |
225 |
226 | Line and scatter plots can be created with the same command, just by substituting the name of the function. Neither is a good choice for this visualization, since the grouping variable (neighborhood) has no meaningful order.
227 |
228 |
229 | ```python
230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
231 | ```
232 |
233 |
234 |
235 |
236 | 
237 |
238 |
239 |
240 |
241 | ```python
242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
243 | ```
244 |
245 |
246 |
247 |
248 | 
249 |
250 |
251 |
252 | ### Components of the groupby aggregation
253 |
254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components:
255 |
256 | * Grouping column - unique values of this column form independent groups (neighborhood)
257 | * Aggregating column - the column that will get summarized with a single value (price)
258 | * Aggregating function - a function that returns a single value (median)
259 |
260 | The general format for doing this in pandas is:
261 |
262 | ```python
263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'})
264 | ```
265 |
266 | Specifically, the following code is executed within dexplot.
267 |
268 |
269 | ```python
270 | airbnb.groupby('neighborhood').agg({'price': 'median'})
271 | ```
272 |
273 | | neighborhood | price |
274 | |---|---|
275 | | Brightwood Park | 87.0 |
276 | | Capitol Hill | 129.5 |
277 | | Columbia Heights | 95.0 |
278 | | Dupont Circle | 125.0 |
279 | | Edgewood | 100.0 |
280 | | Kalorama Heights | 118.0 |
281 | | Shaw | 133.5 |
282 | | Union Station | 120.0 |
283 | 
326 |
327 | ### Number and percent of missing values with `'countna'` and `'percna'`
328 |
329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group.
330 |
331 |
332 | ```python
333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna')
334 | ```
335 |
336 |
337 |
338 |
339 | 
340 |
341 |
342 |
343 | ### Sorting the bars by values
344 |
345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value.
346 |
347 | * None - sort x/y axis labels alphabetically (default)
348 | * `asc` - sort values from least to greatest
349 | * `desc` - sort values from greatest to least
350 |
351 |
352 | ```python
353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
354 | ```
355 |
356 |
357 |
358 |
359 | 
360 |
361 |
362 |
363 | Here, we sort the values from greatest to least.
364 |
365 |
366 | ```python
367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
368 | ```
369 |
370 |
371 |
372 |
373 | 
374 |
375 |
376 |
377 | ### Specify order with `x_order`
378 |
379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars.
380 |
381 |
382 | ```python
383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
385 | ```
386 |
387 |
388 |
389 |
390 | 
391 |
392 |
393 |
394 | By default, `x_order` and all of the `_order` parameters are set to `'asc'`, which orders the labels alphabetically. Use the string `'desc'` to sort in the opposite direction.
395 |
396 |
397 | ```python
398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
399 | ```
400 |
401 |
402 |
403 |
404 | 
405 |
406 |
407 |
408 | ### Filter for the neighborhoods with most/least frequency of occurrence
409 |
410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column.
411 |
412 |
413 | ```python
414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
415 | x_order='top 4')
416 | ```
417 |
418 |
419 |
420 |
421 | 
422 |
423 |
424 |
425 | We can verify that these four neighborhoods are the most common.
426 |
427 |
428 | ```python
429 | airbnb['neighborhood'].value_counts()
430 | ```
431 |
432 |
433 |
434 |
435 | Columbia Heights 773
436 | Union Station 713
437 | Capitol Hill 654
438 | Edgewood 610
439 | Dupont Circle 549
440 | Shaw 514
441 | Brightwood Park 406
442 | Kalorama Heights 362
443 | Name: neighborhood, dtype: int64
444 |
445 |
446 |
447 | ### Horizontal bars
448 |
449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis.
450 |
451 |
452 | ```python
453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median',
454 | orientation='h', sort_values='desc')
455 | ```
456 |
457 |
458 |
459 |
460 | 
461 |
462 |
463 |
464 | Switching orientation is possible for most other plots.
465 |
466 |
467 | ```python
468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
469 | ```
470 |
471 |
472 |
473 |
474 | 
475 |
476 |
477 |
478 | ### Split bars into groups
479 |
480 | You can split each bar into further groups by setting the `split` parameter to another column.
481 |
482 |
483 | ```python
484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
485 | ```
486 |
487 |
488 |
489 |
490 | 
491 |
492 |
493 |
494 | We can use the `pivot_table` method to verify the results in pandas.
495 |
496 |
497 | ```python
498 | airbnb.pivot_table(index='superhost', columns='neighborhood',
499 | values='price', aggfunc='median')
500 | ```
501 |
502 | | superhost | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
503 | |---|---|---|---|---|---|---|---|---|
504 | | No | 85.0 | 129.0 | 90.5 | 120.0 | 100.0 | 110.0 | 130.0 | 120.0 |
505 | | Yes | 90.0 | 130.0 | 103.0 | 135.0 | 100.0 | 124.0 | 135.0 | 125.0 |
506 | 
559 |
560 | Set the order of the unique split values with `split_order`, which can also act as a filter.
561 |
562 |
563 | ```python
564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
565 | split='superhost', split_order=['Yes', 'No'])
566 | ```
567 |
568 |
569 |
570 |
571 | 
572 |
573 |
574 |
575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite.
576 |
577 |
578 | ```python
579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
580 | split='property_type', split_order='desc')
581 | ```
582 |
583 |
584 |
585 |
586 | 
587 |
588 |
589 |
590 | Filtering for the most/least frequent split categories is possible.
591 |
592 |
593 | ```python
594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
595 | split='property_type', split_order='bottom 2')
596 | ```
597 |
598 |
599 |
600 |
601 | 
602 |
603 |
604 |
605 | We can verify that the two least frequent property types are Townhouse and Condominium.
606 |
607 |
608 | ```python
609 | airbnb['property_type'].value_counts()
610 | ```
611 |
612 |
613 |
614 |
615 | Apartment 2403
616 | House 877
617 | Townhouse 824
618 | Condominium 477
619 | Name: property_type, dtype: int64
620 |
621 |
622 |
623 | ### Stacked bar charts
624 |
625 | Stack all the split groups one on top of the other by setting `stacked` to `True`.
626 |
627 |
628 | ```python
629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
630 | split='superhost', split_order=['Yes', 'No'], stacked=True)
631 | ```
632 |
633 |
634 |
635 |
636 | 
637 |
638 |
639 |
640 | ### Split into multiple plots
641 |
642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot.
643 |
644 |
645 | ```python
646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
647 | split='superhost', col='property_type')
648 | ```
649 |
650 |
651 |
652 |
653 | 
654 |
655 |
656 |
657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer that caps the number of plots per row/col. We also set `col_order` to `'desc'` to arrange the plots in descending alphabetical order.
658 |
659 |
660 | ```python
661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
662 | split='superhost', col='property_type', wrap=2, col_order='desc')
663 | ```
664 |
665 |
666 |
667 |
668 | 
669 |
670 |
671 |
672 | Use `col_order` to both filter and set a specific order for the plots.
673 |
674 |
675 | ```python
676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
677 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
678 | ```
679 |
680 |
681 |
682 |
683 | 
684 |
685 |
686 |
687 | Splits can be made simultaneously along row and columns.
688 |
689 |
690 | ```python
691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
693 | row='bedrooms', row_order=[1, 2, 3])
694 | ```
695 |
696 |
697 |
698 |
699 | 
700 |
701 |
702 |
703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`.
704 |
705 |
706 | ```python
707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
709 | row='bedrooms', row_order=[1, 2, 3], sharey=False)
710 | ```
711 |
712 |
713 |
714 |
715 | 
716 |
717 |
718 |
719 | ### Set the width of each bar with `size`
720 |
721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value.
722 |
723 |
724 | ```python
725 | dxp.bar(x='neighborhood', y='price', data=airbnb,
726 | aggfunc='median', split='property_type',
727 | split_order=['Apartment', 'House'],
728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
729 | ```
730 |
731 |
732 |
733 |
734 | 
735 |
736 |
737 |
738 | ### Splitting line plots
739 |
740 | All the other aggregating plots work similarly.
741 |
742 |
743 | ```python
744 | dxp.line(x='neighborhood', y='price', data=airbnb,
745 | aggfunc='median', split='property_type',
746 | split_order=['Apartment', 'House'],
747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'])
748 | ```
749 |
750 |
751 |
752 |
753 | 
754 |
755 |
756 |
757 | ## Distribution plots - box, violin, histogram, kde
758 |
759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed.
760 |
761 | ### Box plots
762 |
763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers extend up to 1.5 times the interquartile range (IQR, the difference between the third and first quartiles) beyond the ends of the box. Fliers are the points outside this range and are plotted individually. By default, both box and violin plots are plotted horizontally.
764 |
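765 | As a small illustration of how those pieces are computed (plain numpy here, not dexplot code), the box edges and whisker limits for a set of prices come from the quartiles:
766 | 
767 | ```python
768 | import numpy as np
769 | 
770 | prices = np.array([80, 95, 100, 120, 150, 400])
771 | q1, q3 = np.percentile(prices, [25, 75])        # box edges
772 | iqr = q3 - q1
773 | lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # whisker limits; points beyond them are fliers
774 | ```
775 | 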
765 |
766 | ```python
767 | dxp.box(x='price', y='neighborhood', data=airbnb)
768 | ```
769 |
770 |
771 |
772 |
773 | 
774 |
775 |
776 |
777 | Split the groups in the same manner as with the aggregation plots.
778 |
779 |
780 | ```python
781 | dxp.box(x='price', y='neighborhood', data=airbnb,
782 | split='superhost', split_order=['Yes', 'No'])
783 | ```
784 |
785 |
786 |
787 |
788 | 
789 |
790 |
791 |
792 | Order the appearance of the splits alphabetically (in descending order here).
793 |
794 |
795 | ```python
796 | dxp.box(x='price', y='neighborhood', data=airbnb,
797 | split='property_type', split_order='desc')
798 | ```
799 |
800 |
801 |
802 |
803 | 
804 |
805 |
806 |
807 | ### Filter range of values with `x_order`
808 |
809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`.
810 |
811 |
812 | ```python
813 | dxp.box(x='price', y='neighborhood', data=airbnb,
814 | split='superhost', x_order=[50, 250])
815 | ```
816 |
817 |
818 |
819 |
820 | 
821 |
822 |
823 |
824 | Swap `x` and `y` while setting `orientation` to `'v'` to make vertical box plots.
825 |
826 |
827 | ```python
828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v',
829 | split='property_type', split_order='top 2')
830 | ```
831 |
832 |
833 |
834 |
835 | 
836 |
837 |
838 |
839 | Violin plots work identically to box plots, but show "violins": kernel density estimates mirrored on both sides of a line.
840 |
841 |
842 | ```python
843 | dxp.violin(x='price', y='neighborhood', data=airbnb,
844 | split='superhost', split_order=['Yes', 'No'])
845 | ```
846 |
847 |
848 |
849 |
850 | 
851 |
852 |
853 |
854 | Splitting by rows and columns is possible as well with distribution plots.
855 |
856 |
857 | ```python
858 | dxp.box(x='price', y='neighborhood', data=airbnb,split='superhost',
859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
860 | row='bedrooms', row_order=[1, 2])
861 | ```
862 |
863 |
864 |
865 |
866 | 
867 |
868 |
869 |
870 | ### Histograms
871 |
872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you give a single numeric column to `val`. By default, a vertical histogram of the counts with 20 bins is created.
873 |
874 |
875 | ```python
876 | dxp.hist(val='price', data=airbnb)
877 | ```
878 |
879 |
880 |
881 |
882 | 
883 |
884 |
885 |
886 | We can use `split` just like we did above and also create horizontal histograms.
887 |
888 |
889 | ```python
890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15)
891 | ```
892 |
893 |
894 |
895 |
896 | 
897 |
898 |
899 |
900 | Here, we customize the histogram by plotting the cumulative density instead of the raw frequency count and drawing only the outline of the bars (`histtype='step'`).
901 |
902 |
903 | ```python
904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3],
905 | bins=30, density=True, histtype='step', cumulative=True)
906 | ```
907 |
908 |
909 |
910 |
911 | 
912 |
913 |
914 |
915 | ### KDE Plots
916 |
917 | Kernel density estimates provide an estimate for the probability distribution of a continuous variable. Here, we examine how price is distributed by bedroom.
918 |
919 |
920 | ```python
921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3])
922 | ```
923 |
924 |
925 |
926 |
927 | 
928 |
929 |
930 |
931 | Graph the cumulative distribution instead, split across multiple plots.
932 |
933 |
934 | ```python
935 | dxp.kde(x='price', data=airbnb, split='bedrooms',
936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2)
937 | ```
938 |
939 |
940 |
941 |
942 | 
943 |
944 |
945 |
946 | ### Two-dimensional KDEs
947 | 
948 | Provide two numeric columns to `x` and `y` to get a two-dimensional KDE.
949 |
950 |
951 | ```python
952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb)
953 | ```
954 |
955 |
956 |
957 |
958 | 
959 |
960 |
961 |
962 | Create a grid of two-dimensional KDEs.
963 |
964 |
965 | ```python
966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3)
967 | ```
968 |
969 |
970 |
971 |
972 | 
973 |
974 |
975 |
976 | ## Count plots
977 |
978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order.
979 |
980 |
981 | ```python
982 | dxp.count(val='neighborhood', data=airbnb)
983 | ```
984 |
985 |
986 |
987 |
988 | 
989 |
990 |
991 |
992 | In pandas, this is a straightforward call to the `value_counts` method.
993 |
994 |
995 | ```python
996 | airbnb['neighborhood'].value_counts()
997 | ```
998 |
999 |
1000 |
1001 |
1002 | Columbia Heights 773
1003 | Union Station 713
1004 | Capitol Hill 654
1005 | Edgewood 610
1006 | Dupont Circle 549
1007 | Shaw 514
1008 | Brightwood Park 406
1009 | Kalorama Heights 362
1010 | Name: neighborhood, dtype: int64
1011 |
1012 |
1013 |
1014 | ### Relative frequency with `normalize`
1015 |
1016 | Instead of the raw counts, get the relative frequency by setting `normalize` to `True`.
1017 |
1018 |
1019 | ```python
1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True)
1021 | ```
1022 |
1023 |
1024 |
1025 |
1026 | 
1027 |
1028 |
1029 |
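1030 | In pandas, the same relative frequencies come from `value_counts` with `normalize=True`. This is a sketch of the equivalent computation, not the exact code dexplot runs internally:
1031 |
1032 | ```python
1033 | airbnb['neighborhood'].value_counts(normalize=True)
1034 | ```
1035 |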
1030 | Here, we split by property type.
1031 |
1032 |
1033 | ```python
1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type')
1035 | ```
1036 |
1037 |
1038 |
1039 |
1040 | 
1041 |
1042 |
1043 |
1044 | In pandas, this is done with the `crosstab` function.
1045 |
1046 |
1047 | ```python
1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'])
1049 | ```
1050 |
1051 |
1052 | | property_type | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
1053 | |:--------------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
1054 | | Apartment | 167 | 299 | 374 | 397 | 244 | 284 | 315 | 323 |
1055 | | Condominium | 35 | 70 | 97 | 62 | 65 | 42 | 52 | 54 |
1056 | | House | 131 | 137 | 157 | 47 | 146 | 23 | 61 | 175 |
1057 | | Townhouse | 73 | 148 | 145 | 43 | 155 | 13 | 86 | 161 |
1130 |
1131 | Count plots can also be drawn horizontally and stacked.
1132 |
1133 |
1134 | ```python
1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type',
1136 | orientation='h', stacked=True, col='superhost')
1137 | ```
1138 |
1139 |
1140 |
1141 |
1142 | 
1143 |
1144 |
1145 |
1146 | ### Normalize over different variables
1147 |
1148 | Setting `normalize` to `True` returns the relative frequency with respect to all of the data. You can instead normalize over any of the variables provided by passing its column name.
1149 |
1150 |
1151 | ```python
1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood',
1153 | title='Relative Frequency by Neighborhood')
1154 | ```
1155 |
1156 |
1157 |
1158 |
1159 | 
1160 |
1161 |
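1162 | For this single-variable case, a roughly equivalent pandas computation uses `crosstab` with its `normalize` option, normalizing within each neighborhood column. This is an illustrative sketch, not dexplot's internal code:
1163 |
1164 | ```python
1165 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'],
1166 |             normalize='columns')
1167 | ```
1168 |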
1162 |
1163 | Normalize over several variables at once with a list.
1164 |
1165 |
1166 | ```python
1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost',
1168 | row='property_type', col='bedrooms', col_order=[1, 2],
1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True)
1170 | ```
1171 |
1172 |
1173 |
1174 |
1175 | 
1176 |
1177 |
1178 |
1179 | ## Wide data
1180 |
1181 | Dexplot can also plot data where no aggregation happens, including wide data. Here is a scatter plot of the location of each listing.
1182 |
1183 |
1184 | ```python
1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb,
1186 | split='neighborhood', col='bedrooms', col_order=[2, 3])
1187 | ```
1188 |
1189 |
1190 |
1191 |
1192 | 
1193 |
1194 |
1195 |
1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`.
1197 |
1198 |
1199 | ```python
1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type',
1201 | values='price', aggfunc='mean')
1202 | df
1203 | ```
1204 |
1205 |
1206 | | neighborhood | Apartment | Condominium | House | Townhouse |
1207 | |:-------------|----------:|------------:|-----------:|-----------:|
1208 | | Brightwood Park | 96.119760 | 105.000000 | 121.671756 | 133.479452 |
1209 | | Capitol Hill | 141.210702 | 104.200000 | 170.153285 | 184.459459 |
1210 | | Columbia Heights | 114.676471 | 126.773196 | 135.292994 | 124.358621 |
1211 | | Dupont Circle | 146.858942 | 130.709677 | 179.574468 | 139.348837 |
1212 | | Edgewood | 108.508197 | 112.846154 | 156.335616 | 147.503226 |
1213 | | Kalorama Heights | 122.542254 | 155.928571 | 92.695652 | 158.230769 |
1214 | | Shaw | 153.888889 | 158.500000 | 202.114754 | 173.279070 |
1215 | | Union Station | 128.458204 | 133.833333 | 162.748571 | 162.167702 |
1288 |
1289 |
1290 |
1291 | ```python
1292 | dxp.bar(data=df, orientation='h')
1293 | ```
1294 |
1295 |
1296 |
1297 |
1298 | 
1299 |
1300 |
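1301 | For comparison, pandas can plot the same pre-aggregated wide frame with its own matplotlib-based plotting wrapper; the dexplot version differs mainly in styling and in returning the figure:
1302 |
1303 | ```python
1304 | df.plot.barh()
1305 | ```
1306 |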
1301 |
1302 | ### Time series
1303 |
1304 |
1305 | ```python
1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date')
1307 | stocks.head()
1308 | ```
1309 |
1310 |
1311 | | date | MSFT | AAPL | SLB | AMZN | TSLA | XOM | WMT | T | FB | V |
1312 | |:-----|-----:|-----:|------:|------:|-----:|------:|------:|------:|---:|--:|
1313 | | 1999-10-25 | 29.84 | 2.32 | 17.02 | 82.75 | NaN | 21.45 | 38.99 | 16.78 | NaN | NaN |
1314 | | 1999-10-26 | 29.82 | 2.34 | 16.65 | 81.25 | NaN | 20.89 | 37.11 | 17.28 | NaN | NaN |
1315 | | 1999-10-27 | 29.33 | 2.38 | 16.52 | 75.94 | NaN | 20.80 | 36.94 | 18.27 | NaN | NaN |
1316 | | 1999-10-28 | 29.01 | 2.43 | 16.59 | 71.00 | NaN | 21.19 | 38.85 | 19.79 | NaN | NaN |
1317 | | 1999-10-29 | 29.88 | 2.50 | 17.21 | 70.62 | NaN | 21.47 | 39.25 | 20.00 | NaN | NaN |
1414 |
1415 |
1416 | ```python
1417 | dxp.line(data=stocks.head(500))
1418 | ```
1419 |
1420 |
1421 |
1422 |
1423 | 
1424 |
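1425 | For comparison, pandas can draw a similar (unstyled) line chart directly from the same wide frame:
1426 |
1427 | ```python
1428 | stocks.head(500).plot()
1429 | ```
1430 |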
1425 |
1426 |
--------------------------------------------------------------------------------
/dexplot/_common_plot.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | import warnings
3 | from collections import defaultdict
4 | import io
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 | from matplotlib import ticker
10 | from matplotlib.colors import Colormap
11 |
12 |
13 | NONETYPE = type(None)
14 |
15 | class CommonPlot:
16 |
17 |
18 | def __init__(self, x, y, data, aggfunc, split, row, col,
19 | x_order, y_order, split_order, row_order, col_order,
20 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
21 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
22 | x_textwrap, y_textwrap, x_rot, y_rot,
23 | check_numeric=False, kind=None):
24 |
25 | self.used_columns = set()
26 | self.data = self.get_data(data)
27 | self.x = self.get_col(x)
28 | self.y = self.get_col(y)
29 | self.validate_x_y()
30 | self.orientation = orientation
31 | self.aggfunc = self.get_aggfunc(aggfunc)
32 | self.groupby = self.get_groupby()
33 | self.split = self.get_col(split)
34 | self.row = self.get_col(row)
35 | self.col = self.get_col(col)
36 |
37 | self.agg = self.set_agg()
38 | self.make_groups_categorical(kind)
39 | self.validate_numeric(check_numeric)
40 |
41 | self.x_order = self.validate_order(x_order, 'x')
42 | self.y_order = self.validate_order(y_order, 'y')
43 | self.split_order = self.validate_order(split_order, 'split')
44 | self.row_order = self.validate_order(row_order, 'row')
45 | self.col_order = self.validate_order(col_order, 'col')
46 | self.filter_data()
47 | self.groupby_order = self.get_groupby_order()
48 |
49 | self.sort_values = sort_values
50 | self.groupby_sort = True
51 | self.wrap = wrap
52 | self.figsize = figsize
53 | self.title = title
54 | self.sharex = sharex
55 | self.sharey = sharey
56 | self.xlabel = xlabel
57 | self.ylabel = ylabel
58 | self.xlim = xlim
59 | self.ylim = ylim
60 | self.xscale = xscale
61 | self.yscale = yscale
62 | self.colors = self.get_colors(cmap)
63 | self.x_textwrap = x_textwrap
64 | self.y_textwrap = y_textwrap
65 | self.x_rot = x_rot
66 | self.y_rot = y_rot
67 |
68 | self.validate_args()
69 | self.plot_type = self.get_plot_type()
70 | self.agg_kind = self.get_agg_kind()
71 | self.data = self.set_index()
72 | self.rows, self.cols = self.get_uniques()
73 | self.rows, self.cols = self.get_row_col_order()
74 | self.fig_shape = self.get_fig_shape()
75 |
76 |
77 | def get_data(self, data):
78 | if isinstance(data, pd.Series):
79 | return data.to_frame()
80 |
81 | if not isinstance(data, pd.DataFrame):
82 | raise TypeError('`data` must be a pandas DataFrame or Series')
83 | elif len(data) == 0:
84 | raise ValueError('DataFrame contains no data')
85 | return data.copy()
86 |
87 | def get_col(self, col, group=False):
88 | if col:
89 |             if col not in self.data.columns:
90 |                 raise KeyError(f'{col} is not a column in the DataFrame')
93 |
94 | if col in self.used_columns:
95 | raise ValueError(f'Column {col} has already been chosen. '
96 | '`x`, `y`, `split`, `row`, and `col` must all be unique.')
97 | self.used_columns.add(col)
98 | return col
99 |
100 | def validate_x_y(self):
101 | if self.x == self.y and self.x is not None and self.y is not None:
102 | raise ValueError('`x` and `y` cannot be the same column name')
103 |
104 | def get_aggfunc(self, aggfunc):
105 | if aggfunc == 'countna':
106 | return lambda x: x.isna().sum()
107 | if aggfunc == 'percna':
108 | return lambda x: x.isna().mean()
109 | return aggfunc
110 |
111 | def get_groupby(self):
112 | if self.x is None or self.y is None or self.aggfunc is None:
113 | return
114 | return self.x if self.orientation == 'v' else self.y
115 |
116 | def set_agg(self):
117 | return self.y if self.orientation == 'v' else self.x
118 |
119 | def filter_data(self):
120 | params = 'x', 'y', 'split', 'row', 'col'
121 | for param in params:
122 | name, order = getattr(self, param), getattr(self, param + '_order')
123 | if name and order:
124 | s = self.data[name]
125 | if isinstance(order, list):
126 | if s.dtype.kind == 'O':
127 | for val in order:
128 | if not (s == val).any():
129 | raise ValueError(f'Value {val} is not in column {name}')
130 | self.data = self.data[s.isin(order)]
131 | else:
132 | # allow datetimes?
133 | if len(order) != 2:
134 | raise ValueError(f'You are filtering {name}. Provide a two-item list '
135 | 'of the min and max values')
136 | self.data = self.data[s.between(*order)]
137 | elif isinstance(order, int):
138 | vc = s.value_counts()
139 | if order > 0:
140 | idx = vc.index[:order]
141 | else:
142 | idx = vc.index[order:]
143 | self.data = self.data[s.isin(idx)]
144 | setattr(self, param +'_order', idx.tolist())
145 |
146 | if name and self.data[name].dtype.name == 'category':
147 | self.data[name].cat.remove_unused_categories(inplace=True)
148 |
149 | def make_groups_categorical(self, kind):
150 | category_cols = [self.groupby, self.split, self.row, self.col]
151 | for col in category_cols:
152 | if col:
153 | if self.data[col].dtype.name != 'category':
154 | self.data[col] = self.data[col].astype('category')
155 | if kind == 'count':
156 | col = self.x or self.y
157 | if self.data[col].dtype.name != 'category':
158 | self.data[col] = self.data[col].astype('category')
159 |
160 | def validate_numeric(self, check_numeric):
161 | if check_numeric:
162 | for val in (self.x, self.y):
163 | if val and self.data[val].dtype.kind not in ('i', 'f', 'b'):
164 | raise TypeError(f'Column {val} must be numeric (integer or float)')
165 |
166 | def validate_order(self, order, kind):
167 | if isinstance(order, str):
168 | order = order.strip().lower()
169 | if order in ('asc', 'desc'):
170 | return order
171 | command = order.split()
172 | if len(command) != 2 or command[0] not in ('top', 'bottom'):
173 |                 raise ValueError(f'{kind}_order string must be "asc", "desc", or '
174 |                                  '"top"/"bottom" followed by a space and an integer.')
175 | mult = int(command[0] == "top") * 2 - 1
176 | try:
177 | num = int(command[1])
178 | except ValueError:
179 | raise ValueError(f'{command[1]} is not a valid integer')
180 | if num == 0:
181 | raise ValueError('Number cannot be 0')
182 | return num * mult
183 | elif isinstance(order, (tuple, list)):
184 | return list(order)
185 | elif hasattr(order, 'tolist'):
186 | return order.tolist()
187 | elif order is not None:
188 | raise TypeError(f'{kind}_order must be a str or tuple/list/array/series.')
189 |
190 | def get_groupby_order(self):
191 | if self.x == self.groupby:
192 | return self.x_order
193 | if self.y == self.groupby:
194 | return self.y_order
195 |
196 | def get_colors(self, cmap):
197 | if cmap is None:
198 | cmap = 't10'
199 |
200 | if isinstance(cmap, str):
201 | from .colors._colormaps import colormaps
202 | try:
203 | return colormaps[cmap.lower()]
204 | except KeyError:
205 | raise KeyError(f'Colormap {cmap} does not exist. Here are the '
206 | f'possible colormaps: {colormaps.keys()}')
207 | elif isinstance(cmap, Colormap):
208 | return cmap(range(cmap.N)).tolist()
209 | elif isinstance(cmap, list):
210 | return cmap
211 | elif isinstance(cmap, tuple):
212 | return list(cmap)
213 | elif hasattr(cmap, 'tolist'):
214 | return cmap.tolist()
215 | else:
216 | raise TypeError('`cmap` must be a string name of a colormap, a matplotlib colormap '
217 | 'instance, list, or tuple of colors')
218 |
219 | def validate_args(self):
220 | self.validate_plot_args()
221 | self.validate_mpl_args()
222 | self.validate_sort_values()
223 |
224 | def validate_plot_args(self):
225 | if self.orientation not in ('v', 'h'):
226 | raise ValueError('`orientation` must be either "v" or "h".')
227 |
228 | if not isinstance(self.wrap, (np.integer, int, NONETYPE)):
229 |             raise TypeError(f'`wrap` must either be None or an integer, not {type(self.wrap)}')
230 |
231 | if self.row and self.col and self.wrap is not None:
232 | raise ValueError('You cannot provide a value for `wrap` if `row` '
233 | 'and `col` are also provided')
234 |
235 | def validate_mpl_args(self):
236 | if not isinstance(self.title, (NONETYPE, str)):
237 | raise TypeError('`title` must be either None or a str')
238 | if self.sharex not in (False, True, None, 'row', 'col'):
239 | raise ValueError('`sharex` must be one of `False`, `True`, `None`, "row", or "col"')
240 | if self.sharey not in (False, True, None, 'row', 'col'):
241 |             raise ValueError('`sharey` must be one of `False`, `True`, `None`, "row", or "col"')
242 |
243 | if not isinstance(self.xlabel, (NONETYPE, str)):
244 | raise TypeError('`xlabel` must be either None or a str')
245 | if not isinstance(self.ylabel, (NONETYPE, str)):
246 | raise TypeError('`ylabel` must be either None or a str')
247 |
248 | if not isinstance(self.xlim, (NONETYPE, tuple)):
249 | raise TypeError('`xlim` must be a two-item tuple of numerics or `None`')
250 | if not isinstance(self.ylim, (NONETYPE, tuple)):
251 |             raise TypeError('`ylim` must be a two-item tuple of numerics or `None`')
252 |         if self.xscale not in ('linear', 'log', 'symlog', 'logit'):
253 |             raise ValueError("`xscale` must be one of 'linear', 'log', 'symlog', 'logit'")
254 |         if self.yscale not in ('linear', 'log', 'symlog', 'logit'):
255 |             raise ValueError("`yscale` must be one of 'linear', 'log', 'symlog', 'logit'")
256 |
257 | def validate_sort_values(self):
258 | if self.sort_values not in ['asc', 'desc', None]:
259 | raise ValueError('`sort_values` must be one of "asc", "desc", or `None`')
260 | if self.sort_values and (self.split or self.row or self.col):
261 | raise ValueError('Can only use `sort_values` if `split`, `row`, and `col` are `None`.')
262 |
263 | def get_plot_type(self):
264 | if self.row and self.col:
265 | return 'square'
266 | if self.row:
267 | return 'row_only'
268 | if self.col:
269 | return 'col_only'
270 | return 'single'
271 |
272 | def get_agg_kind(self):
273 | if self.agg:
274 | # string and category use 'O'
275 | agg_kind = self.data[self.agg].dtype.kind
276 | return agg_kind
277 |
278 | def set_index(self):
279 | data = self.data
280 | rc = []
281 | if self.row:
282 | rc.append(self.row)
283 | if self.col:
284 | rc.append(self.col)
285 | if rc:
286 | data = data.set_index(rc)
287 | return data
288 |
289 | def get_uniques(self):
290 | if self.plot_type == 'single':
291 | return None, None
292 | elif self.plot_type == 'row_only':
293 | return self.data.index.unique(), None
294 | elif self.plot_type == 'col_only':
295 | return None, self.data.index.unique()
296 | else:
297 | return self.data.index.levels
298 |
299 | def get_row_col_order(self):
300 | rows, cols = self.rows, self.cols
301 | if rows is not None:
302 | if self.row_order == 'desc':
303 | rows = sorted(rows, reverse=True)
304 | else:
305 | rows = sorted(rows)
306 | if cols is not None:
307 | if self.col_order == 'desc':
308 | cols = sorted(cols, reverse=True)
309 | else:
310 | cols = sorted(cols)
311 |
312 | if isinstance(self.row_order, list):
313 | new_rows = []
314 | for row in self.row_order:
315 | if row not in rows:
316 | raise ValueError(f'Row value {row} does not exist')
317 | new_rows.append(row)
318 | rows = new_rows
319 | if isinstance(self.col_order, list):
320 | new_cols = []
321 | for col in self.col_order:
322 | if col not in cols:
323 | raise ValueError(f'Column value {col} does not exist')
324 | new_cols.append(col)
325 | cols = new_cols
326 | return rows, cols
327 |
328 | def get_fig_shape(self):
329 | if self.plot_type == 'single':
330 | return 1, 1
331 |
332 | nrows = ncols = 1
333 | if self.rows is not None:
334 | nrows = len(self.rows)
335 | if self.cols is not None:
336 | ncols = len(self.cols)
337 |
338 | if self.wrap:
339 | if self.plot_type == 'row_only':
340 |                 ncols = (nrows - 1) // self.wrap + 1  # enough columns to hold all row facets
341 |                 nrows = min(nrows, self.wrap)         # cap panels per column at `wrap`
342 | elif self.plot_type == 'col_only':
343 | nrows = (ncols - 1) // self.wrap + 1
344 | ncols = min(ncols, self.wrap)
345 | return nrows, ncols
346 |
347 | def get_data_for_every_plot(self):
348 | # TODO: catch keyerror for groups that dont exist
349 | rows, cols = self.get_row_col_order()
350 | if self.plot_type == 'row_only':
351 | return [(row, self.data.loc[row]) for row in rows]
352 |         if self.plot_type == 'col_only':
353 | return [(col, self.data.loc[col]) for col in cols]
354 | elif self.plot_type == 'square':
355 | groups = []
356 | for col in cols:
357 | for row in rows:
358 | group = row, col
359 | try:
360 | with warnings.catch_warnings():
361 | warnings.simplefilter("ignore")
362 | data = self.data.loc[group]
363 | except (KeyError, TypeError):
364 | data = self.data.iloc[:0]
365 | groups.append((group, data))
366 | return groups
367 | else:
368 | return [(None, self.data)]
369 |
370 | def get_labels(self, labels):
371 | # this won't work for wrapping
372 | if self.plot_type == 'square':
373 | return str(labels[0]), str(labels[1])
374 | elif self.plot_type == 'row_only':
375 | return str(labels), None
376 | elif self.plot_type == 'col_only':
377 | return None, str(labels)
378 | return None, None
379 |
380 | def sort_values_xy(self, x, y):
381 | grp, num = (x, y) if self.orientation == 'v' else (y, x)
382 | if self.sort_values is None:
383 | return x, y
384 | elif self.sort_values == 'asc':
385 | order = np.lexsort([grp, num])
386 | else:
387 | order = np.lexsort([grp, -num])
388 | if self.orientation == 'h':
389 | order = order[::-1]
390 | return x[order], y[order]
391 |
392 | def get_order(self, arr, vals):
393 | arr = arr.tolist()
394 | order = []
395 | for val in vals:
396 | try:
397 | idx = arr.index(val)
398 | except ValueError:
399 | raise ValueError(f'{val} is not a valid column value')
400 | order.append(idx)
401 | return order
402 |
403 | def reverse_order(self, order):
404 | cond1 = order == 'desc' and self.orientation == 'v'
405 | cond2 = order in ('asc', None) and self.orientation == 'h'
406 | return cond1 or cond2
407 |
408 | def order_xy(self, x, y):
409 | if self.x_order and self.x != self.agg:
410 | if isinstance(self.x_order, list):
411 | order = self.get_order(x, self.x_order)
412 | elif self.reverse_order(self.x_order):
413 | order = np.lexsort([x])[::-1]
414 | else:
415 | return x, y
416 | elif self.y_order and self.y != self.agg:
417 | if isinstance(self.y_order, list):
418 | order = self.get_order(y, self.y_order)
419 | elif self.reverse_order(self.y_order):
420 | order = np.lexsort([y])[::-1]
421 | else:
422 | return x, y
423 | else:
424 | return x, y
425 | return x[order], y[order]
426 |
427 | def get_correct_data_order(self, x, y):
428 | x, y = self.sort_values_xy(x, y)
429 | if self.sort_values is None:
430 | x, y = self.order_xy(x, y)
431 | return x, y
432 |
433 | def get_wide_data(self, data):
434 | x = data.index.values
435 | y = {col: data[col].values for col in data.columns}
436 | if self.orientation == 'h':
437 | x, y = y, x
438 | return x, y
439 |
440 | def get_wide_columns(self, data):
441 | cols = []
442 | used_cols = [self.groupby, self.split, self.row, self.col]
443 | for col in data.columns:
444 | if col not in used_cols:
445 | cols.append(col)
446 | return cols
447 |
448 | def get_ordered_groups(self, data, specific_order, kind):
449 | # used for split and groupby groups
450 | order = []
451 | groups = []
452 | sort = specific_order is not None
453 | # TODO: Need to decide defaults for x_order, y_order etc... either None or 'asc'
454 | for grp, data_grp in data.groupby(getattr(self, kind), sort=True):
455 | order.append((grp, data_grp))
456 | groups.append(grp)
457 |
458 | if isinstance(specific_order, list):
459 | new_order = []
460 | for grp in specific_order:
461 | try:
462 | idx = groups.index(grp)
463 | except ValueError:
464 | col = getattr(self, kind)
465 | raise ValueError(f'Value "{grp}" from `{kind}_order` is '
466 | f'not in column {col}')
467 |
468 | new_order.append(idx)
469 | order = [order[i] for i in new_order]
470 | elif specific_order == 'desc':
471 | new_order = np.lexsort([groups])[::-1]
472 | order = [order[i] for i in new_order]
473 |
474 | return order
475 |
476 | def get_final_groups(self, data, split_label, row_label, col_label):
477 | groups = []
478 | if self.aggfunc == '__distribution__':
479 | if self.groupby is not None:
480 | for grp, data_grp in self.get_ordered_groups(data, self.groupby_order, 'groupby'):
481 | vals = data_grp[self.agg]
482 | groups.append((vals, split_label, grp, row_label, col_label))
483 | else:
484 | col = self.x or self.y
485 | vals = data[col]
486 |                 groups.append((vals, split_label, col, row_label, col_label))  # label with the plotted column
487 | elif self.groupby is not None:
488 | try:
489 | s = data.groupby(self.groupby, sort=self.groupby_sort)[self.agg].agg(self.aggfunc)
490 | except Exception as e:
491 | if type(e).__name__ == 'DataError':
492 | raise ValueError(f'The aggregating column {self.agg} is not numeric and '
493 | f'cannot be aggregated with {self.aggfunc}. You might need '
494 | 'to switch x and y')
495 | else:
496 | raise e
497 | x, y = s.index.values, s.values
498 | x, y = (x, y) if self.orientation == 'v' else (y, x)
499 | x, y = self.get_correct_data_order(x, y)
500 | groups.append((x, y, split_label, self.groupby, row_label, col_label))
501 | elif self.x is None or self.y is None:
502 | if self.x:
503 | s = data[self.x]
504 | x, y = s.values, s.index.values
505 | x, y = self.get_correct_data_order(x, y)
506 | groups.append((x, y, split_label, self.x, row_label, col_label))
507 | elif self.y:
508 | s = data[self.y]
509 | x, y = s.index.values, s.values
510 | x, y = self.get_correct_data_order(x, y)
511 | groups.append((x, y, split_label, self.y, row_label, col_label))
512 | else:
513 | # wide data
514 | for col in self.get_wide_columns(data):
515 | s = data[col]
516 | x, y = s.index.values, s.values
517 | x, y = self.get_correct_data_order(x, y)
518 | x, y = (x, y) if self.orientation == 'v' else (y, x)
519 | groups.append((x, y, col, None, row_label, col_label))
520 | else:
521 | # simple raw plot - make sure to warn when lots of data for bar/box/hist
522 | # one graph per row - OK for scatterplots and line plots
523 | x, y = self.get_correct_data_order(data[self.x], data[self.y])
524 | groups.append((x, y, split_label, None, row_label, col_label))
525 | return groups
526 |
527 | def get_x_y_plot(self, x, y):
528 | x_plot, y_plot = x, y
529 | if x_plot.dtype.kind == 'O':
530 | x_plot = np.arange(len(x_plot))
531 | if y_plot.dtype.kind == 'O':
532 | y_plot = np.arange(len(y_plot))
533 | return x_plot, y_plot
534 |
535 | def get_distribution_data(self, info):
536 | cur_data = defaultdict(list)
537 | cur_ticklabels = defaultdict(list)
538 | for vals, split_label, col_name, row_label, col_label in info:
539 | cur_data[split_label].append(vals)
540 | cur_ticklabels[split_label].append(col_name)
541 | return cur_data, cur_ticklabels
542 |
543 |
544 | class MPLCommon(CommonPlot):
545 |
546 | def __init__(self, x, y, data, aggfunc, split, row, col,
547 | x_order, y_order, split_order, row_order, col_order,
548 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
549 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
550 | x_textwrap, y_textwrap, x_rot, y_rot,
551 | check_numeric=False, kind=None):
552 | super().__init__(x, y, data, aggfunc, split, row, col,
553 | x_order, y_order, split_order, row_order, col_order,
554 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
555 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
556 | x_textwrap, y_textwrap, x_rot, y_rot,
557 |                          check_numeric=check_numeric, kind=kind)
558 | self.figsize = self.get_figsize()
559 | self.user_figsize = self.figsize is not None
560 | self.original_rcParams = plt.rcParams.copy()
561 | self.set_rcParams()
562 | self.fig, self.axs = self.create_figure()
563 | self.set_color_cycle()
564 | self.data_for_plots = self.get_data_for_every_plot()
565 | self.final_data = self.get_final_data()
566 | self.style_fig()
567 | self.add_ax_titles()
568 | self.add_fig_title()
569 |
570 | def get_figsize(self):
571 | if self.figsize is None:
572 | return
573 | elif isinstance(self.figsize, (list, tuple)):
574 | if len(self.figsize) != 2:
575 | raise ValueError('figsize must be a two-item tuple/list')
576 |             for val in self.figsize:
577 |                 if not isinstance(val, (int, float)):
578 |                     raise ValueError('Each item in figsize must be an integer or a float')
579 |             # a valid user-supplied figsize is used as-is
580 |             return tuple(self.figsize)
581 |         else:
582 |             raise TypeError('figsize must be a two-item tuple')
583 |
584 | def create_figure(self):
585 | fig = plt.Figure(tight_layout=True, dpi=144, figsize=self.figsize)
586 | axs = fig.subplots(*self.fig_shape, sharex=self.sharex, sharey=self.sharey)
587 | if self.fig_shape != (1, 1):
588 | axs = axs.flatten(order='F')
589 | else:
590 | axs = [axs]
591 | return fig, axs
592 |
593 | def set_color_cycle(self):
594 | for ax in self.axs:
595 | ax.set_prop_cycle(color=self.colors)
596 |
597 | def get_final_data(self):
598 | # create list of data for each call to plotting method
599 | final_data = defaultdict(list)
600 | for (labels, data), ax in zip(self.data_for_plots, self.axs):
601 | row_label, col_label = self.get_labels(labels)
602 | if self.split:
603 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'):
604 | final_data[ax].extend(self.get_final_groups(data_grp, grp, row_label, col_label))
605 | else:
606 | final_data[ax].extend(self.get_final_groups(data, None, row_label, col_label))
607 | return final_data
608 |
609 | def style_fig(self):
610 | for ax in self.axs:
611 | ax.tick_params(length=0)
612 | ax.set_facecolor('.9')
613 | ax.grid(True)
614 | ax.set_axisbelow(True)
615 | for spine in ax.spines.values():
616 | spine.set_visible(False)
617 |
618 | def add_x_y_labels(self):
619 | if self.plot_type == 'single':
620 | self.axs[0].set_xlabel(self.x)
621 | self.axs[0].set_ylabel(self.y)
622 | return
623 |
624 | # need to eliminate next line to save lots of time
625 | self.fig.canvas.print_figure(io.BytesIO())
626 | rows, cols = self.fig_shape
627 | top_left_ax, bottom_right_ax = self.axs[0], self.axs[rows * cols - 1]
628 | top_left_points = top_left_ax.get_position().get_points()
629 | bottom_right_points = bottom_right_ax.get_position().get_points()
630 |
631 | left = top_left_points[0][0]
632 | right = bottom_right_points[1][0]
633 | x = (right + left) / 2
634 |
635 | top = top_left_points[1][1]
636 | bottom = bottom_right_points[0][1]
637 | y = (top + bottom) / 2
638 | self.fig.text(0, y, self.y, rotation=90, ha='center', va='center', size='larger')
639 | self.fig.text(x, 0, self.x, ha='center', va='center', size='larger')
640 |
641 | def add_ax_titles(self):
642 | for ax, info in self.final_data.items():
643 | row_label, col_label = info[0][-2:]
644 | if row_label is not None:
645 | row_label = str(row_label)
646 | if col_label is not None:
647 | col_label = str(col_label)
648 | row_label = row_label or ''
649 | col_label = col_label or ''
650 | if row_label and col_label:
651 | title = row_label + ' - ' + col_label
652 | else:
653 | title = row_label or col_label
654 | title = textwrap.fill(str(title), 30)
655 | ax.set_title(title)
656 |
657 | def set_rcParams(self):
658 | plt.rcParams['font.size'] = 6
659 | plt.rcParams['font.family'] = 'Helvetica'
660 |
661 | def add_ticklabels(self, labels, ax, delta=0):
662 | ticks = np.arange(len(labels))
663 | ha, va = 'center', 'center'
664 | if self.orientation == 'v':
665 | if self.x_textwrap:
666 | labels = [textwrap.fill(str(label), self.x_textwrap) for label in labels]
667 | ax.set_xticks(ticks + delta)
668 | if self.x_rot is not None:
669 | if 0 <= self.x_rot <= 180:
670 | ha = 'right'
671 | else:
672 | ha = 'left'
673 | ax.set_xticklabels(labels, rotation=self.x_rot, ha=ha)
674 | else:
675 | if self.y_textwrap:
676 | labels = [textwrap.fill(str(label), self.y_textwrap) for label in labels]
677 | ax.set_yticks(ticks - delta)
678 | if self.y_rot is not None:
679 | if 0 <= self.y_rot <= 180:
680 | va = 'top'
681 | else:
682 | va = 'bottom'
683 | ax.set_yticklabels(labels, rotation=self.y_rot, va=va)
684 |
685 | def add_legend(self, label=None, handles=None, labels=None):
686 | if label is not None:
687 | if handles is None:
688 | handles, labels = self.axs[0].get_legend_handles_labels()
689 | ncol = len(labels) // 8 + 1
690 | self.fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.01, .8),
691 | title=self.split, ncol=ncol)
692 |
693 | def clean_up(self):
694 | self.add_x_y_labels()
695 | plt.rcParams = self.original_rcParams
696 | return self.fig
697 |
698 | def update_fig_size(self, n_splits, n_groups_per_split):
699 | if self.user_figsize:
700 | return
701 | c1 = .3 if self.orientation == 'v' else .2
702 | c2 = .06 if self.orientation == 'v' else .04
703 | new_size = 1.8 + (c1 + c2 * n_splits) * n_groups_per_split
704 | if self.orientation == 'v':
705 | height = max(2.5 - .3 * self.fig_shape[0], 1.2)
706 | shrink = max(.9 - .1 * self.fig_shape[1], .5)
707 | width = new_size * shrink * self.fig_shape[1]
708 | height = height * self.fig_shape[0]
709 | else:
710 | width = max(3 - .3 * self.fig_shape[1], 1.5)
711 | height = new_size * .8 * self.fig_shape[0]
712 | width = width * self.fig_shape[1]
713 | width, height = min(width, 25), min(height, 25)
714 | self.fig.set_size_inches(width, height)
715 |
716 | def add_fig_title(self):
717 | self.fig.suptitle(self.title, y=1.02)
718 |
719 |
720 | import plotly.graph_objects as go
721 | from plotly.subplots import make_subplots
722 |
723 |
724 | class PlotlyCommon(CommonPlot):
725 |
726 | def __init__(self, x, y, data, aggfunc, split, row, col,
727 | x_order, y_order, split_order, row_order, col_order,
728 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
729 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
730 | x_textwrap, y_textwrap, x_rot, y_rot,
731 | check_numeric=False, kind=None):
732 | super().__init__(x, y, data, aggfunc, split, row, col,
733 | x_order, y_order, split_order, row_order, col_order,
734 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
735 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
736 | x_textwrap, y_textwrap, x_rot, y_rot,
737 |                          check_numeric=check_numeric, kind=kind)
738 |
739 | self.data_for_plots = self.get_data_for_every_plot()
740 | self.final_data = self.get_final_data()
741 | self.fig = self.create_figure()
742 |
743 | def create_figure(self):
744 | titles = self.get_subplot_titles()
745 | fig = make_subplots(rows=self.fig_shape[0], cols=self.fig_shape[1], subplot_titles=titles,
746 | shared_xaxes=self.sharex, shared_yaxes=self.sharey,
747 | horizontal_spacing=.03)
748 | fig.update_layout(title_text=self.title, legend_title_text=self.split)
749 | return fig
750 |
751 | def get_final_data(self):
752 | # create list of data for each call to plotting method
753 | final_data = defaultdict(list)
754 | locs = []
755 | for i in range(self.fig_shape[0]):
756 | for j in range(self.fig_shape[1]):
757 | locs.append((i + 1, j + 1))
758 |
759 | for (labels, data), loc in zip(self.data_for_plots, locs):
760 | row_label, col_label = self.get_labels(labels)
761 | if self.split:
762 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'):
763 | final_data[loc].extend(self.get_final_groups(data_grp, grp, row_label, col_label))
764 | else:
765 | final_data[loc].extend(self.get_final_groups(data, None, row_label, col_label))
766 | return final_data
767 |
768 | def get_subplot_titles(self):
769 | titles = []
770 | for (i, j), info in self.final_data.items():
771 | row_label, col_label = info[0][-2:]
772 | if row_label is not None:
773 | row_label = str(row_label)
774 | if col_label is not None:
775 | col_label = str(col_label)
776 | row_label = row_label or ''
777 | col_label = col_label or ''
778 | if row_label and col_label:
779 | title = row_label + ' - ' + col_label
780 | else:
781 | title = row_label or col_label
782 | title = textwrap.fill(str(title), 30)
783 | titles.append(title)
784 | return titles
785 |
786 |
787 | class CountCommon(CommonPlot):
788 |
789 | def get_count_dict(self, normalize):
790 |         count_dict = {}
791 |         val = self.x or self.y  # the counted column (the `val` argument of dxp.count)
792 |         if isinstance(normalize, str):
793 | if normalize in (val, self.split, self.row, self.col):
794 | normalize = [normalize]
795 |
796 | if isinstance(normalize, tuple):
797 | normalize = list(normalize)
798 | elif hasattr(normalize, 'tolist'):
799 | normalize = normalize.tolist()
800 | elif not isinstance(normalize, (bool, list)):
801 | raise ValueError('`normalize` must either be `True`/`False`, one of the columns passed '
802 | 'to `val`, `split`, `row` or `col`, or a list of '
803 | 'those columns')
804 | normalize_kind = None
805 | if isinstance(normalize, list):
806 | row_col = []
807 | val_split = []
808 | for col in normalize:
809 | if col in (self.row, self.col):
810 | row_col.append(col)
811 | elif col in (val, self.split):
812 | val_split.append(col)
813 | else:
814 | raise ValueError('Columns passed to `normalize` must be the same as '
815 | ' `val`, `split`, `row` or `col`.')
816 |
817 | if row_col:
818 | all_counts = {}
819 | for grp, data in self.data.groupby(row_col):
820 | if len(row_col) == 1:
821 | grp = str(grp)
822 | else:
823 | grp = tuple(str(g) for g in grp)
824 |
825 | if val_split:
826 | normalize_kind = 'all'
827 | all_counts[grp] = data.groupby(val_split).size()
828 | else:
829 | normalize_kind = 'grid'
830 | all_counts[grp] = len(data)
831 | else:
832 | normalize_kind = 'single'
833 | all_counts = self.data.groupby(val_split).size()
834 |
835 | n = 0
836 | for key, info in self.final_data.items():
837 | columns = []
838 | vcs = []
839 | for vals, split_label, col_name, row_label, col_label in info:
840 | vcs.append(vals.value_counts())
841 | columns.append(split_label)
842 |
843 | df = pd.concat(vcs, axis=1)
844 | df.columns = columns
845 | df.index.name = vals.name
846 | if normalize_kind == 'single':
847 | if len(val_split) == 2:
848 | df = df / all_counts.unstack(self.split)
849 | elif df.index.name == all_counts.index.name:
850 | df = df.div(all_counts, axis=0)
851 | else:
852 | df = df / all_counts
853 | elif normalize_kind in ('grid', 'all'):
854 | grp = []
855 | for col in normalize:
856 | if col == self.row:
857 | grp.append(row_label)
858 | if col == self.col:
859 | grp.append(col_label)
860 |
861 | if len(grp) == 1:
862 | grp = grp[0]
863 | else:
864 | grp = tuple(grp)
865 | grp_val = all_counts[grp]
866 |
867 | if normalize_kind == 'grid':
868 | df = df / grp_val
869 | elif len(val_split) == 2:
870 | df = df / grp_val.unstack(self.split)
871 | elif df.index.name == grp_val.index.name:
872 | df = df.div(grp_val, axis=0)
873 | else:
874 | df = df / grp_val
875 |
876 | else:
877 | n += df.sum().sum()
878 | count_dict[key] = df
879 |
880 | if normalize is True:
881 | count_dict = {key: df / n for key, df in count_dict.items()}
882 |
883 | return count_dict
884 |
885 |
886 | class MPLCount(CountCommon, MPLCommon):
887 | pass
888 |
889 |
890 | class PlotlyCount(CountCommon, PlotlyCommon):
891 | pass
892 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dexplot
2 |
3 | [](https://pypi.org/project/dexplot)
4 | [](LICENSE)
5 |
6 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience.
7 |
8 | ## Goals
9 |
10 | The primary goals for dexplot are:
11 |
12 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots
13 | * Allow the user tremendous power without using matplotlib
14 |
15 |
16 | ## Installation
17 |
18 | `pip install dexplot`
19 |
20 | ## Built for long and wide data
21 |
22 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data.
23 |
24 | 
25 |
26 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price).
27 |
28 | 
29 |
30 | ## Usage
31 |
32 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature:
33 |
34 | ```python
35 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...)
36 | ```
37 |
38 | * `x` - Column name along the x-axis
39 | * `y` - Column name along the y-axis
40 | * `data` - Pandas DataFrame
41 | * `aggfunc` - String name of a pandas aggregation function, e.g. 'min', 'max', 'mean'
42 | * `split` - Column name to split data into distinct groups
43 | * `row` - Column name to split data into distinct subplots row-wise
44 | * `col` - Column name to split data into distinct subplots column-wise
45 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical.
46 |
47 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below.
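48 | For example, with the airbnb dataset loaded in the examples below, the same aggregation can be drawn either way (both calls appear again later in this README):
49 |
50 | ```python
51 | # vertical: neighborhood along the x-axis, median price along the y-axis
52 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
53 | # horizontal: neighborhood along the y-axis, median price along the x-axis
54 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
55 | ```
56 |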
48 |
49 | ## Families of plots
50 |
51 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner.
52 |
53 | * Aggregation
54 | * bar
55 | * line
56 | * scatter
57 | * count
58 | * Distribution
59 | * box
60 | * violin
61 | * hist
62 | * kde
63 |
64 | ## Comparison with Seaborn
65 |
66 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by seaborn. Below is a list of the extra features in dexplot not found in seaborn:
67 |
68 | * Ability to graph relative frequency and normalize over any number of variables
69 | * No need for multiple functions to do the same thing (far fewer public functions)
70 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot`
71 | * Pandas `groupby` methods available as strings
72 | * Ability to sort by values
73 | * Ability to sort x/y labels lexicographically
74 | * Ability to select most/least frequent groups
75 | * x/y labels are wrapped so that they don't overlap
76 | * Figure size (plus several other options) and available to change without using matplotlib
77 | * A matplotlib figure object is returned
78 |
79 | ## Examples
80 |
81 | Most of the examples below use long data.
82 |
83 | ## Aggregating plots - bar, line and scatter
84 |
85 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area.
86 |
87 |
88 | ```python
89 | import dexplot as dxp
90 | import pandas as pd
91 | airbnb = dxp.load_dataset('airbnb')
92 | airbnb.head()
93 | ```
94 |
95 |
96 |
97 | | | neighborhood | property_type | accommodates | bathrooms | bedrooms | price | cleaning_fee | rating | superhost | response_time | latitude | longitude |
98 | |--:|:-------------|:--------------|-------------:|----------:|---------:|------:|-------------:|-------:|:----------|:--------------|---------:|----------:|
99 | | 0 | Shaw | Townhouse | 16 | 3.5 | 4 | 433 | 250 | 95.0 | No | within an hour | 38.90982 | -77.02016 |
100 | | 1 | Brightwood Park | Townhouse | 4 | 3.5 | 4 | 154 | 50 | 97.0 | No | NaN | 38.95888 | -77.02554 |
101 | | 2 | Capitol Hill | House | 2 | 1.5 | 1 | 83 | 35 | 97.0 | Yes | within an hour | 38.88791 | -76.99668 |
102 | | 3 | Shaw | House | 2 | 2.5 | 1 | 475 | 0 | 98.0 | No | NaN | 38.91331 | -77.02436 |
103 | | 4 | Kalorama Heights | Apartment | 3 | 1.0 | 1 | 118 | 15 | 91.0 | No | within an hour | 38.91933 | -77.04124 |
193 |
194 |
195 |
196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data.
197 |
198 |
199 | ```python
200 | airbnb.shape
201 | ```
202 |
203 |
204 |
205 |
206 | (4581, 12)
207 |
208 |
209 |
210 | ### Vertical bar charts
211 |
212 | In order to perform an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the tick labels automatically wrap.
213 |
214 |
215 | ```python
216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
217 | ```
218 |
219 |
220 |
221 |
222 | 
223 |
224 |
225 |
226 | Line and scatter plots can be created with the same command, just substituting the name of the function. Neither is a good choice for this visualization, since the grouping variable (neighborhood) has no meaningful order.
227 |
228 |
229 | ```python
230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
231 | ```
232 |
233 |
234 |
235 |
236 | 
237 |
238 |
239 |
240 |
241 | ```python
242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
243 | ```
244 |
245 |
246 |
247 |
248 | 
249 |
250 |
251 |
252 | ### Components of the groupby aggregation
253 |
254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components:
255 |
256 | * Grouping column - unique values of this column form independent groups (neighborhood)
257 | * Aggregating column - the column that will get summarized with a single value (price)
258 | * Aggregating function - a function that returns a single value (median)
259 |
260 | The general format for doing this in pandas is:
261 |
262 | ```python
263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'})
264 | ```
265 |
266 | Specifically, the following code is executed within dexplot.
267 |
268 |
269 | ```python
270 | airbnb.groupby('neighborhood').agg({'price': 'median'})
271 | ```
272 |
273 |
274 |
275 | | neighborhood | price |
276 | |:-------------|------:|
277 | | Brightwood Park | 87.0 |
278 | | Capitol Hill | 129.5 |
279 | | Columbia Heights | 95.0 |
280 | | Dupont Circle | 125.0 |
281 | | Edgewood | 100.0 |
282 | | Kalorama Heights | 118.0 |
283 | | Shaw | 133.5 |
284 | | Union Station | 120.0 |
324 |
325 |
326 |
327 | ### Number and percent of missing values with `'countna'` and `'percna'`
328 |
329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group.
330 |
331 |
332 | ```python
333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna')
334 | ```
335 |
336 |
337 |
338 |
339 | 
340 |
341 |
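342 | Internally, `'countna'` simply maps to a function that counts missing values per group (see `get_aggfunc` in `_common_plot.py`). A rough pandas equivalent of the plot above:
343 |
344 | ```python
345 | airbnb.groupby('neighborhood')['response_time'].agg(lambda s: s.isna().sum())
346 | ```
347 |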
342 |
343 | ### Sorting the bars by values
344 |
345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value.
346 |
347 | * None - sort x/y axis labels alphabetically (default)
348 | * `asc` - sort values from least to greatest
349 | * `desc` - sort values from greatest to least
350 |
351 |
352 | ```python
353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
354 | ```
355 |
356 |
357 |
358 |
359 | 
360 |
361 |
362 |
363 | Here, we sort the values from greatest to least.
364 |
365 |
366 | ```python
367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
368 | ```
369 |
370 |
371 |
372 |
373 | 
374 |
375 |
376 |
377 | ### Specify order with `x_order`
378 |
379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars.
380 |
381 |
382 | ```python
383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
385 | ```
386 |
387 |
388 |
389 |
390 | 
391 |
392 |
393 |
394 | `x_order` and all of the other `_order` parameters default to `'asc'`, which orders the values alphabetically. Use the string `'desc'` to sort in the opposite direction.
395 |
396 |
397 | ```python
398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
399 | ```
400 |
401 |
402 |
403 |
404 | 
405 |
406 |
407 |
408 | ### Filter for the neighborhoods with most/least frequency of occurrence
409 |
410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column.
411 |
412 |
413 | ```python
414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
415 | x_order='top 4')
416 | ```
417 |
418 |
419 |
420 |
421 | 
422 |
423 |
424 |
425 | We can verify that these four neighborhoods are the most common.
426 |
427 |
428 | ```python
429 | airbnb['neighborhood'].value_counts()
430 | ```
431 |
432 |
433 |
434 |
435 | Columbia Heights 773
436 | Union Station 713
437 | Capitol Hill 654
438 | Edgewood 610
439 | Dupont Circle 549
440 | Shaw 514
441 | Brightwood Park 406
442 | Kalorama Heights 362
443 | Name: neighborhood, dtype: int64
444 |
445 |
446 |
447 | ### Horizontal bars
448 |
449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis.
450 |
451 |
452 | ```python
453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median',
454 | orientation='h', sort_values='desc')
455 | ```
456 |
457 |
458 |
459 |
460 | 
461 |
462 |
463 |
464 | Switching orientation is possible for most other plots.
465 |
466 |
467 | ```python
468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
469 | ```
470 |
471 |
472 |
473 |
474 | 
475 |
476 |
477 |
478 | ### Split bars into groups
479 |
480 | You can split each bar into further groups by setting the `split` parameter to another column.
481 |
482 |
483 | ```python
484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
485 | ```
486 |
487 |
488 |
489 |
490 | 
491 |
492 |
493 |
494 | We can use the `pivot_table` method to verify the results in pandas.
495 |
496 |
497 | ```python
498 | airbnb.pivot_table(index='superhost', columns='neighborhood',
499 | values='price', aggfunc='median')
500 | ```
501 |
502 |
503 |
504 | | superhost | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
505 | |:----------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
506 | | No | 85.0 | 129.0 | 90.5 | 120.0 | 100.0 | 110.0 | 130.0 | 120.0 |
507 | | Yes | 90.0 | 130.0 | 103.0 | 135.0 | 100.0 | 124.0 | 135.0 | 125.0 |
558 |
559 |
560 | Set the order of the unique split values with `split_order`, which can also act as a filter.
561 |
562 |
563 | ```python
564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
565 | split='superhost', split_order=['Yes', 'No'])
566 | ```
567 |
568 |
569 |
570 |
571 | 
572 |
573 |
574 |
575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite.
576 |
577 |
578 | ```python
579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
580 | split='property_type', split_order='desc')
581 | ```
582 |
583 |
584 |
585 |
586 | 
587 |
588 |
589 |
590 | Filtering for the most/least frequent split categories is possible.
591 |
592 |
593 | ```python
594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
595 | split='property_type', split_order='bottom 2')
596 | ```
597 |
598 |
599 |
600 |
601 | 
602 |
603 |
604 |
605 | Verifying that the least frequent property types are Townhouse and Condominium.
606 |
607 |
608 | ```python
609 | airbnb['property_type'].value_counts()
610 | ```
611 |
612 |
613 |
614 |
615 | Apartment 2403
616 | House 877
617 | Townhouse 824
618 | Condominium 477
619 | Name: property_type, dtype: int64
620 |
621 |
622 |
623 | ### Stacked bar charts
624 |
625 | Stack all the split groups one on top of the other by setting `stacked` to `True`.
626 |
627 |
628 | ```python
629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
630 | split='superhost', split_order=['Yes', 'No'], stacked=True)
631 | ```
632 |
633 |
634 |
635 |
636 | 
637 |
638 |
639 |
640 | ### Split into multiple plots
641 |
642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot.
643 |
644 |
645 | ```python
646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
647 | split='superhost', col='property_type')
648 | ```
649 |
650 |
651 |
652 |
653 | 
654 |
655 |
656 |
657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer to set the maximum number of plots per row/col. We also specify the `col_order` to be descending alphabetically.
658 |
659 |
660 | ```python
661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
662 | split='superhost', col='property_type', wrap=2, col_order='desc')
663 | ```
664 |
665 |
666 |
667 |
668 | 
669 |
670 |
671 |
672 | Use `col_order` to both filter and set a specific order for the plots.
673 |
674 |
675 | ```python
676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
677 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
678 | ```
679 |
680 |
681 |
682 |
683 | 
684 |
685 |
686 |
687 | Splits can be made simultaneously along rows and columns.
688 |
689 |
690 | ```python
691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
693 | row='bedrooms', row_order=[1, 2, 3])
694 | ```
695 |
696 |
697 |
698 |
699 | 
700 |
701 |
702 |
703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`.
704 |
705 |
706 | ```python
707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
709 | row='bedrooms', row_order=[1, 2, 3], sharey=False)
710 | ```
711 |
712 |
713 |
714 |
715 | 
716 |
717 |
718 |
719 | ### Set the width of each bar with `size`
720 |
721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value.
722 |
723 |
724 | ```python
725 | dxp.bar(x='neighborhood', y='price', data=airbnb,
726 | aggfunc='median', split='property_type',
727 | split_order=['Apartment', 'House'],
728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
729 | ```
730 |
731 |
732 |
733 |
734 | 
735 |
736 |
737 |
738 | ### Splitting line plots
739 |
740 | All the other aggregating plots work similarly.
741 |
742 |
743 | ```python
744 | dxp.line(x='neighborhood', y='price', data=airbnb,
745 | aggfunc='median', split='property_type',
746 | split_order=['Apartment', 'House'],
747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'])
748 | ```
749 |
750 |
751 |
752 |
753 | 
754 |
755 |
756 |
757 | ## Distribution plots - box, violin, histogram, kde
758 |
759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed.
760 |
761 | ### Box plots
762 |
763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers extend to the furthest points within 1.5 times the interquartile range (IQR, the difference between the third and first quartiles) beyond the box. Fliers are points outside this range and are plotted individually. By default, both box and violin plots are plotted horizontally.
764 |
765 |
766 | ```python
767 | dxp.box(x='price', y='neighborhood', data=airbnb)
768 | ```
769 |
770 |
771 |
772 |
773 | 
774 |
775 |
776 |
777 | Split the groups in the same manner as with the aggregation plots.
778 |
779 |
780 | ```python
781 | dxp.box(x='price', y='neighborhood', data=airbnb,
782 | split='superhost', split_order=['Yes', 'No'])
783 | ```
784 |
785 |
786 |
787 |
788 | 
789 |
790 |
791 |
792 | Order the appearance of the splits alphabetically (in descending order here).
793 |
794 |
795 | ```python
796 | dxp.box(x='price', y='neighborhood', data=airbnb,
797 | split='property_type', split_order='desc')
798 | ```
799 |
800 |
801 |
802 |
803 | 
804 |
805 |
806 |
807 | ### Filter range of values with `x_order`
808 |
809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`.
810 |
811 |
812 | ```python
813 | dxp.box(x='price', y='neighborhood', data=airbnb,
814 | split='superhost', x_order=[50, 250])
815 | ```
816 |
817 |
818 |
819 |
820 | 
821 |
822 |
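823 | Under the hood this is a simple range filter on that column (a two-item list becomes a `between` filter in `filter_data`); a sketch of the equivalent pandas filtering:
824 |
825 | ```python
826 | airbnb[airbnb['price'].between(50, 250)]
827 | ```
828 |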
823 |
824 | Change the `x` and `y` while setting `orientation` to make vertical box plots.
825 |
826 |
827 | ```python
828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v',
829 | split='property_type', split_order='top 2')
830 | ```
831 |
832 |
833 |
834 |
835 | 
836 |
837 |
838 |
839 | Violin plots work identically to box plots, but show "violins": kernel density estimates mirrored on either side of a center line.
840 |
841 |
842 | ```python
843 | dxp.violin(x='price', y='neighborhood', data=airbnb,
844 | split='superhost', split_order=['Yes', 'No'])
845 | ```
846 |
847 |
848 |
849 |
850 | 
851 |
852 |
853 |
854 | Splitting by rows and columns is possible as well with distribution plots.
855 |
856 |
857 | ```python
858 | dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost',
859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
860 | row='bedrooms', row_order=[1, 2])
861 | ```
862 |
863 |
864 |
865 |
866 | 
867 |
868 |
869 |
870 | ### Histograms
871 |
872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you pass a single numeric column to `val`. A vertical histogram of the counts with 20 bins is created by default.
873 |
874 |
875 | ```python
876 | dxp.hist(val='price', data=airbnb)
877 | ```
878 |
879 |
880 |
881 |
882 | 
883 |
884 |
885 |
886 | We can use `split` just like we did above and also create horizontal histograms.
887 |
888 |
889 | ```python
890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15)
891 | ```
892 |
893 |
894 |
895 |
896 | 
897 |
898 |
899 |
900 | Here, we customize the histogram to plot the cumulative density instead of the raw frequency count, drawing only the outline of the bars with `histtype='step'`.
901 |
902 |
903 | ```python
904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3],
905 | bins=30, density=True, histtype='step', cumulative=True)
906 | ```
907 |
908 |
909 |
910 |
911 | 
912 |
913 |
914 |
915 | ### KDE Plots
916 |
917 | Kernel density estimates provide an estimate of the probability distribution of a continuous variable. Here, we examine how price is distributed by number of bedrooms.
918 |
919 |
920 | ```python
921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3])
922 | ```
923 |
924 |
925 |
926 |
927 | 
928 |
929 |
930 |
931 | Graph the cumulative distribution instead, spread across multiple plots with `col` and `wrap`.
932 |
933 |
934 | ```python
935 | dxp.kde(x='price', data=airbnb, split='bedrooms',
936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2)
937 | ```
938 |
939 |
940 |
941 |
942 | 
943 |
944 |
945 |
946 | ### Two-dimensional KDEs
947 |
948 | Provide two numeric columns to `x` and `y` to get a two-dimensional KDE.
949 |
950 |
951 | ```python
952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb)
953 | ```
954 |
955 |
956 |
957 |
958 | 
959 |
960 |
961 |
962 | Create a grid of two-dimensional KDEs.
963 |
964 |
965 | ```python
966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3)
967 | ```
968 |
969 |
970 |
971 |
972 | 
973 |
974 |
975 |
976 | ## Count plots
977 |
978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order.
979 |
980 |
981 | ```python
982 | dxp.count(val='neighborhood', data=airbnb)
983 | ```
984 |
985 |
986 |
987 |
988 | 
989 |
990 |
991 |
992 | In pandas, this is a straightforward call to the `value_counts` method.
993 |
994 |
995 | ```python
996 | airbnb['neighborhood'].value_counts()
997 | ```
998 |
999 |
1000 |
1001 |
1002 |     Columbia Heights    773
1003 |     Union Station       713
1004 |     Capitol Hill        654
1005 |     Edgewood            610
1006 |     Dupont Circle       549
1007 |     Shaw                514
1008 |     Brightwood Park     406
1009 |     Kalorama Heights    362
1010 |     Name: neighborhood, dtype: int64
1011 |
1012 |
1013 |
1014 | ### Relative frequency with `normalize`
1015 |
1016 | Instead of the raw counts, get the relative frequency by setting `normalize` to `True`.
1017 |
1018 |
1019 | ```python
1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True)
1021 | ```
1022 |
1023 |
1024 |
1025 |
1026 | 
1027 |
1028 |
1029 |
1030 | Here, we split by property type.
1031 |
1032 |
1033 | ```python
1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type')
1035 | ```
1036 |
1037 |
1038 |
1039 |
1040 | 
1041 |
1042 |
1043 |
1044 | In pandas, this is done with the `crosstab` function.
1045 |
1046 |
1047 | ```python
1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'])
1049 | ```
1050 |
1051 |
1052 |
1053 |
1054 |
1055 |
1056 |
1057 |
1058 | | property_type | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
1059 | |:--------------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
1060 | | Apartment     | 167             | 299          | 374              | 397           | 244      | 284              | 315  | 323           |
1061 | | Condominium   | 35              | 70           | 97               | 62            | 65       | 42               | 52   | 54            |
1062 | | House         | 131             | 137          | 157              | 47            | 146      | 23               | 61   | 175           |
1063 | | Townhouse     | 73              | 148          | 145              | 43            | 155      | 13               | 86   | 161           |
1124 |
1125 |
1126 |
1127 |
1128 |
1129 |
1130 |
1131 | Create horizontal stacked count plots, faceted into columns by `superhost`.
1132 |
1133 |
1134 | ```python
1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type',
1136 | orientation='h', stacked=True, col='superhost')
1137 | ```
1138 |
1139 |
1140 |
1141 |
1142 | 
1143 |
1144 |
1145 |
1146 | ### Normalize over different variables
1147 |
1148 | Setting `normalize` to `True` returns the relative frequency with respect to all of the data. You can also normalize over any of the variables provided.
1149 |
1150 |
1151 | ```python
1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood',
1153 | title='Relative Frequency by Neighborhood')
1154 | ```
1155 |
1156 |
1157 |
1158 |
1159 | 
1160 |
1161 |
1162 |
1163 | Normalize over several variables at once with a list.
1164 |
1165 |
1166 | ```python
1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost',
1168 | row='property_type', col='bedrooms', col_order=[1, 2],
1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True)
1170 | ```
1171 |
1172 |
1173 |
1174 |
1175 | 
1176 |
1177 |
1178 |
1179 | ## Wide data
1180 |
1181 | Dexplot can also plot wide data, or data where no aggregation happens. Here is a scatter plot of the location of each listing.
1182 |
1183 |
1184 | ```python
1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb,
1186 | split='neighborhood', col='bedrooms', col_order=[2, 3])
1187 | ```
1188 |
1189 |
1190 |
1191 |
1192 | 
1193 |
1194 |
1195 |
1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`.
1197 |
1198 |
1199 | ```python
1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type',
1201 | values='price', aggfunc='mean')
1202 | df
1203 | ```
1204 |
1205 |
1206 |
1207 |
1208 |
1209 |
1210 |
1211 |
1212 |
1213 | | neighborhood     | Apartment  | Condominium | House      | Townhouse  |
1214 | |:-----------------|-----------:|------------:|-----------:|-----------:|
1215 | | Brightwood Park  | 96.119760  | 105.000000  | 121.671756 | 133.479452 |
1216 | | Capitol Hill     | 141.210702 | 104.200000  | 170.153285 | 184.459459 |
1217 | | Columbia Heights | 114.676471 | 126.773196  | 135.292994 | 124.358621 |
1218 | | Dupont Circle    | 146.858942 | 130.709677  | 179.574468 | 139.348837 |
1219 | | Edgewood         | 108.508197 | 112.846154  | 156.335616 | 147.503226 |
1220 | | Kalorama Heights | 122.542254 | 155.928571  | 92.695652  | 158.230769 |
1221 | | Shaw             | 153.888889 | 158.500000  | 202.114754 | 173.279070 |
1222 | | Union Station    | 128.458204 | 133.833333  | 162.748571 | 162.167702 |
1283 |
1284 |
1285 |
1286 |
1287 |
1288 |
1289 |
1290 |
1291 | ```python
1292 | dxp.bar(data=df, orientation='h')
1293 | ```
1294 |
1295 |
1296 |
1297 |
1298 | 
1299 |
1300 |
1301 |
1302 | ### Time series
1303 | 
1304 | Time series data is another form of wide data. Read it in with the dates as the index and pass the DataFrame directly to the plotting function, just as above.
1305 | ```python
1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date')
1307 | stocks.head()
1308 | ```
1309 |
1310 |
1311 |
1312 |
1313 |
1314 |
1315 |
1316 |
1317 | | date       | MSFT  | AAPL | SLB   | AMZN  | TSLA | XOM   | WMT   | T     | FB  | V   |
1318 | |:-----------|------:|-----:|------:|------:|-----:|------:|------:|------:|----:|----:|
1319 | | 1999-10-25 | 29.84 | 2.32 | 17.02 | 82.75 | NaN  | 21.45 | 38.99 | 16.78 | NaN | NaN |
1320 | | 1999-10-26 | 29.82 | 2.34 | 16.65 | 81.25 | NaN  | 20.89 | 37.11 | 17.28 | NaN | NaN |
1321 | | 1999-10-27 | 29.33 | 2.38 | 16.52 | 75.94 | NaN  | 20.80 | 36.94 | 18.27 | NaN | NaN |
1322 | | 1999-10-28 | 29.01 | 2.43 | 16.59 | 71.00 | NaN  | 21.19 | 38.85 | 19.79 | NaN | NaN |
1323 | | 1999-10-29 | 29.88 | 2.50 | 17.21 | 70.62 | NaN  | 21.47 | 39.25 | 20.00 | NaN | NaN |
1408 |
1409 |
1410 |
1411 |
1412 |
1413 |
1414 |
1415 |
1416 | ```python
1417 | dxp.line(data=stocks.head(500))
1418 | ```
1419 |
1420 |
1421 |
1422 |
1423 | 
1424 |
1425 |
1426 |
--------------------------------------------------------------------------------