├── tests
│   ├── __init__.py
│   ├── test_box.py
│   ├── test_line.py
│   ├── test_bar.py
│   └── test_scatter.py
├── MANIFEST.in
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       └── python-package.yml
├── dexplot
│   ├── colors
│   │   ├── __init__.py
│   │   ├── _categories.py
│   │   └── _app.py
│   ├── __init__.py
│   ├── _pandas_accessor.py
│   ├── _utils.py
│   ├── _heat.py
│   ├── _plotly.py
│   ├── _plots.py
│   └── _common_plot.py
├── Upcoming Features.md
├── docs
│   ├── css
│   │   └── style.css
│   ├── overrides
│   │   └── main.html
│   └── index.md
├── setup.py
├── mkdocs.yml
├── LICENSE
├── .gitignore
├── notebooks
│   └── colormaps.ipynb
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | custom: ['https://dunderdata.com']
2 |
--------------------------------------------------------------------------------
/dexplot/colors/__init__.py:
--------------------------------------------------------------------------------
1 | from ._categories import sequential, diverging, cyclic, qualitative, misc, all_cmaps
2 |
3 | import importlib.util
4 | if importlib.util.find_spec('ipywidgets') and importlib.util.find_spec('IPython'):
5 | from ._app import color_viewer
--------------------------------------------------------------------------------
/dexplot/__init__.py:
--------------------------------------------------------------------------------
1 | from ._plots import line, bar, box, scatter, violin, hist, count, kde
2 | from ._utils import load_dataset
3 | from ._plotly import bar_plotly, line_plotly, scatter_plotly, count_plotly, box_plotly, violin_plotly
4 | from . import colors
5 | from ._pandas_accessor import _DexplotAccessor
6 |
7 | __version__ = '0.1.4'
8 |
--------------------------------------------------------------------------------
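
The imports above define the package's entire public surface. A minimal usage sketch, assuming the bundled `airbnb` dataset (the one exercised throughout the test suite) can be downloaded:

    import dexplot as dxp

    # load_dataset fetches a CSV from the dexplot GitHub repository
    airbnb = dxp.load_dataset('airbnb')

    # the tests treat the return value of each plotting function as a matplotlib Figure
    fig = dxp.box(x='price', y='neighborhood', data=airbnb)
    fig.savefig('price_by_neighborhood.png')
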
/Upcoming Features.md:
--------------------------------------------------------------------------------
1 | ## Upcoming Features
2 |
3 | * allow user access to entire dataframe in custom aggfunc
4 | * templates for x and y labels and titles
5 | * color picker with ipywidgets
6 | * ipywidgets full app integration
7 | * add other generic kwargs, ec, lw, alpha, etc...
8 | * [ ] kde with annotations, allow for binning
9 | * [ ] scatter with kde
10 | * [ ] allow kde and histograms to be grouped
11 | * [ ] use a categorical variable to size scatter plot
12 | * [ ] allow user to specify a specific matplotlib axes
13 | * [ ] add interaction with ipywidgets
14 | * [ ] stacked area plot
15 | * [ ] rolling averages for line plots
16 | * [ ] add parameter `bins` to bin numeric x
17 | * [ ] option to add counts to all aggregate plots
18 |
19 | ## Other plots
20 |
21 | * heat
22 | * hexplot
23 | * mosaic
--------------------------------------------------------------------------------
/docs/css/style.css:
--------------------------------------------------------------------------------
1 | table {
2 | background-color: transparent;
3 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
4 | margin-left:0;
5 | margin-right:0;
6 | border:none;
7 | border-collapse: collapse;
8 | border-spacing:0;
9 | color:black;
10 | font-size:13px;
11 | table-layout:fixed;
12 | overflow: scroll;
13 | }
14 | thead {
15 | border-bottom:1px solid black;vertical-align:bottom;
16 | }
17 | tr, th, td {
18 | text-align:right;
19 | vertical-align: middle;
20 | padding:0.5em 0.5em;
21 | line-height:normal;
22 | white-space:normal;
23 | max-width:none;
24 | border:none;
25 | }
26 | th {
27 | font-weight:bold;
28 | text-align:left;
29 | }
30 | tbody tr:nth-child(odd){
31 | background:#f5f5f5;
32 | }
33 | :link{
34 | text-decoration:underline;
35 | }
36 |
37 | .vid {
38 | display: flex;
39 | justify-content: center;
40 | }
41 | .vid video {
42 | width: 85%;
43 | }
44 |
45 | .dataframe {
46 | overflow: scroll;
47 | }
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | matrix:
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 | python-version: [3.6, 3.7, 3.8]
20 |
21 | steps:
22 | - uses: actions/checkout@v2
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v2
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install pytest matplotlib pandas scipy plotly
32 | - name: Test with pytest
33 | run: pytest
34 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open('dexplot/__init__.py', 'r') as f:
4 | for line in f:
5 | if line.startswith('__version__'):
6 | version = line.split("'")[1]
7 |
8 | with open("README.md", "r") as fh:
9 | long_description = fh.read()
10 |
11 | setuptools.setup(
12 | name="dexplot",
13 | version=version,
14 | author="Ted Petrou",
15 | author_email="petrou.theodore@gmail.com",
16 | description="Powerful and intuitive data visualization library using matplotlib for both long and wide data",
17 | long_description=long_description,
18 | long_description_content_type="text/markdown",
19 | keywords="data visualization matplotlib pandas",
20 | url="https://github.com/dexplo/dexplot",
21 | packages=setuptools.find_packages(),
22 | classifiers=[
23 | "Programming Language :: Python :: 3",
24 | "License :: OSI Approved :: BSD License",
25 | "Operating System :: OS Independent",
26 | "Framework :: Matplotlib"
27 | ],
28 | install_requires=['numpy>=1.15',
29 |                       'scipy>=1.0',
30 | 'matplotlib>=3.1',
31 | 'pandas>=0.24'],
32 | extras_require={
33 | "apps": ["ipywidgets"],
34 | },
35 | python_requires='>=3.6'
36 | )
--------------------------------------------------------------------------------
/dexplot/_pandas_accessor.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pandas as pd
4 |
5 | from . import _plots as plots
6 |
7 | def get_doc(func):
8 | doc = func.__doc__
9 | return re.sub('data :.*(?=split :)', '', doc, count=1, flags=re.S)
10 |
11 |
12 | @pd.api.extensions.register_dataframe_accessor("dexplot")
13 | class _DexplotAccessor:
14 | def __init__(self, pandas_obj):
15 | self._obj = pandas_obj
16 |
17 | def box(self, x=None, y=None, split=None, row=None, col=None, x_order=None,
18 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
19 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
20 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
21 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
22 | groupgap=0, box_kwargs=None):
23 | return plots.box(x, y, self._obj, split, row, col, x_order, y_order, split_order,
24 | row_order, col_order, orientation, wrap, figsize, title, sharex,
25 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
26 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap, box_kwargs)
27 |
28 | _DexplotAccessor.box.__doc__ = get_doc(plots.box)
--------------------------------------------------------------------------------
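
Because the accessor is registered under the name "dexplot", importing the package gives every DataFrame a `.dexplot` namespace whose `box` method forwards to `dexplot._plots.box` with the DataFrame passed as `data`. A short sketch, reusing column names from the airbnb dataset in the tests:

    import dexplot as dxp

    airbnb = dxp.load_dataset('airbnb')

    # equivalent to dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost')
    fig = airbnb.dexplot.box(x='price', y='neighborhood', split='superhost')
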
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Dexplot
2 | site_description: Dexplot is a powerful and intuitive Python data visualization library using matplotlib for both long and wide data
3 | site_author: Ted Petrou
4 | site_url: https://www.dexplo.org/dexplot
5 | repo_url: https://github.com/dexplo/dexplot
6 | copyright: Copyright © 2020 Ted Petrou
7 | google_analytics:
8 | - UA-119777567-7
9 | - dexplo.org
10 | theme:
11 | name: material
12 | custom_dir: docs/overrides
13 | features:
14 | - tabs
15 |
16 | nav:
17 | - Home: index.md
18 | - More Dexplo Libraries:
19 | - Dexplo: https://www.dexplo.org
20 |
21 | extra_css:
22 | - css/style.css
23 |
24 | extra:
25 | social:
26 | - icon: fontawesome/brands/github-alt
27 | link: https://github.com/dexplo
28 | - icon: fontawesome/brands/twitter
29 | link: https://twitter.com/TedPetrou
30 | - icon: fontawesome/brands/linkedin
31 | link: https://linkedin.com/in/TedPetrou
32 | - icon: fontawesome/brands/youtube
33 | link: https://www.youtube.com/c/dunderdata
34 | - icon: fontawesome/brands/facebook
35 | link: https://www.facebook.com/dunderdata
36 |
37 | markdown_extensions:
38 | - admonition
39 | - toc:
40 | permalink: True
41 | - codehilite:
42 | guess_lang: false
43 | - pymdownx.superfences
44 |
45 | plugins:
46 | - search
47 | - macros
48 | - minify:
49 | minify_html: true
50 |
51 | extra_javascript:
52 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, dexplo
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/dexplot/_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from scipy.stats import gaussian_kde
4 |
5 | RAW_URL = 'https://raw.githubusercontent.com/dexplo/dexplot/master/data/{name}.csv'
6 | DATASETS = ['airbnb']
7 |
8 | def load_dataset(name):
9 | """
10 | Load a dataset. Must be connected to the internet
11 |
12 | Datasets
13 | --------
14 | airbnb
15 | """
16 | if name not in DATASETS:
17 | raise KeyError(f'Dataset {name} does not exist. Choose one of the following: {DATASETS}')
18 |
19 | url = RAW_URL.format(name=name)
20 | return pd.read_csv(url)
21 |
22 |
23 | def calculate_density_1d(data, cumulative=False):
24 | density_func = gaussian_kde(data)
25 | min_x, max_x = data.min(), data.max()
26 | range_x = max_x - min_x
27 | min_x = min_x - 2 * range_x
28 | max_x = max_x + 2 * range_x
29 | x = np.linspace(min_x, max_x, 400)
30 | density = density_func(x)
31 | max_density = density.max()
32 | filt = density > max_density / 1000
33 | x = x[filt]
34 | density = density[filt]
35 | if cumulative:
36 | density = np.cumsum(density)
37 | density = 1 / density.max() * density
38 | return x, density
39 |
40 | def calculate_density_2d(x, y):
41 | xmin, xmax = x.min(), x.max()
42 | ymin, ymax = y.min(), y.max()
43 | X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
44 | positions = np.vstack([X.ravel(), Y.ravel()])
45 | values = np.vstack([x, y])
46 | kernel = gaussian_kde(values)
47 | Z = np.reshape(kernel(positions).T, X.shape)
48 | return xmin, xmax, ymin, ymax, np.rot90(Z)
49 |
50 |
51 |
--------------------------------------------------------------------------------
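
A sketch of how the two density helpers can be exercised on their own; `_utils` is a private module, so the direct import below is for illustration only and uses synthetic data rather than any dexplot API:

    import numpy as np
    import matplotlib.pyplot as plt
    from dexplot._utils import calculate_density_1d, calculate_density_2d

    rng = np.random.default_rng(0)
    a = rng.normal(size=500)
    b = a + rng.normal(scale=0.5, size=500)

    # 1D KDE: an x grid and the (optionally cumulative) density estimate
    x, density = calculate_density_1d(a)
    plt.plot(x, density)

    # 2D KDE: bounding box plus a rotated 100x100 density grid ready for imshow
    xmin, xmax, ymin, ymax, Z = calculate_density_2d(a, b)
    plt.figure()
    plt.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto')
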
/docs/overrides/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block extrahead %}
4 |
5 | {% if page and page.meta and page.meta.title %}
6 |
7 | {% elif page and page.title and not page.is_homepage %}
8 |
9 | {% else %}
10 |
11 | {% endif %}
12 |
13 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | {% if page and page.meta and page.meta.title %}
23 |
24 | {% elif page and page.title and not page.is_homepage %}
25 |
26 | {% else %}
27 |
28 | {% endif %}
29 |
30 |
32 | {% endblock %}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | .DS_Store
10 | .idea
11 | docs/images
12 | notebooks/
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .env
91 | .venv
92 | env/
93 | venv/
94 | ENV/
95 | env.bak/
96 | venv.bak/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 |
--------------------------------------------------------------------------------
/notebooks/colormaps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import plotly"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "p_sequential = [k.lower() for k, v in vars(plotly.colors.sequential).items() if isinstance(v, list) \n",
19 | " and not k.startswith('_')]\n",
20 | "p_diverging = [k.lower() for k, v in vars(plotly.colors.diverging).items() if isinstance(v, list) \n",
21 | " and not k.startswith('_')]\n",
22 | "p_cyclic = [k.lower() for k, v in vars(plotly.colors.cyclical).items() if isinstance(v, list) \n",
23 | " and not k.startswith('_')]\n",
24 | "p_qual = [k.lower() for k, v in vars(plotly.colors.qualitative).items() if isinstance(v, list) \n",
25 | " and not k.startswith('_')]\n",
26 | "p_qual += ['dark12', 'dark12_r']"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "mpl_sequential = ['viridis', 'plasma', 'inferno', 'magma', 'cividis',\n",
36 | " 'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',\n",
37 | " 'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',\n",
38 | " 'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn',\n",
39 | " 'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',\n",
40 | " 'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',\n",
41 | " 'hot', 'afmhot', 'gist_heat', 'copper'] \n",
42 | "mpl_diverging = ['PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',\n",
43 | " 'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']\n",
44 | "mpl_cyclic = ['twilight', 'twilight_shifted', 'hsv']\n",
45 | "mpl_qual = ['Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3',\n",
46 | " 'tab10', 'tab20', 'tab20b', 'tab20c']\n",
47 | "mpl_misc = ['flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',\n",
48 | " 'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',\n",
49 | " 'gist_rainbow', 'rainbow', 'jet', 'nipy_spectral', 'gist_ncar']\n",
50 | "\n",
51 | "def double(colors):\n",
52 | " a = []\n",
53 | " for color in colors:\n",
54 | " c = color.lower()\n",
55 | " a.append(c)\n",
56 | " a.append(c + '_r')\n",
57 | " return a\n",
58 | " \n",
59 | "mpl_sequential = double(mpl_sequential)\n",
60 | "mpl_diverging = double(mpl_diverging)\n",
61 | "mpl_cyclic = double(mpl_cyclic)\n",
62 | "mpl_qual = double(mpl_qual)\n",
63 | "mpl_misc = double(mpl_misc)\n",
64 | "\n",
65 | "seq = sorted(set(mpl_sequential + p_sequential))\n",
66 | "diverging = sorted(set(mpl_diverging + p_diverging))\n",
67 | "cyclic = sorted(set(mpl_cyclic + p_cyclic))\n",
68 | "qual = sorted(set(mpl_qual + p_qual))\n",
69 | "misc = sorted(set(mpl_misc))"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "Python 3",
83 | "language": "python",
84 | "name": "python3"
85 | },
86 | "language_info": {
87 | "codemirror_mode": {
88 | "name": "ipython",
89 | "version": 3
90 | },
91 | "file_extension": ".py",
92 | "mimetype": "text/x-python",
93 | "name": "python",
94 | "nbconvert_exporter": "python",
95 | "pygments_lexer": "ipython3",
96 | "version": "3.8.3"
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 4
101 | }
102 |
--------------------------------------------------------------------------------
/tests/test_box.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 |
9 | class TestSort:
10 |
11 | def test_lex_asc(self):
12 | fig = dxp.box(x='price', y='neighborhood', data=airbnb)
13 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
14 | correct = sorted(ticklabels)
15 | assert ticklabels == correct
16 |
17 | def test_lex_desc(self):
18 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, y_order='desc')
19 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
20 | correct = sorted(ticklabels, reverse=True)
21 | assert ticklabels == correct
22 |
23 | def test_asc_values(self):
24 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='asc')
25 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
26 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
27 | values = [p.get_height() for p in fig.axes[0].patches]
28 |
29 |
30 | def test_desc_values(self):
31 | fig = dxp.box(x='price', y='neighborhood', data=airbnb, sort_values='desc')
32 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
33 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
34 |
35 |
36 | class TestOrder:
37 |
38 | def test_x_order(self):
39 | dxp.box(x='price', y='neighborhood', data=airbnb,
40 | y_order=['Dupont Circle', 'Edgewood', 'Union Station'])
41 |
42 | with pytest.raises(ValueError):
43 | dxp.box(x='price', y='neighborhood', data=airbnb,
44 | y_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
45 |
46 |
47 | class TestVertical:
48 |
49 | def test_vert(self):
50 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v')
51 |
52 |
53 | class TestSplit:
54 |
55 | def test_split(self):
56 | dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost')
57 |
58 | def test_split_order(self):
59 | dxp.box(x='price', y='neighborhood', data=airbnb,
60 | split='superhost', split_order=['Yes', 'No'])
61 |
62 | def test_stacked(self):
63 | dxp.box(x='price', y='neighborhood', data=airbnb,
64 | split='superhost', split_order=['Yes', 'No'])
65 |
66 |
67 | class TestRowCol:
68 |
69 | def test_col(self):
70 | dxp.box(x='price', y='neighborhood', data=airbnb,
71 | split='superhost', col='property_type')
72 |
73 | def test_col_wrap(self):
74 | dxp.box(x='price', y='neighborhood', data=airbnb,
75 | split='superhost', col='property_type', wrap=2)
76 |
77 | def test_col_order(self):
78 | dxp.box(x='price', y='neighborhood', data=airbnb,
79 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
80 |
81 | def test_row(self):
82 | dxp.box(x='price', y='neighborhood', data=airbnb,
83 | split='superhost', row='property_type')
84 |
85 | def test_row_order(self):
86 | dxp.box(x='price', y='neighborhood', data=airbnb,
87 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
88 |
89 | def test_row_wrap(self):
90 | dxp.box(x='price', y='neighborhood', data=airbnb,
91 | split='superhost', row='property_type', wrap=2)
92 |
93 | def test_row_col(self):
94 | dxp.box(x='price', y='neighborhood', data=airbnb,
95 | split='superhost', col='property_type',
96 | col_order=['House', 'Condominium', 'Apartment'],
97 | row='bedrooms', row_order=[0, 1, 2, 3])
98 |
99 | def test_sharex(self):
100 | dxp.box(x='price', y='neighborhood', data=airbnb,
101 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
102 | row='bedrooms', row_order=[1, 2, 3], sharex=False)
103 |
--------------------------------------------------------------------------------
/dexplot/colors/_categories.py:
--------------------------------------------------------------------------------
1 | from ._colormaps import colormaps
2 |
3 | def set_attrs(obj, cmaps):
4 | for cmap in cmaps:
5 | setattr(obj, cmap, colormaps[cmap])
6 |
7 | sequential_colormaps = [
8 | 'afmhot', 'afmhot_r', 'aggrnyl', 'aggrnyl_r', 'agsunset', 'agsunset_r', 'algae', 'algae_r',
9 | 'amp', 'amp_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'blackbody', 'blackbody_r',
10 | 'bluered', 'bluered_r', 'blues', 'blues_r', 'blugrn', 'blugrn_r', 'bluyl', 'bluyl_r',
11 | 'bone', 'bone_r', 'brwnyl', 'brwnyl_r', 'bugn', 'bugn_r', 'bupu', 'bupu_r', 'burg', 'burg_r',
12 | 'burgyl', 'burgyl_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'copper', 'copper_r',
13 | 'darkmint', 'darkmint_r', 'deep', 'deep_r', 'dense', 'dense_r', 'electric', 'electric_r',
14 | 'emrld', 'emrld_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_yarg',
15 | 'gist_yarg_r', 'gnbu', 'gnbu_r', 'gray', 'gray_r', 'greens', 'greens_r', 'greys', 'greys_r',
16 | 'haline', 'haline_r', 'hot', 'hot_r', 'ice', 'ice_r', 'inferno', 'inferno_r', 'jet', 'jet_r',
17 | 'magenta', 'magenta_r', 'magma', 'magma_r', 'matter', 'matter_r', 'mint', 'mint_r', 'oranges',
18 | 'oranges_r', 'orrd', 'orrd_r', 'oryel', 'oryel_r', 'peach', 'peach_r', 'pink', 'pink_r',
19 | 'pinkyl', 'pinkyl_r', 'plasma', 'plasma_r', 'plotly3', 'plotly3_r', 'pubu', 'pubu_r', 'pubugn',
20 | 'pubugn_r', 'purd', 'purd_r', 'purp', 'purp_r', 'purples', 'purples_r', 'purpor', 'purpor_r',
21 | 'rainbow', 'rainbow_r', 'rdbu', 'rdbu_r', 'rdpu', 'rdpu_r', 'redor', 'redor_r', 'reds',
22 | 'reds_r', 'solar', 'solar_r', 'speed', 'speed_r', 'spring', 'spring_r', 'summer', 'summer_r',
23 | 'sunset', 'sunset_r', 'sunsetdark', 'sunsetdark_r', 'teal', 'teal_r', 'tealgrn', 'tealgrn_r',
24 | 'tempo', 'tempo_r', 'thermal', 'thermal_r', 'turbid', 'turbid_r', 'viridis', 'viridis_r',
25 | 'winter', 'winter_r', 'wistia', 'wistia_r', 'ylgn', 'ylgn_r', 'ylgnbu', 'ylgnbu_r', 'ylorbr',
26 | 'ylorbr_r', 'ylorrd', 'ylorrd_r'
27 | ]
28 |
29 | diverging_colormaps = [
30 | 'armyrose', 'armyrose_r', 'balance', 'balance_r', 'brbg', 'brbg_r', 'bwr', 'bwr_r', 'coolwarm',
31 | 'coolwarm_r', 'curl', 'curl_r', 'delta', 'delta_r', 'earth', 'earth_r', 'fall', 'fall_r',
32 | 'geyser', 'geyser_r', 'picnic', 'picnic_r', 'piyg', 'piyg_r', 'portland', 'portland_r', 'prgn',
33 | 'prgn_r', 'puor', 'puor_r', 'rdbu', 'rdbu_r', 'rdgy', 'rdgy_r', 'rdylbu', 'rdylbu_r', 'rdylgn',
34 | 'rdylgn_r', 'seismic', 'seismic_r', 'spectral', 'spectral_r', 'tealrose', 'tealrose_r', 'temps',
35 | 'temps_r', 'tropic', 'tropic_r'
36 | ]
37 |
38 | cyclic_colormaps = [
39 | 'edge', 'edge_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'mrybm', 'mrybm_r', 'mygbm',
40 | 'mygbm_r', 'phase', 'phase_r', 'twilight', 'twilight_r', 'twilight_shifted',
41 | 'twilight_shifted_r'
42 | ]
43 |
44 | qualitative_colormaps = [
45 | 'accent', 'accent_r', 'alphabet', 'alphabet_r', 'antique', 'antique_r', 'bold', 'bold_r',
46 | 'd3', 'd3_r', 'dark12', 'dark12_r', 'dark2', 'dark24', 'dark24_r', 'dark2_r', 'g10', 'g10_r',
47 | 'light24', 'light24_r', 'paired', 'paired_r', 'pastel', 'pastel1', 'pastel1_r', 'pastel2',
48 | 'pastel2_r', 'pastel_r', 'plotly', 'plotly_r', 'prism', 'prism_r', 'safe', 'safe_r', 'set1',
49 | 'set1_r', 'set2', 'set2_r', 'set3', 'set3_r', 't10', 't10_r', 'tab10', 'tab10_r', 'tab20',
50 | 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'vivid', 'vivid_r'
51 | ]
52 |
53 | misc_colormaps = [
54 | 'brg', 'brg_r', 'cmrmap', 'cmrmap_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r',
55 | 'gist_earth', 'gist_earth_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r',
56 | 'gist_stern', 'gist_stern_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'jet', 'jet_r',
57 | 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'prism', 'prism_r', 'rainbow',
58 | 'rainbow_r', 'terrain', 'terrain_r'
59 | ]
60 |
61 | all_colormaps = (
62 | sequential_colormaps + diverging_colormaps + cyclic_colormaps +
63 | qualitative_colormaps + misc_colormaps
64 | )
65 |
66 | class ColorMaps:
67 | pass
68 |
69 | sequential = ColorMaps()
70 | diverging = ColorMaps()
71 | cyclic = ColorMaps()
72 | qualitative = ColorMaps()
73 | misc = ColorMaps()
74 | all_cmaps = ColorMaps()
75 |
76 | set_attrs(sequential, sequential_colormaps)
77 | set_attrs(diverging, diverging_colormaps)
78 | set_attrs(cyclic, cyclic_colormaps)
79 | set_attrs(qualitative, qualitative_colormaps)
80 | set_attrs(misc, misc_colormaps)
81 | set_attrs(all_cmaps, all_colormaps)
--------------------------------------------------------------------------------
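
set_attrs turns every colormap name into an attribute on the corresponding ColorMaps instance, so the category objects exported from dexplot.colors can be explored with plain attribute access. A small sketch (the color values themselves come from the private _colormaps module):

    import dexplot as dxp

    # each attribute holds the colors registered under that name in _colormaps
    dxp.colors.qualitative.tab10
    dxp.colors.sequential.viridis_r
    dxp.colors.diverging.coolwarm
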
/dexplot/colors/_app.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from ipywidgets import Dropdown, Image, HBox, HTML, Checkbox, interactive_output, VBox
4 | from IPython.display import display
5 | import matplotlib.pyplot as plt
6 | from matplotlib.colors import ListedColormap
7 | import numpy as np
8 |
9 | from dexplot.colors._colormaps import colormaps
10 | from dexplot.colors._categories import (qualitative_colormaps, sequential_colormaps,
11 | diverging_colormaps, cyclic_colormaps, misc_colormaps,
12 | all_colormaps)
13 |
14 | ARR = np.linspace(0, 1, 256).reshape((1, -1)).repeat(20, 0)
15 |
16 | cmap_dict = {'qualitative': qualitative_colormaps,
17 | 'sequential': sequential_colormaps,
18 | 'diverging': diverging_colormaps,
19 | 'cyclic': cyclic_colormaps,
20 | 'misc': misc_colormaps,
21 | 'all': all_colormaps}
22 |
23 | cmap_default = {'qualitative': 't10',
24 | 'sequential': 'viridis',
25 | 'diverging': 'coolwarm',
26 | 'cyclic': 'edge',
27 | 'misc': 'ocean',
28 | 'all': 'tab10'}
29 |
30 | cmap_dropdown = Dropdown(options=[('Qualitative', 'qualitative'),
31 | ('Sequential', 'sequential'),
32 | ('Diverging', 'diverging'),
33 | ('Cyclic', 'cyclic'),
34 | ('Misc', 'misc'),
35 | ('All Colormaps', 'all')],
36 | value=None,
37 | description='Colormap Category: ',
38 | style = {'description_width': 'initial'})
39 |
40 | def remove_ticks_spines(ax):
41 | ax.set_xticks([])
42 | ax.set_yticks([])
43 | for spine in ax.spines.values():
44 | spine.set_visible(False)
45 |
46 | class ColorViewer:
47 |
48 | def __init__(self):
49 | self.checked_colors = []
50 | self.test_list = []
51 | self.cbox_dict = {cat: self.cmap_checkboxes(cat) for cat in cmap_default}
52 | self.layout = self.create_layout()
53 | self.fig, self.ax = self.create_figure()
54 | self.add_interaction()
55 |
56 | def checkbox_maker(self, name, default):
57 | value = name == default
58 | c = Checkbox(value=value, description=name, disabled=False,
59 | indent=False, style={'color': 'blue'})
60 | c.observe(self.cb_handler, 'value')
61 | return c
62 |
63 | def cmap_checkboxes(self, category):
64 | rows = []
65 | row = []
66 | layout = {'justify_content': 'flex-end', 'margin': '0px'}
67 | cmaps = cmap_dict[category]
68 | default = cmap_default[category]
69 | for name in cmaps:
70 | row.append(self.checkbox_maker(name, default))
71 | if len(row) == 10:
72 | rows.append(HBox(row, layout=layout))
73 | row = []
74 | if row:
75 | rows.append(HBox(row, layout=layout))
76 | return rows
77 |
78 | def create_image(self):
79 | for image in self.ax.images:
80 | image.remove()
81 |
82 | ticks = []
83 | ticklabels = []
84 | i = 0
85 |
86 | for i, name in enumerate(self.checked_colors):
87 | cmap = ListedColormap(colormaps[name])
88 | self.ax.imshow(ARR, cmap=cmap, extent=[0, 10, i + .2, i + .8], aspect='auto')
89 | ticks.append(i + .5)
90 | ticklabels.append(name)
91 |
92 | self.ax.set_ylim(0, i + 1)
93 | self.ax.set_yticks(ticks)
94 | self.ax.set_yticklabels(ticklabels)
95 | img_bytes = io.BytesIO()
96 | self.fig.canvas.print_figure(img_bytes)
97 | img_bytes.seek(0)
98 |
99 | self.img.layout.visibility = 'visible'
100 | self.img.value = img_bytes.read()
101 |
102 | def get_checkboxes(self, category):
103 | if category is None:
104 | return
105 | self.checked_colors.clear()
106 | self.checked_colors.append(cmap_default[category])
107 | self.layout.children = list(self.layout.children[:2]) + self.cbox_dict[category]
108 | self.test_list.append('end of get_checkboxes')
109 | self.create_image()
110 |
111 | def cb_handler(self, change):
112 | name = change['owner'].description
113 | if change['new']:
114 | self.checked_colors.append(name)
115 | else:
116 | self.checked_colors.remove(name)
117 | self.create_image()
118 |
119 | def create_layout(self):
120 |         title = HTML('Color Viewer')
121 | self.img = Image(width=700, height=600)
122 | self.img.layout.visibility = 'hidden'
123 |
124 | rows = []
125 | row1 = HBox([title], layout={'justify_content': 'flex-start'})
126 | row2 = HBox([cmap_dropdown, self.img], layout={'align_items': 'center'})
127 | rows = [row1, row2]
128 |
129 | return VBox(rows)
130 |
131 | def create_figure(self):
132 | fig = plt.Figure(dpi=144, tight_layout=True, figsize=(6, 3))
133 | ax = fig.add_subplot()
134 | remove_ticks_spines(ax)
135 | return fig, ax
136 |
137 | def add_interaction(self):
138 | interactive_output(self.get_checkboxes, {'category': cmap_dropdown})
139 | cmap_dropdown.value = 'qualitative'
140 |
141 |
142 | def run(self):
143 | display(self.layout)
144 |
145 |
146 | def color_viewer():
147 | ColorViewer().run()
--------------------------------------------------------------------------------
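
color_viewer is only re-exported from dexplot.colors when ipywidgets and IPython are installed (see colors/__init__.py), and the widgets render inside a Jupyter notebook. A minimal launch sketch under those assumptions:

    import dexplot as dxp

    # builds a ColorViewer and displays the dropdown/checkbox colormap browser
    dxp.colors.color_viewer()
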
/tests/test_line.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 | class TestAgg:
9 |
10 | def test_string_name(self):
11 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
12 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='mean')
13 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='min')
14 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='max')
15 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='size')
16 |
17 | def test_function(self):
18 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.median)
19 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean)
20 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.min)
21 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.max)
22 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc=np.size)
23 |
24 |
25 | class TestSort:
26 |
27 | def test_lex_asc(self):
28 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
30 | correct = sorted(ticklabels)
31 | assert ticklabels == correct
32 |
33 | def test_lex_desc(self):
34 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
36 | correct = sorted(ticklabels, reverse=True)
37 | assert ticklabels == correct
38 |
39 | def test_asc_values(self):
40 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
43 | values = [p.get_height() for p in fig.axes[0].patches]
44 |
45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
46 | correct_labels = s.index.tolist()
47 | correct_values = s.values.tolist()
48 | assert ticklabels == correct_labels
49 |
50 | def test_desc_values(self):
51 | fig = dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
54 | values = [p.get_height() for p in fig.axes[0].patches]
55 |
56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
57 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
58 | s = df.set_index('neighborhood').squeeze()
59 | correct_labels = s.index.tolist()
60 | correct_values = s.values.tolist()
61 | assert ticklabels == correct_labels
62 |
63 |
64 | class TestOrder:
65 |
66 | def test_x_order(self):
67 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
69 |
70 | with pytest.raises(ValueError):
71 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
73 |
74 |
75 | class TestHorizontal:
76 |
77 | def test_horiz(self):
78 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
79 |
80 |
81 | class TestSplit:
82 |
83 | def test_split(self):
84 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
85 |
86 | def test_split_order(self):
87 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
88 | split='superhost', split_order=['Yes', 'No'])
89 |
90 | def test_stacked(self):
91 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
92 | split='superhost', split_order=['Yes', 'No'])
93 |
94 |
95 | class TestRowCol:
96 |
97 | def test_col(self):
98 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
99 | split='superhost', col='property_type')
100 |
101 | def test_col_wrap(self):
102 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
103 | split='superhost', col='property_type', wrap=2)
104 |
105 | def test_col_order(self):
106 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
107 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
108 |
109 | def test_row(self):
110 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
111 | split='superhost', row='property_type')
112 |
113 | def test_row_order(self):
114 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
115 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
116 |
117 | def test_row_wrap(self):
118 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
119 | split='superhost', row='property_type', wrap=2)
120 |
121 | def test_row_col(self):
122 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
123 | split='superhost', col='property_type',
124 | col_order=['House', 'Condominium', 'Apartment'],
125 | row='bedrooms', row_order=[0, 1, 2, 3])
126 |
127 | def test_sharey(self):
128 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median',
129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
131 |
--------------------------------------------------------------------------------
/tests/test_bar.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 | aggfunc = ['median', 'mean', 'min', 'max', 'size', np.median, np.mean, np.min, np.max, np.size]
8 |
9 |
10 | class TestAgg:
11 |
12 | @pytest.mark.parametrize('aggfunc', aggfunc)
13 | def test_string_name(self, aggfunc):
14 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc=aggfunc)
15 |
16 |
17 | class TestSort:
18 |
19 | def test_lex_asc(self):
20 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
21 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
22 | correct = sorted(ticklabels)
23 | assert ticklabels == correct
24 |
25 | def test_lex_desc(self):
26 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
27 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
28 | correct = sorted(ticklabels, reverse=True)
29 | assert ticklabels == correct
30 |
31 | def test_asc_values(self):
32 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
33 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
34 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
35 | values = [p.get_height() for p in fig.axes[0].patches]
36 |
37 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
38 | correct_labels = s.index.tolist()
39 | correct_values = s.values.tolist()
40 | assert ticklabels == correct_labels
41 | assert values == correct_values
42 |
43 | def test_desc_values(self):
44 | fig = dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
45 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
46 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
47 | values = [p.get_height() for p in fig.axes[0].patches]
48 |
49 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
50 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
51 | s = df.set_index('neighborhood').squeeze()
52 | correct_labels = s.index.tolist()
53 | correct_values = s.values.tolist()
54 | assert ticklabels == correct_labels
55 | assert values == correct_values
56 |
57 |
58 | class TestOrder:
59 |
60 | def test_x_order(self):
61 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
62 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
63 |
64 | with pytest.raises(ValueError):
65 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
66 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
67 |
68 | class TestHorizontal:
69 |
70 | def test_horiz(self):
71 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
72 |
73 |
74 | class TestSplit:
75 |
76 | def test_split(self):
77 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
78 |
79 | def test_split_order(self):
80 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
81 | split='superhost', split_order=['Yes', 'No'])
82 |
83 | def test_stacked(self):
84 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
85 | split='superhost', split_order=['Yes', 'No'], stacked=True)
86 |
87 | def test_errors(self):
88 | with pytest.raises(ValueError):
89 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
90 | split='property_type', split_order=['Yes', 'No'])
91 |
92 |
93 | class TestRowCol:
94 |
95 | def test_col(self):
96 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
97 | split='superhost', col='property_type')
98 |
99 | def test_col_wrap(self):
100 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
101 | split='superhost', col='property_type', wrap=2)
102 |
103 | def test_col_order(self):
104 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
105 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
106 |
107 | def test_row(self):
108 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
109 | split='superhost', row='property_type')
110 |
111 | def test_row_order(self):
112 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
113 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
114 |
115 | def test_row_wrap(self):
116 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
117 | split='superhost', row='property_type', wrap=2)
118 |
119 | def test_row_col(self):
120 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
121 | split='superhost', col='property_type',
122 | col_order=['House', 'Condominium', 'Apartment'],
123 | row='bedrooms', row_order=[0, 1, 2, 3])
124 |
125 | def test_sharey(self):
126 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
127 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
128 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
129 |
130 | class TestBarProps:
131 |
132 | def test_bar_size(self):
133 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='property_type',
134 | split_order=['Apartment', 'House'],
135 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
136 |
--------------------------------------------------------------------------------
/tests/test_scatter.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import dexplot as dxp
4 |
5 |
6 | airbnb = dxp.load_dataset('airbnb')
7 |
8 | class TestAgg:
9 |
10 | def test_string_name(self):
11 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
12 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='mean')
13 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='min')
14 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='max')
15 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='size')
16 |
17 | def test_function(self):
18 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.median)
19 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.mean)
20 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.min)
21 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.max)
22 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc=np.size)
23 |
24 |
25 | class TestSort:
26 |
27 | def test_lex_asc(self):
28 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
29 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
30 | correct = sorted(ticklabels)
31 | assert ticklabels == correct
32 |
33 | def test_lex_desc(self):
34 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='lex_desc')
35 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
36 | correct = sorted(ticklabels, reverse=True)
37 | assert ticklabels == correct
38 |
39 | def test_asc_values(self):
40 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
41 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
42 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
43 | values = [p.get_height() for p in fig.axes[0].patches]
44 |
45 | s = airbnb.groupby('neighborhood')['price'].median().sort_values()
46 | correct_labels = s.index.tolist()
47 | correct_values = s.values.tolist()
48 | assert ticklabels == correct_labels
49 |
50 | def test_desc_values(self):
51 | fig = dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
52 | ticklabels = [t.get_text() for t in fig.axes[0].get_xticklabels()]
53 | ticklabels = [label.replace('\n', ' ') for label in ticklabels]
54 | values = [p.get_height() for p in fig.axes[0].patches]
55 |
56 | df = airbnb.groupby('neighborhood').agg({'price': 'median'}).reset_index() \
57 | .sort_values(['price', 'neighborhood'], ascending=[False, True])
58 | s = df.set_index('neighborhood').squeeze()
59 | correct_labels = s.index.tolist()
60 | correct_values = s.values.tolist()
61 | assert ticklabels == correct_labels
62 |
63 |
64 | class TestOrder:
65 |
66 | def test_x_order(self):
67 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
68 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
69 |
70 | with pytest.raises(ValueError):
71 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
72 | x_order=['Dupont Circle', 'Edgewood', 'DOES NOT EXIST'])
73 |
74 |
75 | class TestHorizontal:
76 |
77 | def test_horiz(self):
78 | dxp.scatter(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
79 |
80 |
81 | class TestSplit:
82 |
83 | def test_split(self):
84 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
85 |
86 | def test_split_order(self):
87 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
88 | split='superhost', split_order=['Yes', 'No'])
89 |
90 | def test_stacked(self):
91 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
92 | split='superhost', split_order=['Yes', 'No'])
93 |
94 |
95 | class TestRowCol:
96 |
97 | def test_col(self):
98 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
99 | split='superhost', col='property_type')
100 |
101 | def test_col_wrap(self):
102 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
103 | split='superhost', col='property_type', wrap=2)
104 |
105 | def test_col_order(self):
106 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
107 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
108 |
109 | def test_row(self):
110 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
111 | split='superhost', row='property_type')
112 |
113 | def test_row_order(self):
114 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
115 | split='superhost', row='property_type', row_order=['House', 'Condominium'])
116 |
117 | def test_row_wrap(self):
118 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
119 | split='superhost', row='property_type', wrap=2)
120 |
121 | def test_row_col(self):
122 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
123 | split='superhost', col='property_type',
124 | col_order=['House', 'Condominium', 'Apartment'],
125 | row='bedrooms', row_order=[0, 1, 2, 3])
126 |
127 | def test_sharey(self):
128 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median',
129 | split='superhost', col='property_type', col_order=['House', 'Condominium', 'Apartment'],
130 | row='bedrooms', row_order=[0, 1, 2, 3], sharey=False)
131 |
--------------------------------------------------------------------------------
/dexplot/_heat.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
6 | def heatmap(x=None, y=None, agg=None, aggfunc=None, data=None, normalize=None, corr=False,
7 | annot=False, fmt='.2f', ax=None, figsize=None, title=None, cmap=None,
8 | cbarlabel="", cbar_kw={}, **kwargs):
9 | """
10 | Create a heatmap from a Pandas DataFrame. This function works with either
11 | tidy data or aggregated data.
12 |
13 |     If using tidy data, pass categorical/string variables to `x` and `y`
14 |     and a numeric variable to `agg`. Pass an aggregation function
15 |     as a string to `aggfunc`. You may also choose to leave `agg` as None,
16 |     which results in a raw frequency count of the co-occurrence of the `x`
17 |     and `y` variables. Set `normalize` to get relative percentages.
18 | 
19 |     If using aggregated data, only use the `data` parameter. The index and
20 |     columns will label the x and y axes. The values of the DataFrame
21 |     will be used for the heatmap.
22 |
23 | Parameters
24 | ----------
25 | x: str
26 |         Column name whose unique values will be used to form groups. Can
27 | only be used with tidy data and should be a categorical/string.
28 |
29 | y: str
30 |         Column name whose unique values will be used to form groups. Can
31 | only be used with tidy data and should be a categorical/string.
32 |
33 | agg: str
34 |         Column name whose values will be aggregated across the groups
35 | formed by `x` and `y`.
36 |
37 | aggfunc: str or function
38 | Used to aggregate `agg` variable. Use any of the strings that Pandas
39 | can understand. You can also use a custom function as long as it
40 | aggregates, i.e. returns a single value.
41 |
42 | data: DataFrame
43 | A Pandas DataFrame containing either tidy or aggregated data
44 |
45 | normalize: str
46 |         Must be one of three strings: "all", the column name provided to `x`,
47 |         or the column name provided to `y`.
48 |
49 | corr: bool - Default False
50 |         When set to True, will calculate the correlation of the co-occurrence
51 | between each of the unique values in `x` and `y`. Only works with
52 | tidy data.
53 |
54 | annot: bool - Default False
55 | Controls whether the aggregated values will be plotted as
56 | text in the heatmap.
57 |
58 | fmt: str
59 | Formatting style for annotations
60 |
61 | ax: Matplotlib Axes
62 | The Matplotlib Axes object to use for plotting. If not given, then
63 | create a new Figure and Axes
64 |
65 | figsize: tuple
66 | A two item tuple of ints used to control the figure size
67 |
68 | title: str
69 | Sets the title of the figure
70 |
71 | cmap: str
72 | Matplotlib colormap name
73 |
74 | cbarlabel: str
75 | Labels the colorbar
76 |
77 | cbar_kw: dict
78 | Keyword arguments passed to the `colorbar` Figure function
79 |
80 | kwargs: dict
81 | Keyword arguments passed to the `imshow` Axes function
82 |
83 | Returns
84 | -------
85 | A one-item tuple containing a Matplotlib Figure
86 |
87 | References
88 | ----------
89 | Code was inspired from Matplotlib page
90 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
91 | """
92 |
93 | if figsize is None:
94 | figsize = (10, 8)
95 |
96 | if not isinstance(data, pd.DataFrame):
97 | raise TypeError('`data` must be a DataFrame')
98 |
99 | if ax is None:
100 | fig, ax = plt.subplots(figsize=figsize)
101 | if title:
102 | fig.suptitle(title)
103 | else:
104 | fig = ax.figure
105 |
106 | if aggfunc:
107 | if not agg:
108 | raise ValueError('If you are setting `aggfunc`, you need to set `agg` as well.')
109 |
110 | if not normalize:
111 | normalize = False
112 |
113 | if cmap is None:
114 | cmap = 'RdYlBu_r'
115 |
116 | if x or y:
117 | if not (x and y):
118 |             raise ValueError('If you supply one of x or y, you must supply both of them')
119 |
120 | if normalize not in (False, 'all', x, y):
121 | raise ValueError('If you are setting `normalize`, it must be either '
122 | f'"all", "{x}" or "{y}"')
123 | elif normalize == x:
124 | normalize = 'columns'
125 | elif normalize == y:
126 | normalize = 'index'
127 |
128 | if agg:
129 | data_values = data[agg]
130 | if not aggfunc:
131 | aggfunc = 'mean'
132 | else:
133 | data_values = None
134 |
135 | agg_data = pd.crosstab(index=data[y], columns=data[x], values=data_values, aggfunc=aggfunc,
136 | normalize=normalize)
137 | else:
138 | agg_data = data
139 |
140 | if corr:
141 | agg_data = agg_data.corr()
142 |
143 | agg_values = agg_data.values
144 | col_labels = agg_data.columns.tolist()
145 | row_labels = agg_data.index.tolist()
146 |
147 | # Plot the heatmap
148 | im = ax.imshow(agg_values, cmap=cmap, **kwargs)
149 |
150 | # Create colorbar
151 | cbar = fig.colorbar(im, ax=ax, **cbar_kw)
152 | cbar.ax.set_ylabel(cbarlabel, rotation=-90, va='bottom')
153 |
154 | x_range, y_range = np.arange(agg_data.shape[1]), np.arange(agg_data.shape[0])
155 | ax.set_xticks(x_range)
156 | ax.set_yticks(y_range)
157 |
158 | ax.set_xticklabels(col_labels)
159 | ax.set_yticklabels(row_labels)
160 |
161 | # Let the horizontal axes labeling appear on top.
162 | ax.tick_params(top=True, bottom=False,
163 | labeltop=True, labelbottom=False)
164 |
165 | # Rotate the tick labels and set their alignment.
166 | plt.setp(ax.get_xticklabels(), rotation=-30, ha='right', rotation_mode='anchor')
167 |
168 | # Turn spines off and create white grid.
169 | for edge, spine in ax.spines.items():
170 | spine.set_visible(False)
171 |
172 | ax.set_xticks(x_range - .5, minor=True)
173 | ax.set_yticks(y_range - .5, minor=True)
174 | ax.grid(which='minor', color='w', linestyle='-', linewidth=3)
175 | ax.tick_params(which='minor', bottom=False, left=False)
176 |
177 | if annot:
178 | annotate_heatmap(im, agg_values, fmt='{0:' + fmt + '}')
179 |
180 | return fig,
181 |
182 |
183 | def annotate_heatmap(im, values, fmt="{0:.2f}", **textkw):
184 | """
185 | Annotates the heatmap
186 |
187 | https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
188 | """
189 |
190 | kw = dict(horizontalalignment="center",
191 | verticalalignment="center")
192 | kw.update(textkw)
193 | n_rows, n_cols = values.shape
194 |
195 | for i in range(n_rows):
196 | for j in range(n_cols):
197 | val = values[i, j]
198 | if not np.isnan(val):
199 | im.axes.text(j, i, fmt.format(val), **kw)
--------------------------------------------------------------------------------
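
A usage sketch of heatmap with tidy data, following the docstring above; heatmap is not re-exported from the top-level package, so it is imported from the private _heat module, and the column names come from the airbnb dataset used in the tests:

    import dexplot as dxp
    from dexplot._heat import heatmap

    airbnb = dxp.load_dataset('airbnb')

    # median price for every neighborhood / property_type combination,
    # annotated in each cell; the function returns a one-item tuple (fig,)
    fig, = heatmap(x='property_type', y='neighborhood', agg='price',
                   aggfunc='median', data=airbnb, annot=True)
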
/dexplot/_plotly.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import textwrap
3 |
4 | import numpy as np
5 | import plotly.graph_objects as go
6 |
7 | from ._common_plot import PlotlyCommon, PlotlyCount
8 |
9 |
10 | def wrap_labels(labels, wrap):
11 |     return [textwrap.fill(label, wrap).replace('\n', '<br>') for label in labels]
12 |
13 |
14 | def line_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
15 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
16 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
17 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
18 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
19 |
20 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
21 | x_order, y_order, split_order, row_order, col_order,
22 | orientation, sort_values, wrap, figsize, title, sharex,
23 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
24 | x_textwrap, y_textwrap, x_rot, y_rot)
25 |
26 | showlegend = True
27 | for (row, col), info in self.final_data.items():
28 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
29 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col,
30 | marker_color=self.colors[i % len(self.colors)],
31 | showlegend=showlegend)
32 | showlegend = False
33 | return self.fig
34 |
35 |
36 | def scatter_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
37 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
38 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
39 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
40 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
41 |
42 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
43 | x_order, y_order, split_order, row_order, col_order,
44 | orientation, sort_values, wrap, figsize, title, sharex,
45 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
46 | x_textwrap, y_textwrap, x_rot, y_rot)
47 |
48 | showlegend = True
49 | for (row, col), info in self.final_data.items():
50 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
51 | self.fig.add_scatter(x=x, y=y, name=label, row=row, col=col,
52 | marker_color=self.colors[i % len(self.colors)],
53 | showlegend=showlegend, mode='markers')
54 | showlegend = False
55 | return self.fig
56 |
57 |
58 | def bar_plotly(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
59 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
60 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None,
61 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
62 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
63 | y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
64 | groupgap=0, bar_kwargs=None):
65 |
66 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
67 | x_order, y_order, split_order, row_order, col_order,
68 | orientation, sort_values, wrap, figsize, title, sharex,
69 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
70 | x_textwrap, y_textwrap, x_rot, y_rot)
71 |
72 | showlegend = self.split is not None
73 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap)
74 | for (row, col), info in self.final_data.items():
75 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
76 | if len(x) > 200:
77 | warnings.warn('You are plotting more than 200 bars. '
78 | 'Did you forget to provide an `aggfunc`?')
79 |
80 | self.fig.add_bar(x=x, y=y, orientation=self.orientation,
81 | name=label, row=row, col=col,
82 | marker_color=self.colors[i % len(self.colors)],
83 | showlegend=showlegend)
84 | showlegend = False
85 |
86 | return self.fig
87 |
88 |
89 | def count_plotly(val, data=None, normalize=False, split=None, row=None, col=None,
90 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
91 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None,
92 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None,
93 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
94 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0,
95 | bar_kwargs=None):
96 |
97 | x, y = (val, None) if orientation == 'v' else (None, val)
98 | aggfunc = '__distribution__'
99 | self = PlotlyCount(x, y, data, aggfunc, split, row, col,
100 | x_order, y_order, split_order, row_order, col_order,
101 | orientation, None, wrap, figsize, title, sharex,
102 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
103 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count')
104 |
105 | count_dict = self.get_count_dict(normalize)
106 | showlegend = self.split is not None
107 | self.fig.update_layout(barmode=mode, bargap=gap, bargroupgap=groupgap)
108 | for (row, col), df in count_dict.items():
109 | if sort_values == 'asc' and not (self.split or self.row or self.col):
110 | df = df.iloc[::-1]
111 |
112 | labels = df.index.values
113 | for i, column in enumerate(df.columns):
114 | values = df[column].values
115 | x, y = (labels, values) if self.orientation == 'v' else (values, labels)
116 | self.fig.add_bar(x=x, y=y, orientation=self.orientation, name=column,
117 | row=row, col=col, marker_color=self.colors[i % len(self.colors)],
118 | showlegend=showlegend)
119 | showlegend = False
120 | return self.fig
121 |
122 |
123 | def box_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
124 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
125 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
126 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
127 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group',
128 | gap=.2, groupgap=0, box_kwargs=None):
129 |
130 | aggfunc = None
131 | sort_values = None
132 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
133 | x_order, y_order, split_order, row_order, col_order,
134 | orientation, sort_values, wrap, figsize, title, sharex,
135 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
136 | x_textwrap, y_textwrap, x_rot, y_rot)
137 |
138 | showlegend = self.split is not None
139 | self.fig.update_layout(boxmode=mode, boxgap=gap, boxgroupgap=groupgap)
140 | for (row, col), info in self.final_data.items():
141 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
142 | self.fig.add_box(x=x, y=y, orientation=self.orientation,
143 | name=label, row=row, col=col,
144 | marker_color=self.colors[i % len(self.colors)],
145 | showlegend=showlegend)
146 | showlegend = False
147 |
148 | return self.fig
149 |
150 | def violin_plotly(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
151 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
152 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
153 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
154 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group',
155 | gap=.2, groupgap=0, box_kwargs=None):
156 |
157 | aggfunc = None
158 | sort_values = None
159 | self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
160 | x_order, y_order, split_order, row_order, col_order,
161 | orientation, sort_values, wrap, figsize, title, sharex,
162 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
163 | x_textwrap, y_textwrap, x_rot, y_rot)
164 |
165 | showlegend = self.split is not None
166 | self.fig.update_layout(violinmode=mode, violingap=gap, violingroupgap=groupgap)
167 | for (row, col), info in self.final_data.items():
168 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
169 | self.fig.add_violin(x=x, y=y, orientation=self.orientation,
170 | name=label, row=row, col=col,
171 | marker_color=self.colors[i % len(self.colors)],
172 | showlegend=showlegend)
173 | showlegend = False
174 |
175 | return self.fig
176 |
177 | def kde_plotly(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None,
178 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None,
179 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
180 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
181 | y_textwrap=None, x_rot=None, y_rot=None, range=None, cumulative=False):
182 |
183 |     from ._utils import calculate_density_1d, calculate_density_2d
184 | 
185 |     if x is not None and y is not None and split is not None:
186 |         raise ValueError('Cannot use `split` for 2-dimensional KDE plots')
187 | 
188 |     x_order = y_order = None
189 |     aggfunc = '__distribution__' if y is None else None
190 |     sort_values = None
191 |     self = PlotlyCommon(x, y, data, aggfunc, split, row, col,
192 |                         x_order, y_order, split_order, row_order, col_order,
193 |                         orientation, sort_values, wrap, figsize, title, sharex,
194 |                         sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
195 |                         x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True)
196 | 
197 |     showlegend = self.split is not None
198 |     for (row, col), info in self.final_data.items():
199 |         for i, vals in enumerate(info):
200 |             if aggfunc == '__distribution__':
201 |                 x, split_label = vals[:2]
202 |                 x, y = calculate_density_1d(x, cumulative=cumulative)
203 |                 x, y = (x, y) if self.orientation == 'v' else (y, x)
204 |                 self.fig.add_scatter(x=x, y=y, name=split_label, row=row, col=col,
205 |                                      marker_color=self.colors[i % len(self.colors)],
206 |                                      showlegend=showlegend)
207 |             else:
208 |                 # 2-d KDE: plotly has no axes-level imshow, so draw the density as a heatmap trace
209 |                 x, y, split_label = vals[:3]
210 |                 xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y)
211 |                 self.fig.add_heatmap(z=Z, x=np.linspace(xmin, xmax, Z.shape[1]),
212 |                                      y=np.linspace(ymin, ymax, Z.shape[0]),
213 |                                      row=row, col=col, showscale=False)
214 | 
215 |         showlegend = False
216 | 
217 |     return self.fig
218 | 
--------------------------------------------------------------------------------
/dexplot/_plots.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from collections import defaultdict
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from scipy import stats
7 |
8 | from ._common_plot import MPLCommon, MPLCount
9 |
10 |
11 | def get_bar_kwargs(bar_kwargs):
12 | default_bar_kwargs = {'ec': 'white', 'alpha': .9}
13 | if bar_kwargs is None:
14 | bar_kwargs = default_bar_kwargs
15 | else:
16 | try:
17 | bar_kwargs = {**default_bar_kwargs, **bar_kwargs}
18 |         except TypeError:
19 | raise TypeError('`bar_kwargs` must be a dictionary')
20 | return bar_kwargs
21 |
22 |
23 | def verify_gap_args(mode, gap, groupgap):
24 | if mode not in ('group', 'stack', 'overlay', 'relative'):
25 |         raise ValueError("`mode` must be one of 'group', 'stack', 'overlay', 'relative'")
26 | if gap < 0 or gap >= 1:
27 | raise ValueError('`gap` must be greater than or equal to 0 and less than 1')
28 | if groupgap < 0 or groupgap >= 1:
29 | raise ValueError('`groupgap` must be greater than or equal to 0 and less than 1')
30 |
31 |
32 | def get_jump_size(n, mode, gap, groupgap):
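33 |     # `gap` is the fraction of each x/y slot left empty, so `total` is the width shared by
34 |     # the n traces at that slot. `jump` is the offset between adjacent traces and `size` is
35 |     # each trace's width; for non-'group' modes the traces are drawn on top of each other,
36 |     # so the offset is zero and each trace spans the full shared width (less `groupgap`).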
33 | total = 1 - gap
34 | jump = total / n
35 | size = jump * (1 - groupgap)
36 | if mode != 'group':
37 | jump = 0
38 | size *= n
39 | return jump, size
40 |
41 |
42 | def line(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
43 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
44 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
45 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
46 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None):
47 |
48 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
49 | x_order, y_order, split_order, row_order, col_order,
50 | orientation, sort_values, wrap, figsize, title, sharex,
51 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
52 | x_textwrap, y_textwrap, x_rot, y_rot)
53 |
54 | marker = 'o' if self.groupby else None
55 |
56 | for ax, info in self.final_data.items():
57 | for x, y, label, col_name, row_label, col_label in info:
58 | x_plot, y_plot = self.get_x_y_plot(x, y)
59 | ax.plot(x_plot, y_plot, label=label, marker=marker)
60 |
61 | if self.groupby:
62 | ticklabels = x if self.orientation == 'v' else y
63 | self.add_ticklabels(ticklabels, ax)
64 |
65 | self.add_legend(label)
66 | if x.dtype == 'O' or y.dtype == 'O':
67 | self.update_fig_size(len(x), 1)
68 | return self.clean_up()
69 |
70 |
71 | def scatter(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
72 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
73 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None, sharex=True,
74 | sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
75 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
76 | regression=False):
77 |
78 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
79 | x_order, y_order, split_order, row_order, col_order,
80 | orientation, sort_values, wrap, figsize, title, sharex,
81 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
82 | x_textwrap, y_textwrap, x_rot, y_rot)
83 |
84 | alpha = 1 if self.groupby else .7
85 |
86 | for ax, info in self.final_data.items():
87 | for x, y, label, col_name, row_label, col_label in info:
88 | x_plot, y_plot = self.get_x_y_plot(x, y)
89 | ax.scatter(x_plot, y_plot, label=label, alpha=alpha)
90 | if regression:
91 | slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
92 | x_line = np.array([x.min(), x.max()])
93 | y_line = x_line * slope + intercept
94 | ax.plot(x_line, y_line)
95 | if self.groupby:
96 | ticklabels = x if self.orientation == 'v' else y
97 | self.add_ticklabels(ticklabels, ax)
98 |
99 | self.add_legend(label)
100 | if x.dtype == 'O' or y.dtype == 'O':
101 | self.update_fig_size(len(x), 1)
102 | return self.clean_up()
103 |
104 |
105 | def bar(x=None, y=None, data=None, aggfunc=None, split=None, row=None, col=None,
106 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
107 | orientation='v', sort_values=None, wrap=None, figsize=None, title=None,
108 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
109 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10,
110 | y_textwrap=None, x_rot=None, y_rot=None, mode='group',
111 | gap=.2, groupgap=0, bar_kwargs=None):
112 |
113 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
114 | x_order, y_order, split_order, row_order, col_order,
115 | orientation, sort_values, wrap, figsize, title, sharex,
116 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
117 | x_textwrap, y_textwrap, x_rot, y_rot)
118 |
119 | bar_kwargs = get_bar_kwargs(bar_kwargs)
120 | verify_gap_args(mode, gap, groupgap)
121 | for ax, info in self.final_data.items():
122 | jump, size = get_jump_size(len(info), mode, gap, groupgap)
123 | for i, (x, y, label, col_name, row_label, col_label) in enumerate(info):
124 | x_plot, y_plot = self.get_x_y_plot(x, y)
125 | if i == 0:
126 | base = np.zeros(len(x_plot))
127 | if len(x) > 200:
128 | warnings.warn('You are plotting more than 200 bars. '
129 | 'Did you forget to provide an `aggfunc`?')
130 |
131 | if self.orientation == 'v':
132 | x_plot = x_plot + jump * i
133 | ax.bar(x_plot, y_plot, label=label, width=size,
134 | bottom=base, align='edge', **bar_kwargs)
135 | if mode == 'stack':
136 | base += y_plot
137 | else:
138 | y_plot = y_plot - jump * (i + 1)
139 | ax.barh(y_plot, x_plot, label=label, height=size,
140 | left=base, align='edge', **bar_kwargs)
141 | if mode == 'stack':
142 | base += x_plot
143 | ticklabels = x if self.orientation == 'v' else y
144 | delta = jump * (i + 1) / 2 if mode == 'group' else size / 2
145 | self.add_ticklabels(ticklabels, ax, delta=delta)
146 |
147 | self.add_legend(label)
148 | self.update_fig_size(len(info), len(x))
149 | return self.clean_up()
150 |
151 |
152 | def count(val, data=None, normalize=False, split=None, row=None, col=None,
153 | x_order=None, y_order=None, split_order=None, row_order=None, col_order=None,
154 | orientation='v', sort_values='desc', wrap=None, figsize=None, title=None,
155 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None,
156 | xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
157 | x_rot=None, y_rot=None, mode='group', gap=.2, groupgap=0,
158 | bar_kwargs=None):
159 |
160 | bar_kwargs = get_bar_kwargs(bar_kwargs)
161 | verify_gap_args(mode, gap, groupgap)
162 | x, y = (val, None) if orientation == 'v' else (None, val)
163 | aggfunc = '__distribution__'
164 | self = MPLCount(x, y, data, aggfunc, split, row, col,
165 | x_order, y_order, split_order, row_order, col_order,
166 | orientation, None, wrap, figsize, title, sharex,
167 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
168 | x_textwrap, y_textwrap, x_rot, y_rot, kind='count')
169 |
170 | count_dict = self.get_count_dict(normalize)
171 | for ax, df in count_dict.items():
172 | base = np.zeros(len(df))
173 | position = np.arange(len(df))
174 | if sort_values == 'asc' and not (self.split or self.row or self.col):
175 | df = df.iloc[::-1]
176 |
177 | ticklabels = df.index.values
178 | jump, size = get_jump_size(df.shape[1], mode, gap, groupgap)
179 | for col in df.columns:
180 | values = df[col].values
181 |
182 | if self.orientation == 'v':
183 | ax.bar(position, values, label=col, width=size,
184 | bottom=base, align='edge', **bar_kwargs)
185 | position = position + jump
186 | else:
187 |                 ax.barh(position - size, values, label=col, height=size,
188 | left=base, align='edge', **bar_kwargs)
189 | position = position - jump
190 |
191 | if mode == 'stack':
192 | base += values
193 |
194 | delta = jump * df.shape[1] / 2 if mode == 'group' else size / 2
195 | self.add_ticklabels(ticklabels, ax, delta=delta)
196 | if self.split or len(df.columns) > 1:
197 | self.add_legend(col)
198 | self.update_fig_size(df.shape[1], df.shape[0])
199 | return self.clean_up()
200 |
201 |
202 | def _common_dist(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
203 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
204 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
205 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
206 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
207 | mode='group', gap=.2, groupgap=0, kind=None, **kwargs):
208 |
209 | aggfunc = '__distribution__'
210 | sort_values = None
211 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
212 | x_order, y_order, split_order, row_order, col_order,
213 | orientation, sort_values, wrap, figsize, title, sharex,
214 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
215 | x_textwrap, y_textwrap, x_rot, y_rot)
216 |
217 | key = 'bodies' if kind == 'violinplot' else 'boxes'
218 | vert = self.orientation == 'v'
219 | for ax, info in self.final_data.items():
220 | plot_func = getattr(ax, kind)
221 | cur_data, cur_ticklabels = self.get_distribution_data(info)
222 |
223 | handles = []
224 | split_labels = []
225 | n_splits = len(cur_data)
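226 |         # total width allotted to each group grows with the number of splits but is
227 |         # capped at .9 of the slot, then divided evenly among the splits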
226 | widths = min(.5 + .15 * n_splits, .9) / n_splits
227 | n_boxes = len(info)
228 | n = len(next(iter(cur_data.values()))) # number of groups
229 | markersize = max(6 - n_boxes // 5, 2)
230 | jump, size = get_jump_size(n, mode, gap, groupgap)
231 | for i, (split_label, data) in enumerate(cur_data.items()):
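232 |             # keep only non-empty groups so boxplot/violinplot never receive empty arrays,
233 |             # while `positions` preserves each group's original slot on the axis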
232 | filt = [len(arr) > 0 for arr in data]
233 | positions = np.array([i for (i, f) in enumerate(filt) if f])
234 | data = [np.array(d) for (d, f) in zip(data, filt) if f]
235 | if self.orientation == 'h':
236 | positions = positions - i * widths
237 | else:
238 | positions = positions + i * widths
239 |
240 | if kind == 'boxplot':
241 |                 kwargs['boxprops'] = {'facecolor': self.colors[i % len(self.colors)],
242 | 'edgecolor': 'black'}
243 | kwargs['flierprops'] = {'markersize': markersize}
244 |
245 | ret = plot_func(data, vert=vert, positions=positions, widths=widths, **kwargs)
246 |
247 | if kind == 'violinplot':
248 | for k in ['cmeans', 'cmins', 'cmaxes', 'cbars', 'cmedians', 'cquantiles']:
249 | if k in ret:
250 | ret[k].set_linewidth(1)
251 | for body in ret['bodies']:
252 | body.set_alpha(.8)
253 |
254 | handles.append(ret[key][0])
255 | split_labels.append(split_label)
256 |
257 | delta = (n_splits / 2 - .5) * widths
258 | ticklabels = cur_ticklabels[split_label]
259 | self.add_ticklabels(ticklabels, ax, delta=delta)
260 |
261 | self.add_legend(self.split, handles, split_labels)
262 | self.update_fig_size(n_splits, n)
263 | return self.clean_up()
264 |
265 | # could add groupby to box
266 | def box(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
267 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
268 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
269 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
270 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
271 | groupgap=0, box_kwargs=None):
272 |
273 | kwargs = dict(notch=None, sym=None, whis=None,
274 | patch_artist=True, bootstrap=None, usermedians=None, conf_intervals=None, meanline=None,
275 | showmeans=None, showcaps=None, showbox=None, showfliers=None, boxprops=None, labels=None,
276 | flierprops=None, medianprops=None, meanprops=None, capprops=None, whiskerprops=None,
277 | manage_ticks=True, autorange=False, zorder=None)
278 |
279 | if kwargs['medianprops'] is None:
280 | kwargs['medianprops'] = {'color': '.2'}
281 |
282 | # kwargs = dict(notch=notch, sym=sym, whis=whis, patch_artist=patch_artist,
283 | # bootstrap=bootstrap, usermedians=usermedians, conf_intervals=conf_intervals,
284 | # meanline=meanline, showmeans=showmeans, showcaps=showcaps, showbox=showbox,
285 | # showfliers=showfliers, boxprops=boxprops, labels=labels, flierprops=flierprops,
286 | # medianprops=medianprops, meanprops=meanprops, capprops=capprops,
287 | # whiskerprops=whiskerprops, manage_ticks=manage_ticks,
288 | # autorange=autorange, zorder=zorder)
289 |
290 | return _common_dist(x, y, data, split, row, col, x_order, y_order, split_order,
291 | row_order, col_order, orientation, wrap, figsize, title,
292 | sharex, sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
293 | x_textwrap, y_textwrap, x_rot, y_rot, mode, gap, groupgap,
294 | kind='boxplot', **kwargs)
295 |
296 |
297 | def violin(x=None, y=None, data=None, split=None, row=None, col=None, x_order=None,
298 | y_order=None, split_order=None, row_order=None, col_order=None, orientation='h',
299 | wrap=None, figsize=None, title=None, sharex=True, sharey=True, xlabel=None,
300 | ylabel=None, xlim=None, ylim=None, xscale='linear', yscale='linear', cmap=None,
301 | x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None, mode='group', gap=.2,
302 | groupgap=0, violin_kwargs=None):
303 |
304 | kwargs = dict(showmeans=False, showextrema=True, showmedians=True,
305 | quantiles=None, points=100, bw_method=None)
306 |
307 | # kwargs = dict(showmeans=showmeans, showextrema=showextrema, showmedians=showmedians,
308 | # quantiles=quantiles, points=points, bw_method=bw_method)
309 |
310 | return _common_dist(x, y, data, split, row, col,
311 | x_order, y_order, split_order, row_order, col_order,
312 | orientation, wrap, figsize, title, sharex,
313 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
314 | x_textwrap, y_textwrap, x_rot, y_rot, kind='violinplot', **kwargs)
315 |
316 |
317 | def hist(val, data=None, split=None, row=None, col=None, split_order=None, row_order=None,
318 | col_order=None, orientation='v', wrap=None, figsize=None, title=None,
319 | sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None, ylim=None, xscale='linear',
320 | yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None, x_rot=None, y_rot=None,
321 | mode='group', gap=.2, groupgap=0, hist_kwargs=None):
322 |
323 |     # merge user-supplied `hist_kwargs` with the defaults (20 bins, standard bar histogram)
324 |     default_hist_kwargs = dict(bins=20, range=None, density=False, weights=None,
325 |                                cumulative=False, bottom=None, histtype='bar', align='mid',
326 |                                rwidth=None, log=False)
327 |     if hist_kwargs is None:
328 |         kwargs = default_hist_kwargs
329 |     else:
330 |         try:
331 |             kwargs = {**default_hist_kwargs, **hist_kwargs}
332 |         except TypeError:
333 |             raise TypeError('`hist_kwargs` must be a dictionary')
334 | 
335 |     x_order = y_order = None
336 |     x, y = (val, None) if orientation == 'v' else (None, val)
332 |
333 | aggfunc = '__distribution__'
334 | sort_values = None
335 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
336 | x_order, y_order, split_order, row_order, col_order,
337 | orientation, sort_values, wrap, figsize, title, sharex,
338 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
339 | x_textwrap, y_textwrap, x_rot, y_rot)
340 |
341 | orientation = 'vertical' if self.orientation == 'v' else 'horizontal'
342 | for ax, info in self.final_data.items():
343 | cur_data, cur_ticklabels = self.get_distribution_data(info)
344 |
345 | handles = []
346 | split_labels = []
347 | n_splits = len(cur_data)
348 | n = len(next(iter(cur_data.values()))) # number of groups
349 | for split_label, data in cur_data.items():
350 | filt = [len(arr) > 0 for arr in data]
351 | vals = [d for (d, f) in zip(data, filt) if f]
352 | ret = ax.hist(vals, orientation=orientation, alpha=.8, **kwargs)
353 | handles.append(ret[-1][0])
354 | split_labels.append(split_label)
355 |
356 | self.add_legend(self.split, handles, split_labels)
357 | # self.update_fig_size(n_splits, n)
358 | return self.clean_up()
359 |
360 |
361 | def kde(x=None, y=None, data=None, split=None, row=None, col=None, split_order=None,
362 | row_order=None, col_order=None, orientation='v', wrap=None, figsize=None,
363 | title=None, sharex=True, sharey=True, xlabel=None, ylabel=None, xlim=None,
364 | ylim=None, xscale='linear', yscale='linear', cmap=None, x_textwrap=10, y_textwrap=None,
365 | x_rot=None, y_rot=None, range=None, cumulative=False):
366 |
367 | from ._utils import calculate_density_1d, calculate_density_2d
368 |
369 | x_order = y_order = None
370 | # x, y = (x, None) if orientation == 'v' else (None, x)
371 | kwargs = dict(range=range, cumulative=cumulative)
372 |
373 | if x is not None and y is not None and split is not None:
374 | raise ValueError('Cannot use `split` for 2-dimensional KDE plots')
375 |
376 | aggfunc = '__distribution__' if y is None else None
377 | sort_values = None
378 | self = MPLCommon(x, y, data, aggfunc, split, row, col,
379 | x_order, y_order, split_order, row_order, col_order,
380 | orientation, sort_values, wrap, figsize, title, sharex,
381 | sharey, xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
382 | x_textwrap, y_textwrap, x_rot, y_rot, check_numeric=True)
383 |
384 | for ax, info in self.final_data.items():
385 | for vals in info:
386 | if aggfunc == '__distribution__':
387 | x, split_label = vals[:2]
388 | x, y = calculate_density_1d(x, cumulative=cumulative)
389 | x, y = (x, y) if self.orientation == 'v' else (y, x)
390 | ax.plot(x, y, label=split_label)
391 | else:
392 | x, y, split_label = vals[:3]
393 | xmin, xmax, ymin, ymax, Z = calculate_density_2d(x, y)
394 | ax.imshow(Z, extent=[xmin, xmax, ymin, ymax], aspect='auto')
395 |
396 | self.add_legend(self.split)
397 | # self.update_fig_size(n_splits, n)
398 | return self.clean_up()
399 |
400 | xy_doc = """
401 | x : str, default None
402 | Column name of DataFrame whose values will go along the x-axis
403 |
404 | y : str, default None
405 | Column name of DataFrame whose values will go along the y-axis
406 | """
407 |
408 | val_doc = """
409 | val : str, default None
410 | Column name of DataFrame whose values will be used for distribution
411 | """
412 |
413 | aggfunc_doc = """
414 | aggfunc : str or function, default None
415 | Kind of aggregation to perform. Use a string that the DataFrame `agg`
416 | method understands. If providing a function, it will also be passed to
417 | the `agg` method.
418 |
419 | The strings 'countna' and 'percna' are also available to find the
420 | number and percentage of missing values.
421 | """
422 |
423 | xy_order = """
424 | x_order : str or list, default None
425 | Used as both a way to order and filter the x-values. Use the strings
426 | 'asc'/'desc' to order ascending or descending.
427 |
428 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']`
429 |
430 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will
431 | filter for the most/least frequent groups.
432 |
433 | By default, sorting happens in ascending order.
434 |
435 | y_order : str or list, default None
436 | See x_order
437 |
438 | split_order : str or list, default None
439 | See x_order
440 |
441 | row_order : str or list, default None
442 | See x_order
443 |
444 | col_order : str or list, default None
445 | See x_order
446 | """
447 |
448 | split_order = """
449 | split_order : str or list, default None
450 |     Used as both a way to order and filter the split values. Use the strings
451 | 'asc'/'desc' to order ascending or descending.
452 |
453 | Set a specific order with a list, i.e. `['House', 'Apartment', 'Townhouse']`
454 |
455 | Use the strings `'top n'` or `'bottom n'` where `n` is an integer. This will
456 | filter for the most/least frequent groups.
457 |
458 | By default, sorting happens in ascending order.
459 |
460 | row_order : str or list, default None
461 | See split_order
462 |
463 | col_order : str or list, default None
464 | See split_order
465 | """
466 |
467 | sort_values_doc = """
468 | sort_values : str - 'asc' or 'desc', default None
469 | Sort the values ascending or descending. If this is given, then
470 | x/y_order is ignored.
471 | """
472 |
473 | doc = \
474 | """
475 | {plot_doc}
476 |
477 | Parameters
478 | ----------
479 | {xy}
480 | data : DataFrame or Series, default None
481 | A pandas DataFrame with long or wide data. If provided a Series, do not
482 | supply x or y.
483 | {aggfunc}
484 | split : str, default None
485 | Column name that will be used in the DataFrame `groupby` method to
486 | split the data into independent groups within a single plot
487 |
488 | row : str
489 | Column name that will be used in the DataFrame `groupby` method to
490 | split the data into independent groups to form new plots. Each unique value
491 | in the `row` column forms a new row of plots.
492 |
493 | col : str
494 | Column name that will be used in the DataFrame `groupby` method to
495 | split the data into independent groups to form new plots. Each unique value
496 |     in the `col` column forms a new column of plots.
497 | {order}
498 | orientation : str 'v' or 'h'
499 | Choose the orientation of the plots. By default, they are vertical
500 | ('v'), except for box and violin plots, which are horizontal.
501 | {sort_values}
502 | wrap : int, default None
503 |     When using either `row` or `col`, but not both, determines the maximum
504 |     number of plots per row/column before a new row/column is started.
505 |
506 | figsize : tuple, default None
507 |     A tuple of numbers passed to the matplotlib `figsize` parameter.
508 | By default, the figure size will be determined based on the kind of
509 | plot produced.
510 |
511 | title : str
512 | Sets the figure title NOT the Axes title
513 |
514 | sharex : bool
515 | Whether all plots should share the x-axis or not. Default is True
516 |
517 | sharey : bool
518 | Whether all plots should share the y-axis or not. Default is True
519 |
520 | xlabel : str
521 | Label used for x-axis on figures with a single plot
522 |
523 | ylabel : str
524 | Label used for y-axis on figures with a single plot
525 |
526 | xlim : 2-item tuple of numbers
527 | Determines x-axis limits for figures with a single plot
528 |
529 | ylim : 2-item tuple of numbers
530 | Determines y-axis limits for figures with a single plot
531 |
532 | xscale : 'linear', 'log', 'symlog', 'logit'
533 | Sets the scale of the x-axis.
534 |
535 | yscale : 'linear', 'log', 'symlog', 'logit'
536 | Sets the scale of the y-axis
537 |
538 | cmap : str or matplotlib colormap instance, default None
539 |     Colormap used to determine the colors of the plotting elements.
540 | 
540 | x_textwrap : int, default 10
541 | Number of characters before wrapping text for x-labels
542 |
543 | y_textwrap : int, default None
544 | Number of characters before wrapping text for y-labels
545 |
546 | x_rot : int or float, default None
547 | Degree of rotation of x-tick labels. If between 0 and 180
548 | horizontal_alignment is set to 'right', otherwise 'left'
549 |
550 | y_rot : int or float, default None
551 | Degree of rotation of y-tick labels. If between 0 and 180
552 | vertical_alignment is set to 'top', otherwise 'bottom'
553 |
554 | mode : str, default 'group'
555 |     One of 'group', 'stack', 'overlay', or 'relative'. Determines how the
556 |     split groups are positioned relative to each other.
557 | 
558 | gap : float, default .2
559 |     Fraction of space between each x/y location left empty. Must be at
560 |     least 0 and less than 1.
561 | 
562 | groupgap : float, default 0
563 |     Fraction of space within a group left empty between its bars. Must be
564 |     at least 0 and less than 1.
565 | 
560 | Returns
561 | -------
562 | A Matplotlib Figure instance
563 | """
564 |
565 |
566 | # line doc
567 | line_doc = """\
568 | Create line plots
569 | """
570 |
571 | scatter_doc = """\
572 | Create scatter plots
573 | """
574 |
575 | bar_doc = """\
576 | Create bar plots
577 | """
578 |
579 | count_doc = """\
580 | Create count plots
581 | """
582 |
583 | box_doc = """\
584 | Create box plots
585 | """
586 |
587 | violin_doc = """\
588 | Create violin plots
589 | """
590 |
591 | hist_doc = """\
592 | Create histograms
593 | """
594 |
595 | kde_doc = """\
596 | Create kernel density estimate plots
597 | """
598 |
599 | line.__doc__ = doc.format(plot_doc=line_doc, xy=xy_doc, aggfunc=aggfunc_doc,
600 | order=xy_order, sort_values=sort_values_doc)
601 |
602 | scatter.__doc__ = doc.format(plot_doc=scatter_doc, xy=xy_doc, aggfunc=aggfunc_doc,
603 | order=xy_order, sort_values=sort_values_doc)
604 |
605 | bar.__doc__ = doc.format(plot_doc=bar_doc, xy=xy_doc, aggfunc=aggfunc_doc,
606 | order=xy_order, sort_values=sort_values_doc)
607 |
608 | count.__doc__ = doc.format(plot_doc=count_doc, xy=val_doc, aggfunc='',
609 | order=split_order, sort_values=sort_values_doc)
610 |
611 | box.__doc__ = doc.format(plot_doc=box_doc, xy=xy_doc, aggfunc='',
612 | order=xy_order, sort_values='')
613 |
614 | violin.__doc__ = doc.format(plot_doc=violin_doc, xy=xy_doc, aggfunc='',
615 | order=xy_order, sort_values='')
616 |
617 | hist.__doc__ = doc.format(plot_doc=hist_doc, xy=val_doc, aggfunc='',
618 | order=split_order, sort_values='')
619 |
620 | kde.__doc__ = doc.format(plot_doc=kde_doc, xy=val_doc, aggfunc='',
621 | order=split_order, sort_values='')
622 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Dexplot
2 |
3 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience.
4 |
5 | ## Goals
6 |
7 | The primary goals for dexplot are:
8 |
9 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots
10 | * Allow the user tremendous power without using matplotlib
11 |
12 |
13 | ## Installation
14 |
15 | `pip install dexplot`
16 |
17 | ## Built for long and wide data
18 |
19 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data.
20 |
21 | 
22 |
23 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price).
24 |
25 | 
26 |
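27 | That aggregated table can be produced from the long data with a pandas pivot (using the airbnb dataset loaded in the examples below; the same call appears again in the Wide data section):
28 | 
29 | ```python
30 | airbnb.pivot_table(index='neighborhood', columns='property_type',
31 |                    values='price', aggfunc='mean')
32 | ```
33 | 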
27 | ## Usage
28 |
29 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature:
30 |
31 | ```python
32 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...)
33 | ```
34 |
35 | * `x` - Column name along the x-axis
36 | * `y` - Column name along the y-axis
37 | * `data` - Pandas DataFrame
38 | * `aggfunc` - String of pandas aggregation function, 'min', 'max', 'mean', etc...
39 | * `split` - Column name to split data into distinct groups
40 | * `row` - Column name to split data into distinct subplots row-wise
41 | * `col` - Column name to split data into distinct subplots column-wise
42 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical.
43 |
44 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below.
45 |
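46 | For instance, the same median-price aggregation can be drawn vertically or horizontally by swapping `x` and `y` (both calls appear again in the examples below):
47 | 
48 | ```python
49 | # vertical: group by neighborhood (x), aggregate price (y)
50 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
51 | 
52 | # horizontal: group by neighborhood (y), aggregate price (x)
53 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
54 | ```
55 | 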
46 | ## Families of plots
47 |
48 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner.
49 |
50 | * Aggregation
51 | * bar
52 | * line
53 | * scatter
54 | * count
55 | * Distribution
56 | * box
57 | * violin
58 | * hist
59 | * kde
60 |
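61 | As a concrete contrast (both calls are taken from the examples below), an aggregation plot reduces each group to a single number, while a distribution plot keeps all of the values and shows their spread:
62 | 
63 | ```python
64 | # aggregation: one bar per neighborhood, each showing the median price of that group
65 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
66 | 
67 | # distribution: one box per neighborhood, drawn from all prices in that group
68 | dxp.box(x='price', y='neighborhood', data=airbnb)
69 | ```
70 | 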
61 | ## Comparison with Seaborn
62 |
63 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by seaborn. Below is a list of the extra features in dexplot not found in seaborn:
64 |
65 | * Ability to graph relative frequency and normalize over any number of variables
66 | * No need for multiple functions to do the same thing (far fewer public functions)
67 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot`
68 | * Pandas `groupby` methods available as strings
69 | * Ability to sort by values
70 | * Ability to sort x/y labels lexicographically
71 | * Ability to select most/least frequent groups
72 | * x/y labels are wrapped so that they don't overlap
73 | * Figure size (plus several other options) are available to change without using matplotlib
74 | * A matplotlib figure object is returned
75 |
76 | ## Examples
77 |
78 | Most of the examples below use long data.
79 |
80 | ## Aggregating plots - bar, line and scatter
81 |
82 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area.
83 |
84 |
85 | ```python
86 | import dexplot as dxp
87 | import pandas as pd
88 | airbnb = dxp.load_dataset('airbnb')
89 | airbnb.head()
90 | ```
91 |
92 |
93 | |   | neighborhood | property_type | accommodates | bathrooms | bedrooms | price | cleaning_fee | rating | superhost | response_time | latitude | longitude |
94 | |---|---|---|---|---|---|---|---|---|---|---|---|---|
95 | | 0 | Shaw | Townhouse | 16 | 3.5 | 4 | 433 | 250 | 95.0 | No | within an hour | 38.90982 | -77.02016 |
96 | | 1 | Brightwood Park | Townhouse | 4 | 3.5 | 4 | 154 | 50 | 97.0 | No | NaN | 38.95888 | -77.02554 |
97 | | 2 | Capitol Hill | House | 2 | 1.5 | 1 | 83 | 35 | 97.0 | Yes | within an hour | 38.88791 | -76.99668 |
98 | | 3 | Shaw | House | 2 | 2.5 | 1 | 475 | 0 | 98.0 | No | NaN | 38.91331 | -77.02436 |
99 | | 4 | Kalorama Heights | Apartment | 3 | 1.0 | 1 | 118 | 15 | 91.0 | No | within an hour | 38.91933 | -77.04124 |
100 | 
196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data.
197 |
198 |
199 | ```python
200 | airbnb.shape
201 | ```
202 |
203 |
204 |
205 |
206 | (4581, 12)
207 |
208 |
209 |
210 | ### Vertical bar charts
211 |
212 | In order to perform an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the column names automatically wrap.
213 |
214 |
215 | ```python
216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
217 | ```
218 |
219 |
220 |
221 |
222 | 
223 |
224 |
225 |
226 | Line and scatter plots can be created with the same command, just by substituting the name of the function. Neither is a good choice for this visualization, since the grouping variable (neighborhood) has no meaningful order.
227 |
228 |
229 | ```python
230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
231 | ```
232 |
233 |
234 |
235 |
236 | 
237 |
238 |
239 |
240 |
241 | ```python
242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
243 | ```
244 |
245 |
246 |
247 |
248 | 
249 |
250 |
251 |
252 | ### Components of the groupby aggregation
253 |
254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components:
255 |
256 | * Grouping column - unique values of this column form independent groups (neighborhood)
257 | * Aggregating column - the column that will get summarized with a single value (price)
258 | * Aggregating function - a function that returns a single value (median)
259 |
260 | The general format for doing this in pandas is:
261 |
262 | ```python
263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'})
264 | ```
265 |
266 | Specifically, the following code is executed within dexplot.
267 |
268 |
269 | ```python
270 | airbnb.groupby('neighborhood').agg({'price': 'median'})
271 | ```
272 |
273 | | neighborhood | price |
274 | |---|---|
275 | | Brightwood Park | 87.0 |
276 | | Capitol Hill | 129.5 |
277 | | Columbia Heights | 95.0 |
278 | | Dupont Circle | 125.0 |
279 | | Edgewood | 100.0 |
280 | | Kalorama Heights | 118.0 |
281 | | Shaw | 133.5 |
282 | | Union Station | 120.0 |
283 | 
326 |
327 | ### Number and percent of missing values with `'countna'` and `'percna'`
328 |
329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group.
330 |
331 |
332 | ```python
333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna')
334 | ```
335 |
336 |
337 |
338 |
339 | 
340 |
341 |
342 |
343 | ### Sorting the bars by values
344 |
345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value.
346 |
347 | * None - sort x/y axis labels alphabetically (default)
348 | * `asc` - sort values from least to greatest
349 | * `desc` - sort values from greatest to least
350 |
351 |
352 | ```python
353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
354 | ```
355 |
356 |
357 |
358 |
359 | 
360 |
361 |
362 |
363 | Here, we sort the values from greatest to least.
364 |
365 |
366 | ```python
367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
368 | ```
369 |
370 |
371 |
372 |
373 | 
374 |
375 |
376 |
377 | ### Specify order with `x_order`
378 |
379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars.
380 |
381 |
382 | ```python
383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
385 | ```
386 |
387 |
388 |
389 |
390 | 
391 |
392 |
393 |
394 | By default, `x_order` and all of the `_order` parameters are set to `'asc'`, which orders the labels alphabetically. Use the string `'desc'` to sort in the opposite direction.
395 |
396 |
397 | ```python
398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
399 | ```
400 |
401 |
402 |
403 |
404 | 
405 |
406 |
407 |
408 | ### Filter for the neighborhoods with most/least frequency of occurrence
409 |
410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column.
411 |
412 |
413 | ```python
414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
415 | x_order='top 4')
416 | ```
417 |
418 |
419 |
420 |
421 | 
422 |
423 |
424 |
425 | We can verify that these four neighborhoods are the most common.
426 |
427 |
428 | ```python
429 | airbnb['neighborhood'].value_counts()
430 | ```
431 |
432 |
433 |
434 |
435 | Columbia Heights 773
436 | Union Station 713
437 | Capitol Hill 654
438 | Edgewood 610
439 | Dupont Circle 549
440 | Shaw 514
441 | Brightwood Park 406
442 | Kalorama Heights 362
443 | Name: neighborhood, dtype: int64
444 |
445 |
446 |
447 | ### Horizontal bars
448 |
449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis.
450 |
451 |
452 | ```python
453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median',
454 | orientation='h', sort_values='desc')
455 | ```
456 |
457 |
458 |
459 |
460 | 
461 |
462 |
463 |
464 | Switching orientation is possible for most other plots.
465 |
466 |
467 | ```python
468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
469 | ```
470 |
471 |
472 |
473 |
474 | 
475 |
476 |
477 |
478 | ### Split bars into groups
479 |
480 | You can split each bar into further groups by setting the `split` parameter to another column.
481 |
482 |
483 | ```python
484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
485 | ```
486 |
487 |
488 |
489 |
490 | 
491 |
492 |
493 |
494 | We can use the `pivot_table` method to verify the results in pandas.
495 |
496 |
497 | ```python
498 | airbnb.pivot_table(index='superhost', columns='neighborhood',
499 | values='price', aggfunc='median')
500 | ```
501 |
502 | | superhost | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
503 | |---|---|---|---|---|---|---|---|---|
504 | | No | 85.0 | 129.0 | 90.5 | 120.0 | 100.0 | 110.0 | 130.0 | 120.0 |
505 | | Yes | 90.0 | 130.0 | 103.0 | 135.0 | 100.0 | 124.0 | 135.0 | 125.0 |
506 | 
559 |
560 | Set the order of the unique split values with `split_order`, which can also act as a filter.
561 |
562 |
563 | ```python
564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
565 | split='superhost', split_order=['Yes', 'No'])
566 | ```
567 |
568 |
569 |
570 |
571 | 
572 |
573 |
574 |
575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite.
576 |
577 |
578 | ```python
579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
580 | split='property_type', split_order='desc')
581 | ```
582 |
583 |
584 |
585 |
586 | 
587 |
588 |
589 |
590 | Filtering for the most/least frequent split categories is possible.
591 |
592 |
593 | ```python
594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
595 | split='property_type', split_order='bottom 2')
596 | ```
597 |
598 |
599 |
600 |
601 | 
602 |
603 |
604 |
605 | We can verify that the two least frequent property types are Townhouse and Condominium.
606 |
607 |
608 | ```python
609 | airbnb['property_type'].value_counts()
610 | ```
611 |
612 |
613 |
614 |
615 | Apartment 2403
616 | House 877
617 | Townhouse 824
618 | Condominium 477
619 | Name: property_type, dtype: int64
620 |
621 |
622 |
623 | ### Stacked bar charts
624 |
625 | Stack all the split groups one on top of the other by setting `stacked` to `True`.
626 |
627 |
628 | ```python
629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
630 | split='superhost', split_order=['Yes', 'No'], stacked=True)
631 | ```
632 |
633 |
634 |
635 |
636 | 
637 |
638 |
639 |
640 | ### Split into multiple plots
641 |
642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot.
643 |
644 |
645 | ```python
646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
647 | split='superhost', col='property_type')
648 | ```
649 |
650 |
651 |
652 |
653 | 
654 |
655 |
656 |
657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer that caps the number of plots per row/col. We also set `col_order` to `'desc'` to arrange the plots in descending alphabetical order.
658 |
659 |
660 | ```python
661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
662 | split='superhost', col='property_type', wrap=2, col_order='desc')
663 | ```
664 |
665 |
666 |
667 |
668 | 
669 |
670 |
671 |
672 | Use `col_order` to both filter and set a specific order for the plots.
673 |
674 |
675 | ```python
676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
677 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
678 | ```
679 |
680 |
681 |
682 |
683 | 
684 |
685 |
686 |
687 | Splits can be made simultaneously along row and columns.
688 |
689 |
690 | ```python
691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
693 | row='bedrooms', row_order=[1, 2, 3])
694 | ```
695 |
696 |
697 |
698 |
699 | 
700 |
701 |
702 |
703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`.
704 |
705 |
706 | ```python
707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
709 | row='bedrooms', row_order=[1, 2, 3], sharey=False)
710 | ```
711 |
712 |
713 |
714 |
715 | 
716 |
717 |
718 |
719 | ### Set the width of each bar with `size`
720 |
721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value.
722 |
723 |
724 | ```python
725 | dxp.bar(x='neighborhood', y='price', data=airbnb,
726 | aggfunc='median', split='property_type',
727 | split_order=['Apartment', 'House'],
728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
729 | ```
730 |
731 |
732 |
733 |
734 | 
735 |
736 |
737 |
738 | ### Splitting line plots
739 |
740 | All the other aggregating plots work similarly.
741 |
742 |
743 | ```python
744 | dxp.line(x='neighborhood', y='price', data=airbnb,
745 | aggfunc='median', split='property_type',
746 | split_order=['Apartment', 'House'],
747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'])
748 | ```
749 |
750 |
751 |
752 |
753 | 
754 |
755 |
756 |
757 | ## Distribution plots - box, violin, histogram, kde
758 |
759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed.
760 |
761 | ### Box plots
762 |
763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers extend up to 1.5 times the interquartile range (IQR, the difference between the third and first quartiles) beyond the ends of the box. Fliers are the points outside this range and are plotted individually. By default, both box and violin plots are plotted horizontally.
764 |
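765 | As a small illustration of how those pieces are computed (plain numpy here, not dexplot code), the box edges and whisker limits for a set of prices come from the quartiles:
766 | 
767 | ```python
768 | import numpy as np
769 | 
770 | prices = np.array([80, 95, 100, 120, 150, 400])
771 | q1, q3 = np.percentile(prices, [25, 75])        # box edges
772 | iqr = q3 - q1
773 | lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # whisker limits; points beyond them are fliers
774 | ```
775 | 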
765 |
766 | ```python
767 | dxp.box(x='price', y='neighborhood', data=airbnb)
768 | ```
769 |
770 |
771 |
772 |
773 | 
774 |
775 |
776 |
777 | Split the groups in the same manner as with the aggregation plots.
778 |
779 |
780 | ```python
781 | dxp.box(x='price', y='neighborhood', data=airbnb,
782 | split='superhost', split_order=['Yes', 'No'])
783 | ```
784 |
785 |
786 |
787 |
788 | 
789 |
790 |
791 |
792 | Order the appearance of the splits alphabetically (in descending order here).
793 |
794 |
795 | ```python
796 | dxp.box(x='price', y='neighborhood', data=airbnb,
797 | split='property_type', split_order='desc')
798 | ```
799 |
800 |
801 |
802 |
803 | 
804 |
805 |
806 |
807 | ### Filter range of values with `x_order`
808 |
809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`.
810 |
811 |
812 | ```python
813 | dxp.box(x='price', y='neighborhood', data=airbnb,
814 | split='superhost', x_order=[50, 250])
815 | ```
816 |
817 |
818 |
819 |
820 | 
821 |
822 |
823 |
824 | Swap `x` and `y` while setting `orientation` to `'v'` to make vertical box plots.
825 |
826 |
827 | ```python
828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v',
829 | split='property_type', split_order='top 2')
830 | ```
831 |
832 |
833 |
834 |
835 | 
836 |
837 |
838 |
839 | Violin plots work identically to box plots, but show "violins": kernel density estimates mirrored on both sides of a line.
840 |
841 |
842 | ```python
843 | dxp.violin(x='price', y='neighborhood', data=airbnb,
844 | split='superhost', split_order=['Yes', 'No'])
845 | ```
846 |
847 |
848 |
849 |
850 | 
851 |
852 |
853 |
854 | Splitting by rows and columns is possible as well with distribution plots.
855 |
856 |
857 | ```python
858 | dxp.box(x='price', y='neighborhood', data=airbnb,split='superhost',
859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
860 | row='bedrooms', row_order=[1, 2])
861 | ```
862 |
863 |
864 |
865 |
866 | 
867 |
868 |
869 |
870 | ### Histograms
871 |
872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you give a single numeric column to `val`. By default, a vertical histogram of the counts with 20 bins is created.
873 |
874 |
875 | ```python
876 | dxp.hist(val='price', data=airbnb)
877 | ```
878 |
879 |
880 |
881 |
882 | 
883 |
884 |
885 |
886 | We can use `split` just like we did above and also create horizontal histograms.
887 |
888 |
889 | ```python
890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15)
891 | ```
892 |
893 |
894 |
895 |
896 | 
897 |
898 |
899 |
900 | Here, we customize the histogram by plotting the cumulative density instead of the raw frequency count and drawing only the outline of the bars (`histtype='step'`).
901 |
902 |
903 | ```python
904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3],
905 | bins=30, density=True, histtype='step', cumulative=True)
906 | ```
907 |
908 |
909 |
910 |
911 | 
912 |
913 |
914 |
915 | ### KDE Plots
916 |
917 | Kernel density estimates provide an estimate for the probability distribution of a continuous variable. Here, we examine how price is distributed by bedroom.
918 |
919 |
920 | ```python
921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3])
922 | ```
923 |
924 |
925 |
926 |
927 | 
928 |
929 |
930 |
931 | Graph the cumulative distribution instead, split across multiple plots.
932 |
933 |
934 | ```python
935 | dxp.kde(x='price', data=airbnb, split='bedrooms',
936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2)
937 | ```
938 |
939 |
940 |
941 |
942 | 
943 |
944 |
945 |
946 | ### Two-dimensional KDEs
947 | 
948 | Provide two numeric columns to `x` and `y` to get a two-dimensional KDE.
949 |
950 |
951 | ```python
952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb)
953 | ```
954 |
955 |
956 |
957 |
958 | 
959 |
960 |
961 |
962 | Create a grid of two-dimensional KDEs.
963 |
964 |
965 | ```python
966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3)
967 | ```
968 |
969 |
970 |
971 |
972 | 
973 |
974 |
975 |
976 | ## Count plots
977 |
978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order.
979 |
980 |
981 | ```python
982 | dxp.count(val='neighborhood', data=airbnb)
983 | ```
984 |
985 |
986 |
987 |
988 | 
989 |
990 |
991 |
992 | In pandas, this is a straightforward call to the `value_counts` method.
993 |
994 |
995 | ```python
996 | airbnb['neighborhood'].value_counts()
997 | ```
998 |
999 |
1000 |
1001 |
1002 | Columbia Heights 773
1003 | Union Station 713
1004 | Capitol Hill 654
1005 | Edgewood 610
1006 | Dupont Circle 549
1007 | Shaw 514
1008 | Brightwood Park 406
1009 | Kalorama Heights 362
1010 | Name: neighborhood, dtype: int64
1011 |
1012 |
1013 |
1014 | ### Relative frequency with `normalize`
1015 |
1016 | Instead of the raw counts, get the relative frequency by setting `normalize` to `True`.
1017 |
1018 |
1019 | ```python
1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True)
1021 | ```
1022 |
1023 |
1024 |
1025 |
1026 | 
1027 |
1028 |
1029 |
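1030 | In pandas, the same relative frequencies come from `value_counts` with `normalize=True`. This is a sketch of the equivalent computation, not the exact code dexplot runs internally:
1031 |
1032 | ```python
1033 | airbnb['neighborhood'].value_counts(normalize=True)
1034 | ```
1035 |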
1030 | Here, we split by property type.
1031 |
1032 |
1033 | ```python
1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type')
1035 | ```
1036 |
1037 |
1038 |
1039 |
1040 | 
1041 |
1042 |
1043 |
1044 | In pandas, this is done with the `crosstab` function.
1045 |
1046 |
1047 | ```python
1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'])
1049 | ```
1050 |
1051 |
1052 | | property_type | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
1053 | |:--------------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
1054 | | Apartment | 167 | 299 | 374 | 397 | 244 | 284 | 315 | 323 |
1055 | | Condominium | 35 | 70 | 97 | 62 | 65 | 42 | 52 | 54 |
1056 | | House | 131 | 137 | 157 | 47 | 146 | 23 | 61 | 175 |
1057 | | Townhouse | 73 | 148 | 145 | 43 | 155 | 13 | 86 | 161 |
1130 |
1131 | Count plots can also be drawn horizontally and stacked.
1132 |
1133 |
1134 | ```python
1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type',
1136 | orientation='h', stacked=True, col='superhost')
1137 | ```
1138 |
1139 |
1140 |
1141 |
1142 | 
1143 |
1144 |
1145 |
1146 | ### Normalize over different variables
1147 |
1148 | Setting `normalize` to `True` returns the relative frequency with respect to all of the data. You can instead normalize over any of the variables provided by passing its column name.
1149 |
1150 |
1151 | ```python
1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood',
1153 | title='Relative Frequency by Neighborhood')
1154 | ```
1155 |
1156 |
1157 |
1158 |
1159 | 
1160 |
1161 |
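1162 | For this single-variable case, a roughly equivalent pandas computation uses `crosstab` with its `normalize` option, normalizing within each neighborhood column. This is an illustrative sketch, not dexplot's internal code:
1163 |
1164 | ```python
1165 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'],
1166 |             normalize='columns')
1167 | ```
1168 |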
1162 |
1163 | Normalize over several variables at once with a list.
1164 |
1165 |
1166 | ```python
1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost',
1168 | row='property_type', col='bedrooms', col_order=[1, 2],
1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True)
1170 | ```
1171 |
1172 |
1173 |
1174 |
1175 | 
1176 |
1177 |
1178 |
1179 | ## Wide data
1180 |
1181 | Dexplot can also plot data where no aggregation happens, including wide data. Here is a scatter plot of the location of each listing.
1182 |
1183 |
1184 | ```python
1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb,
1186 | split='neighborhood', col='bedrooms', col_order=[2, 3])
1187 | ```
1188 |
1189 |
1190 |
1191 |
1192 | 
1193 |
1194 |
1195 |
1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`.
1197 |
1198 |
1199 | ```python
1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type',
1201 | values='price', aggfunc='mean')
1202 | df
1203 | ```
1204 |
1205 |
1206 | | neighborhood | Apartment | Condominium | House | Townhouse |
1207 | |:-------------|----------:|------------:|-----------:|-----------:|
1208 | | Brightwood Park | 96.119760 | 105.000000 | 121.671756 | 133.479452 |
1209 | | Capitol Hill | 141.210702 | 104.200000 | 170.153285 | 184.459459 |
1210 | | Columbia Heights | 114.676471 | 126.773196 | 135.292994 | 124.358621 |
1211 | | Dupont Circle | 146.858942 | 130.709677 | 179.574468 | 139.348837 |
1212 | | Edgewood | 108.508197 | 112.846154 | 156.335616 | 147.503226 |
1213 | | Kalorama Heights | 122.542254 | 155.928571 | 92.695652 | 158.230769 |
1214 | | Shaw | 153.888889 | 158.500000 | 202.114754 | 173.279070 |
1215 | | Union Station | 128.458204 | 133.833333 | 162.748571 | 162.167702 |
1288 |
1289 |
1290 |
1291 | ```python
1292 | dxp.bar(data=df, orientation='h')
1293 | ```
1294 |
1295 |
1296 |
1297 |
1298 | 
1299 |
1300 |
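1301 | For comparison, pandas can plot the same pre-aggregated wide frame with its own matplotlib-based plotting wrapper; the dexplot version differs mainly in styling and in returning the figure:
1302 |
1303 | ```python
1304 | df.plot.barh()
1305 | ```
1306 |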
1301 |
1302 | ### Time series
1303 |
1304 |
1305 | ```python
1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date')
1307 | stocks.head()
1308 | ```
1309 |
1310 |
1311 | | date | MSFT | AAPL | SLB | AMZN | TSLA | XOM | WMT | T | FB | V |
1312 | |:-----|-----:|-----:|------:|------:|-----:|------:|------:|------:|---:|--:|
1313 | | 1999-10-25 | 29.84 | 2.32 | 17.02 | 82.75 | NaN | 21.45 | 38.99 | 16.78 | NaN | NaN |
1314 | | 1999-10-26 | 29.82 | 2.34 | 16.65 | 81.25 | NaN | 20.89 | 37.11 | 17.28 | NaN | NaN |
1315 | | 1999-10-27 | 29.33 | 2.38 | 16.52 | 75.94 | NaN | 20.80 | 36.94 | 18.27 | NaN | NaN |
1316 | | 1999-10-28 | 29.01 | 2.43 | 16.59 | 71.00 | NaN | 21.19 | 38.85 | 19.79 | NaN | NaN |
1317 | | 1999-10-29 | 29.88 | 2.50 | 17.21 | 70.62 | NaN | 21.47 | 39.25 | 20.00 | NaN | NaN |
1414 |
1415 |
1416 | ```python
1417 | dxp.line(data=stocks.head(500))
1418 | ```
1419 |
1420 |
1421 |
1422 |
1423 | 
1424 |
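1425 | For comparison, pandas can draw a similar (unstyled) line chart directly from the same wide frame:
1426 |
1427 | ```python
1428 | stocks.head(500).plot()
1429 | ```
1430 |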
1425 |
1426 |
--------------------------------------------------------------------------------
/dexplot/_common_plot.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | import warnings
3 | from collections import defaultdict
4 | import io
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 | from matplotlib import ticker
10 | from matplotlib.colors import Colormap
11 |
12 |
13 | NONETYPE = type(None)
14 |
15 | class CommonPlot:
16 |
17 |
18 | def __init__(self, x, y, data, aggfunc, split, row, col,
19 | x_order, y_order, split_order, row_order, col_order,
20 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
21 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
22 | x_textwrap, y_textwrap, x_rot, y_rot,
23 | check_numeric=False, kind=None):
24 |
25 | self.used_columns = set()
26 | self.data = self.get_data(data)
27 | self.x = self.get_col(x)
28 | self.y = self.get_col(y)
29 | self.validate_x_y()
30 | self.orientation = orientation
31 | self.aggfunc = self.get_aggfunc(aggfunc)
32 | self.groupby = self.get_groupby()
33 | self.split = self.get_col(split)
34 | self.row = self.get_col(row)
35 | self.col = self.get_col(col)
36 |
37 | self.agg = self.set_agg()
38 | self.make_groups_categorical(kind)
39 | self.validate_numeric(check_numeric)
40 |
41 | self.x_order = self.validate_order(x_order, 'x')
42 | self.y_order = self.validate_order(y_order, 'y')
43 | self.split_order = self.validate_order(split_order, 'split')
44 | self.row_order = self.validate_order(row_order, 'row')
45 | self.col_order = self.validate_order(col_order, 'col')
46 | self.filter_data()
47 | self.groupby_order = self.get_groupby_order()
48 |
49 | self.sort_values = sort_values
50 | self.groupby_sort = True
51 | self.wrap = wrap
52 | self.figsize = figsize
53 | self.title = title
54 | self.sharex = sharex
55 | self.sharey = sharey
56 | self.xlabel = xlabel
57 | self.ylabel = ylabel
58 | self.xlim = xlim
59 | self.ylim = ylim
60 | self.xscale = xscale
61 | self.yscale = yscale
62 | self.colors = self.get_colors(cmap)
63 | self.x_textwrap = x_textwrap
64 | self.y_textwrap = y_textwrap
65 | self.x_rot = x_rot
66 | self.y_rot = y_rot
67 |
68 | self.validate_args()
69 | self.plot_type = self.get_plot_type()
70 | self.agg_kind = self.get_agg_kind()
71 | self.data = self.set_index()
72 | self.rows, self.cols = self.get_uniques()
73 | self.rows, self.cols = self.get_row_col_order()
74 | self.fig_shape = self.get_fig_shape()
75 |
76 |
77 | def get_data(self, data):
78 | if isinstance(data, pd.Series):
79 | return data.to_frame()
80 |
81 | if not isinstance(data, pd.DataFrame):
82 | raise TypeError('`data` must be a pandas DataFrame or Series')
83 | elif len(data) == 0:
84 | raise ValueError('DataFrame contains no data')
85 | return data.copy()
86 |
87 | def get_col(self, col, group=False):
88 | if col:
89 |             if col not in self.data.columns:
90 |                 raise KeyError(f'{col} is not a column in the DataFrame')
93 |
94 | if col in self.used_columns:
95 | raise ValueError(f'Column {col} has already been chosen. '
96 | '`x`, `y`, `split`, `row`, and `col` must all be unique.')
97 | self.used_columns.add(col)
98 | return col
99 |
100 | def validate_x_y(self):
101 | if self.x == self.y and self.x is not None and self.y is not None:
102 | raise ValueError('`x` and `y` cannot be the same column name')
103 |
104 | def get_aggfunc(self, aggfunc):
105 | if aggfunc == 'countna':
106 | return lambda x: x.isna().sum()
107 | if aggfunc == 'percna':
108 | return lambda x: x.isna().mean()
109 | return aggfunc
110 |
111 | def get_groupby(self):
112 | if self.x is None or self.y is None or self.aggfunc is None:
113 | return
114 | return self.x if self.orientation == 'v' else self.y
115 |
116 | def set_agg(self):
117 | return self.y if self.orientation == 'v' else self.x
118 |
119 | def filter_data(self):
120 | params = 'x', 'y', 'split', 'row', 'col'
121 | for param in params:
122 | name, order = getattr(self, param), getattr(self, param + '_order')
123 | if name and order:
124 | s = self.data[name]
125 | if isinstance(order, list):
126 | if s.dtype.kind == 'O':
127 | for val in order:
128 | if not (s == val).any():
129 | raise ValueError(f'Value {val} is not in column {name}')
130 | self.data = self.data[s.isin(order)]
131 | else:
132 | # allow datetimes?
133 | if len(order) != 2:
134 | raise ValueError(f'You are filtering {name}. Provide a two-item list '
135 | 'of the min and max values')
136 | self.data = self.data[s.between(*order)]
137 | elif isinstance(order, int):
138 | vc = s.value_counts()
139 | if order > 0:
140 | idx = vc.index[:order]
141 | else:
142 | idx = vc.index[order:]
143 | self.data = self.data[s.isin(idx)]
144 | setattr(self, param +'_order', idx.tolist())
145 |
146 | if name and self.data[name].dtype.name == 'category':
147 | self.data[name].cat.remove_unused_categories(inplace=True)
148 |
149 | def make_groups_categorical(self, kind):
150 | category_cols = [self.groupby, self.split, self.row, self.col]
151 | for col in category_cols:
152 | if col:
153 | if self.data[col].dtype.name != 'category':
154 | self.data[col] = self.data[col].astype('category')
155 | if kind == 'count':
156 | col = self.x or self.y
157 | if self.data[col].dtype.name != 'category':
158 | self.data[col] = self.data[col].astype('category')
159 |
160 | def validate_numeric(self, check_numeric):
161 | if check_numeric:
162 | for val in (self.x, self.y):
163 | if val and self.data[val].dtype.kind not in ('i', 'f', 'b'):
164 | raise TypeError(f'Column {val} must be numeric (integer or float)')
165 |
166 | def validate_order(self, order, kind):
167 | if isinstance(order, str):
168 | order = order.strip().lower()
169 | if order in ('asc', 'desc'):
170 | return order
171 | command = order.split()
172 | if len(command) != 2 or command[0] not in ('top', 'bottom'):
173 |                 raise ValueError(f'{kind}_order string must be "asc", "desc", or '
174 |                                  '"top"/"bottom" followed by a space and an integer.')
175 | mult = int(command[0] == "top") * 2 - 1
176 | try:
177 | num = int(command[1])
178 | except ValueError:
179 | raise ValueError(f'{command[1]} is not a valid integer')
180 | if num == 0:
181 | raise ValueError('Number cannot be 0')
182 | return num * mult
183 | elif isinstance(order, (tuple, list)):
184 | return list(order)
185 | elif hasattr(order, 'tolist'):
186 | return order.tolist()
187 | elif order is not None:
188 | raise TypeError(f'{kind}_order must be a str or tuple/list/array/series.')
189 |
190 | def get_groupby_order(self):
191 | if self.x == self.groupby:
192 | return self.x_order
193 | if self.y == self.groupby:
194 | return self.y_order
195 |
196 | def get_colors(self, cmap):
197 | if cmap is None:
198 | cmap = 't10'
199 |
200 | if isinstance(cmap, str):
201 | from .colors._colormaps import colormaps
202 | try:
203 | return colormaps[cmap.lower()]
204 | except KeyError:
205 | raise KeyError(f'Colormap {cmap} does not exist. Here are the '
206 | f'possible colormaps: {colormaps.keys()}')
207 | elif isinstance(cmap, Colormap):
208 | return cmap(range(cmap.N)).tolist()
209 | elif isinstance(cmap, list):
210 | return cmap
211 | elif isinstance(cmap, tuple):
212 | return list(cmap)
213 | elif hasattr(cmap, 'tolist'):
214 | return cmap.tolist()
215 | else:
216 | raise TypeError('`cmap` must be a string name of a colormap, a matplotlib colormap '
217 | 'instance, list, or tuple of colors')
218 |
219 | def validate_args(self):
220 | self.validate_plot_args()
221 | self.validate_mpl_args()
222 | self.validate_sort_values()
223 |
224 | def validate_plot_args(self):
225 | if self.orientation not in ('v', 'h'):
226 | raise ValueError('`orientation` must be either "v" or "h".')
227 |
228 | if not isinstance(self.wrap, (np.integer, int, NONETYPE)):
229 |             raise TypeError(f'`wrap` must either be None or an integer, not {type(self.wrap)}')
230 |
231 | if self.row and self.col and self.wrap is not None:
232 | raise ValueError('You cannot provide a value for `wrap` if `row` '
233 | 'and `col` are also provided')
234 |
235 | def validate_mpl_args(self):
236 | if not isinstance(self.title, (NONETYPE, str)):
237 | raise TypeError('`title` must be either None or a str')
238 | if self.sharex not in (False, True, None, 'row', 'col'):
239 | raise ValueError('`sharex` must be one of `False`, `True`, `None`, "row", or "col"')
240 | if self.sharey not in (False, True, None, 'row', 'col'):
241 |             raise ValueError('`sharey` must be one of `False`, `True`, `None`, "row", or "col"')
242 |
243 | if not isinstance(self.xlabel, (NONETYPE, str)):
244 | raise TypeError('`xlabel` must be either None or a str')
245 | if not isinstance(self.ylabel, (NONETYPE, str)):
246 | raise TypeError('`ylabel` must be either None or a str')
247 |
248 | if not isinstance(self.xlim, (NONETYPE, tuple)):
249 | raise TypeError('`xlim` must be a two-item tuple of numerics or `None`')
250 | if not isinstance(self.ylim, (NONETYPE, tuple)):
251 |             raise TypeError('`ylim` must be a two-item tuple of numerics or `None`')
252 |         if self.xscale not in ('linear', 'log', 'symlog', 'logit'):
253 |             raise ValueError("`xscale` must be one of 'linear', 'log', 'symlog', 'logit'")
254 |         if self.yscale not in ('linear', 'log', 'symlog', 'logit'):
255 |             raise ValueError("`yscale` must be one of 'linear', 'log', 'symlog', 'logit'")
256 |
257 | def validate_sort_values(self):
258 | if self.sort_values not in ['asc', 'desc', None]:
259 | raise ValueError('`sort_values` must be one of "asc", "desc", or `None`')
260 | if self.sort_values and (self.split or self.row or self.col):
261 | raise ValueError('Can only use `sort_values` if `split`, `row`, and `col` are `None`.')
262 |
263 | def get_plot_type(self):
264 | if self.row and self.col:
265 | return 'square'
266 | if self.row:
267 | return 'row_only'
268 | if self.col:
269 | return 'col_only'
270 | return 'single'
271 |
272 | def get_agg_kind(self):
273 | if self.agg:
274 | # string and category use 'O'
275 | agg_kind = self.data[self.agg].dtype.kind
276 | return agg_kind
277 |
278 | def set_index(self):
279 | data = self.data
280 | rc = []
281 | if self.row:
282 | rc.append(self.row)
283 | if self.col:
284 | rc.append(self.col)
285 | if rc:
286 | data = data.set_index(rc)
287 | return data
288 |
289 | def get_uniques(self):
290 | if self.plot_type == 'single':
291 | return None, None
292 | elif self.plot_type == 'row_only':
293 | return self.data.index.unique(), None
294 | elif self.plot_type == 'col_only':
295 | return None, self.data.index.unique()
296 | else:
297 | return self.data.index.levels
298 |
299 | def get_row_col_order(self):
300 | rows, cols = self.rows, self.cols
301 | if rows is not None:
302 | if self.row_order == 'desc':
303 | rows = sorted(rows, reverse=True)
304 | else:
305 | rows = sorted(rows)
306 | if cols is not None:
307 | if self.col_order == 'desc':
308 | cols = sorted(cols, reverse=True)
309 | else:
310 | cols = sorted(cols)
311 |
312 | if isinstance(self.row_order, list):
313 | new_rows = []
314 | for row in self.row_order:
315 | if row not in rows:
316 | raise ValueError(f'Row value {row} does not exist')
317 | new_rows.append(row)
318 | rows = new_rows
319 | if isinstance(self.col_order, list):
320 | new_cols = []
321 | for col in self.col_order:
322 | if col not in cols:
323 | raise ValueError(f'Column value {col} does not exist')
324 | new_cols.append(col)
325 | cols = new_cols
326 | return rows, cols
327 |
328 | def get_fig_shape(self):
329 | if self.plot_type == 'single':
330 | return 1, 1
331 |
332 | nrows = ncols = 1
333 | if self.rows is not None:
334 | nrows = len(self.rows)
335 | if self.cols is not None:
336 | ncols = len(self.cols)
337 |
338 | if self.wrap:
339 | if self.plot_type == 'row_only':
340 |                 ncols = (nrows - 1) // self.wrap + 1  # enough columns to hold all row facets
341 |                 nrows = min(nrows, self.wrap)         # cap panels per column at `wrap`
342 | elif self.plot_type == 'col_only':
343 | nrows = (ncols - 1) // self.wrap + 1
344 | ncols = min(ncols, self.wrap)
345 | return nrows, ncols
346 |
347 | def get_data_for_every_plot(self):
348 | # TODO: catch keyerror for groups that dont exist
349 | rows, cols = self.get_row_col_order()
350 | if self.plot_type == 'row_only':
351 | return [(row, self.data.loc[row]) for row in rows]
352 |         if self.plot_type == 'col_only':
353 | return [(col, self.data.loc[col]) for col in cols]
354 | elif self.plot_type == 'square':
355 | groups = []
356 | for col in cols:
357 | for row in rows:
358 | group = row, col
359 | try:
360 | with warnings.catch_warnings():
361 | warnings.simplefilter("ignore")
362 | data = self.data.loc[group]
363 | except (KeyError, TypeError):
364 | data = self.data.iloc[:0]
365 | groups.append((group, data))
366 | return groups
367 | else:
368 | return [(None, self.data)]
369 |
370 | def get_labels(self, labels):
371 | # this won't work for wrapping
372 | if self.plot_type == 'square':
373 | return str(labels[0]), str(labels[1])
374 | elif self.plot_type == 'row_only':
375 | return str(labels), None
376 | elif self.plot_type == 'col_only':
377 | return None, str(labels)
378 | return None, None
379 |
380 | def sort_values_xy(self, x, y):
381 | grp, num = (x, y) if self.orientation == 'v' else (y, x)
382 | if self.sort_values is None:
383 | return x, y
384 | elif self.sort_values == 'asc':
385 | order = np.lexsort([grp, num])
386 | else:
387 | order = np.lexsort([grp, -num])
388 | if self.orientation == 'h':
389 | order = order[::-1]
390 | return x[order], y[order]
391 |
392 | def get_order(self, arr, vals):
393 | arr = arr.tolist()
394 | order = []
395 | for val in vals:
396 | try:
397 | idx = arr.index(val)
398 | except ValueError:
399 | raise ValueError(f'{val} is not a valid column value')
400 | order.append(idx)
401 | return order
402 |
403 | def reverse_order(self, order):
404 | cond1 = order == 'desc' and self.orientation == 'v'
405 | cond2 = order in ('asc', None) and self.orientation == 'h'
406 | return cond1 or cond2
407 |
408 | def order_xy(self, x, y):
409 | if self.x_order and self.x != self.agg:
410 | if isinstance(self.x_order, list):
411 | order = self.get_order(x, self.x_order)
412 | elif self.reverse_order(self.x_order):
413 | order = np.lexsort([x])[::-1]
414 | else:
415 | return x, y
416 | elif self.y_order and self.y != self.agg:
417 | if isinstance(self.y_order, list):
418 | order = self.get_order(y, self.y_order)
419 | elif self.reverse_order(self.y_order):
420 | order = np.lexsort([y])[::-1]
421 | else:
422 | return x, y
423 | else:
424 | return x, y
425 | return x[order], y[order]
426 |
427 | def get_correct_data_order(self, x, y):
428 | x, y = self.sort_values_xy(x, y)
429 | if self.sort_values is None:
430 | x, y = self.order_xy(x, y)
431 | return x, y
432 |
433 | def get_wide_data(self, data):
434 | x = data.index.values
435 | y = {col: data[col].values for col in data.columns}
436 | if self.orientation == 'h':
437 | x, y = y, x
438 | return x, y
439 |
440 | def get_wide_columns(self, data):
441 | cols = []
442 | used_cols = [self.groupby, self.split, self.row, self.col]
443 | for col in data.columns:
444 | if col not in used_cols:
445 | cols.append(col)
446 | return cols
447 |
448 | def get_ordered_groups(self, data, specific_order, kind):
449 | # used for split and groupby groups
450 | order = []
451 | groups = []
452 | sort = specific_order is not None
453 | # TODO: Need to decide defaults for x_order, y_order etc... either None or 'asc'
454 | for grp, data_grp in data.groupby(getattr(self, kind), sort=True):
455 | order.append((grp, data_grp))
456 | groups.append(grp)
457 |
458 | if isinstance(specific_order, list):
459 | new_order = []
460 | for grp in specific_order:
461 | try:
462 | idx = groups.index(grp)
463 | except ValueError:
464 | col = getattr(self, kind)
465 | raise ValueError(f'Value "{grp}" from `{kind}_order` is '
466 | f'not in column {col}')
467 |
468 | new_order.append(idx)
469 | order = [order[i] for i in new_order]
470 | elif specific_order == 'desc':
471 | new_order = np.lexsort([groups])[::-1]
472 | order = [order[i] for i in new_order]
473 |
474 | return order
475 |
476 | def get_final_groups(self, data, split_label, row_label, col_label):
477 | groups = []
478 | if self.aggfunc == '__distribution__':
479 | if self.groupby is not None:
480 | for grp, data_grp in self.get_ordered_groups(data, self.groupby_order, 'groupby'):
481 | vals = data_grp[self.agg]
482 | groups.append((vals, split_label, grp, row_label, col_label))
483 | else:
484 | col = self.x or self.y
485 | vals = data[col]
486 |                 groups.append((vals, split_label, col, row_label, col_label))  # label with the plotted column
487 | elif self.groupby is not None:
488 | try:
489 | s = data.groupby(self.groupby, sort=self.groupby_sort)[self.agg].agg(self.aggfunc)
490 | except Exception as e:
491 | if type(e).__name__ == 'DataError':
492 | raise ValueError(f'The aggregating column {self.agg} is not numeric and '
493 | f'cannot be aggregated with {self.aggfunc}. You might need '
494 | 'to switch x and y')
495 | else:
496 | raise e
497 | x, y = s.index.values, s.values
498 | x, y = (x, y) if self.orientation == 'v' else (y, x)
499 | x, y = self.get_correct_data_order(x, y)
500 | groups.append((x, y, split_label, self.groupby, row_label, col_label))
501 | elif self.x is None or self.y is None:
502 | if self.x:
503 | s = data[self.x]
504 | x, y = s.values, s.index.values
505 | x, y = self.get_correct_data_order(x, y)
506 | groups.append((x, y, split_label, self.x, row_label, col_label))
507 | elif self.y:
508 | s = data[self.y]
509 | x, y = s.index.values, s.values
510 | x, y = self.get_correct_data_order(x, y)
511 | groups.append((x, y, split_label, self.y, row_label, col_label))
512 | else:
513 | # wide data
514 | for col in self.get_wide_columns(data):
515 | s = data[col]
516 | x, y = s.index.values, s.values
517 | x, y = self.get_correct_data_order(x, y)
518 | x, y = (x, y) if self.orientation == 'v' else (y, x)
519 | groups.append((x, y, col, None, row_label, col_label))
520 | else:
521 | # simple raw plot - make sure to warn when lots of data for bar/box/hist
522 | # one graph per row - OK for scatterplots and line plots
523 | x, y = self.get_correct_data_order(data[self.x], data[self.y])
524 | groups.append((x, y, split_label, None, row_label, col_label))
525 | return groups
526 |
527 | def get_x_y_plot(self, x, y):
528 | x_plot, y_plot = x, y
529 | if x_plot.dtype.kind == 'O':
530 | x_plot = np.arange(len(x_plot))
531 | if y_plot.dtype.kind == 'O':
532 | y_plot = np.arange(len(y_plot))
533 | return x_plot, y_plot
534 |
535 | def get_distribution_data(self, info):
536 | cur_data = defaultdict(list)
537 | cur_ticklabels = defaultdict(list)
538 | for vals, split_label, col_name, row_label, col_label in info:
539 | cur_data[split_label].append(vals)
540 | cur_ticklabels[split_label].append(col_name)
541 | return cur_data, cur_ticklabels
542 |
543 |
544 | class MPLCommon(CommonPlot):
545 |
546 | def __init__(self, x, y, data, aggfunc, split, row, col,
547 | x_order, y_order, split_order, row_order, col_order,
548 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
549 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
550 | x_textwrap, y_textwrap, x_rot, y_rot,
551 | check_numeric=False, kind=None):
552 | super().__init__(x, y, data, aggfunc, split, row, col,
553 | x_order, y_order, split_order, row_order, col_order,
554 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
555 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
556 | x_textwrap, y_textwrap, x_rot, y_rot,
557 |                          check_numeric=check_numeric, kind=kind)
558 | self.figsize = self.get_figsize()
559 | self.user_figsize = self.figsize is not None
560 | self.original_rcParams = plt.rcParams.copy()
561 | self.set_rcParams()
562 | self.fig, self.axs = self.create_figure()
563 | self.set_color_cycle()
564 | self.data_for_plots = self.get_data_for_every_plot()
565 | self.final_data = self.get_final_data()
566 | self.style_fig()
567 | self.add_ax_titles()
568 | self.add_fig_title()
569 |
570 | def get_figsize(self):
571 | if self.figsize is None:
572 | return
573 | elif isinstance(self.figsize, (list, tuple)):
574 | if len(self.figsize) != 2:
575 | raise ValueError('figsize must be a two-item tuple/list')
576 |             for val in self.figsize:
577 |                 if not isinstance(val, (int, float)):
578 |                     raise ValueError('Each item in figsize must be an integer or a float')
579 |             # a valid user-supplied figsize is used as-is
580 |             return tuple(self.figsize)
581 |         else:
582 |             raise TypeError('figsize must be a two-item tuple')
583 |
584 | def create_figure(self):
585 | fig = plt.Figure(tight_layout=True, dpi=144, figsize=self.figsize)
586 | axs = fig.subplots(*self.fig_shape, sharex=self.sharex, sharey=self.sharey)
587 | if self.fig_shape != (1, 1):
588 | axs = axs.flatten(order='F')
589 | else:
590 | axs = [axs]
591 | return fig, axs
592 |
593 | def set_color_cycle(self):
594 | for ax in self.axs:
595 | ax.set_prop_cycle(color=self.colors)
596 |
597 | def get_final_data(self):
598 | # create list of data for each call to plotting method
599 | final_data = defaultdict(list)
600 | for (labels, data), ax in zip(self.data_for_plots, self.axs):
601 | row_label, col_label = self.get_labels(labels)
602 | if self.split:
603 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'):
604 | final_data[ax].extend(self.get_final_groups(data_grp, grp, row_label, col_label))
605 | else:
606 | final_data[ax].extend(self.get_final_groups(data, None, row_label, col_label))
607 | return final_data
608 |
609 | def style_fig(self):
610 | for ax in self.axs:
611 | ax.tick_params(length=0)
612 | ax.set_facecolor('.9')
613 | ax.grid(True)
614 | ax.set_axisbelow(True)
615 | for spine in ax.spines.values():
616 | spine.set_visible(False)
617 |
618 | def add_x_y_labels(self):
619 | if self.plot_type == 'single':
620 | self.axs[0].set_xlabel(self.x)
621 | self.axs[0].set_ylabel(self.y)
622 | return
623 |
624 | # need to eliminate next line to save lots of time
625 | self.fig.canvas.print_figure(io.BytesIO())
626 | rows, cols = self.fig_shape
627 | top_left_ax, bottom_right_ax = self.axs[0], self.axs[rows * cols - 1]
628 | top_left_points = top_left_ax.get_position().get_points()
629 | bottom_right_points = bottom_right_ax.get_position().get_points()
630 |
631 | left = top_left_points[0][0]
632 | right = bottom_right_points[1][0]
633 | x = (right + left) / 2
634 |
635 | top = top_left_points[1][1]
636 | bottom = bottom_right_points[0][1]
637 | y = (top + bottom) / 2
638 | self.fig.text(0, y, self.y, rotation=90, ha='center', va='center', size='larger')
639 | self.fig.text(x, 0, self.x, ha='center', va='center', size='larger')
640 |
641 | def add_ax_titles(self):
642 | for ax, info in self.final_data.items():
643 | row_label, col_label = info[0][-2:]
644 | if row_label is not None:
645 | row_label = str(row_label)
646 | if col_label is not None:
647 | col_label = str(col_label)
648 | row_label = row_label or ''
649 | col_label = col_label or ''
650 | if row_label and col_label:
651 | title = row_label + ' - ' + col_label
652 | else:
653 | title = row_label or col_label
654 | title = textwrap.fill(str(title), 30)
655 | ax.set_title(title)
656 |
657 | def set_rcParams(self):
658 | plt.rcParams['font.size'] = 6
659 | plt.rcParams['font.family'] = 'Helvetica'
660 |
661 | def add_ticklabels(self, labels, ax, delta=0):
662 | ticks = np.arange(len(labels))
663 | ha, va = 'center', 'center'
664 | if self.orientation == 'v':
665 | if self.x_textwrap:
666 | labels = [textwrap.fill(str(label), self.x_textwrap) for label in labels]
667 | ax.set_xticks(ticks + delta)
668 | if self.x_rot is not None:
669 | if 0 <= self.x_rot <= 180:
670 | ha = 'right'
671 | else:
672 | ha = 'left'
673 | ax.set_xticklabels(labels, rotation=self.x_rot, ha=ha)
674 | else:
675 | if self.y_textwrap:
676 | labels = [textwrap.fill(str(label), self.y_textwrap) for label in labels]
677 | ax.set_yticks(ticks - delta)
678 | if self.y_rot is not None:
679 | if 0 <= self.y_rot <= 180:
680 | va = 'top'
681 | else:
682 | va = 'bottom'
683 | ax.set_yticklabels(labels, rotation=self.y_rot, va=va)
684 |
685 | def add_legend(self, label=None, handles=None, labels=None):
686 | if label is not None:
687 | if handles is None:
688 | handles, labels = self.axs[0].get_legend_handles_labels()
689 | ncol = len(labels) // 8 + 1
690 | self.fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.01, .8),
691 | title=self.split, ncol=ncol)
692 |
693 | def clean_up(self):
694 | self.add_x_y_labels()
695 | plt.rcParams = self.original_rcParams
696 | return self.fig
697 |
698 | def update_fig_size(self, n_splits, n_groups_per_split):
699 | if self.user_figsize:
700 | return
701 | c1 = .3 if self.orientation == 'v' else .2
702 | c2 = .06 if self.orientation == 'v' else .04
703 | new_size = 1.8 + (c1 + c2 * n_splits) * n_groups_per_split
704 | if self.orientation == 'v':
705 | height = max(2.5 - .3 * self.fig_shape[0], 1.2)
706 | shrink = max(.9 - .1 * self.fig_shape[1], .5)
707 | width = new_size * shrink * self.fig_shape[1]
708 | height = height * self.fig_shape[0]
709 | else:
710 | width = max(3 - .3 * self.fig_shape[1], 1.5)
711 | height = new_size * .8 * self.fig_shape[0]
712 | width = width * self.fig_shape[1]
713 | width, height = min(width, 25), min(height, 25)
714 | self.fig.set_size_inches(width, height)
715 |
716 | def add_fig_title(self):
717 | self.fig.suptitle(self.title, y=1.02)
718 |
719 |
720 | import plotly.graph_objects as go
721 | from plotly.subplots import make_subplots
722 |
723 |
724 | class PlotlyCommon(CommonPlot):
725 |
726 | def __init__(self, x, y, data, aggfunc, split, row, col,
727 | x_order, y_order, split_order, row_order, col_order,
728 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
729 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
730 | x_textwrap, y_textwrap, x_rot, y_rot,
731 | check_numeric=False, kind=None):
732 | super().__init__(x, y, data, aggfunc, split, row, col,
733 | x_order, y_order, split_order, row_order, col_order,
734 | orientation, sort_values, wrap, figsize, title, sharex, sharey,
735 | xlabel, ylabel, xlim, ylim, xscale, yscale, cmap,
736 | x_textwrap, y_textwrap, x_rot, y_rot,
737 |                          check_numeric=check_numeric, kind=kind)
738 |
739 | self.data_for_plots = self.get_data_for_every_plot()
740 | self.final_data = self.get_final_data()
741 | self.fig = self.create_figure()
742 |
743 | def create_figure(self):
744 | titles = self.get_subplot_titles()
745 | fig = make_subplots(rows=self.fig_shape[0], cols=self.fig_shape[1], subplot_titles=titles,
746 | shared_xaxes=self.sharex, shared_yaxes=self.sharey,
747 | horizontal_spacing=.03)
748 | fig.update_layout(title_text=self.title, legend_title_text=self.split)
749 | return fig
750 |
751 | def get_final_data(self):
752 | # create list of data for each call to plotting method
753 | final_data = defaultdict(list)
754 | locs = []
755 | for i in range(self.fig_shape[0]):
756 | for j in range(self.fig_shape[1]):
757 | locs.append((i + 1, j + 1))
758 |
759 | for (labels, data), loc in zip(self.data_for_plots, locs):
760 | row_label, col_label = self.get_labels(labels)
761 | if self.split:
762 | for grp, data_grp in self.get_ordered_groups(data, self.split_order, 'split'):
763 | final_data[loc].extend(self.get_final_groups(data_grp, grp, row_label, col_label))
764 | else:
765 | final_data[loc].extend(self.get_final_groups(data, None, row_label, col_label))
766 | return final_data
767 |
768 | def get_subplot_titles(self):
769 | titles = []
770 | for (i, j), info in self.final_data.items():
771 | row_label, col_label = info[0][-2:]
772 | if row_label is not None:
773 | row_label = str(row_label)
774 | if col_label is not None:
775 | col_label = str(col_label)
776 | row_label = row_label or ''
777 | col_label = col_label or ''
778 | if row_label and col_label:
779 | title = row_label + ' - ' + col_label
780 | else:
781 | title = row_label or col_label
782 | title = textwrap.fill(str(title), 30)
783 | titles.append(title)
784 | return titles
785 |
786 |
787 | class CountCommon(CommonPlot):
788 |
789 | def get_count_dict(self, normalize):
790 |         count_dict = {}
791 |         val = self.x or self.y  # the counted column (the `val` argument of dxp.count)
792 |         if isinstance(normalize, str):
793 | if normalize in (val, self.split, self.row, self.col):
794 | normalize = [normalize]
795 |
796 | if isinstance(normalize, tuple):
797 | normalize = list(normalize)
798 | elif hasattr(normalize, 'tolist'):
799 | normalize = normalize.tolist()
800 | elif not isinstance(normalize, (bool, list)):
801 | raise ValueError('`normalize` must either be `True`/`False`, one of the columns passed '
802 | 'to `val`, `split`, `row` or `col`, or a list of '
803 | 'those columns')
804 | normalize_kind = None
805 | if isinstance(normalize, list):
806 | row_col = []
807 | val_split = []
808 | for col in normalize:
809 | if col in (self.row, self.col):
810 | row_col.append(col)
811 | elif col in (val, self.split):
812 | val_split.append(col)
813 | else:
814 | raise ValueError('Columns passed to `normalize` must be the same as '
815 | ' `val`, `split`, `row` or `col`.')
816 |
817 | if row_col:
818 | all_counts = {}
819 | for grp, data in self.data.groupby(row_col):
820 | if len(row_col) == 1:
821 | grp = str(grp)
822 | else:
823 | grp = tuple(str(g) for g in grp)
824 |
825 | if val_split:
826 | normalize_kind = 'all'
827 | all_counts[grp] = data.groupby(val_split).size()
828 | else:
829 | normalize_kind = 'grid'
830 | all_counts[grp] = len(data)
831 | else:
832 | normalize_kind = 'single'
833 | all_counts = self.data.groupby(val_split).size()
834 |
835 | n = 0
836 | for key, info in self.final_data.items():
837 | columns = []
838 | vcs = []
839 | for vals, split_label, col_name, row_label, col_label in info:
840 | vcs.append(vals.value_counts())
841 | columns.append(split_label)
842 |
843 | df = pd.concat(vcs, axis=1)
844 | df.columns = columns
845 | df.index.name = vals.name
846 | if normalize_kind == 'single':
847 | if len(val_split) == 2:
848 | df = df / all_counts.unstack(self.split)
849 | elif df.index.name == all_counts.index.name:
850 | df = df.div(all_counts, axis=0)
851 | else:
852 | df = df / all_counts
853 | elif normalize_kind in ('grid', 'all'):
854 | grp = []
855 | for col in normalize:
856 | if col == self.row:
857 | grp.append(row_label)
858 | if col == self.col:
859 | grp.append(col_label)
860 |
861 | if len(grp) == 1:
862 | grp = grp[0]
863 | else:
864 | grp = tuple(grp)
865 | grp_val = all_counts[grp]
866 |
867 | if normalize_kind == 'grid':
868 | df = df / grp_val
869 | elif len(val_split) == 2:
870 | df = df / grp_val.unstack(self.split)
871 | elif df.index.name == grp_val.index.name:
872 | df = df.div(grp_val, axis=0)
873 | else:
874 | df = df / grp_val
875 |
876 | else:
877 | n += df.sum().sum()
878 | count_dict[key] = df
879 |
880 | if normalize is True:
881 | count_dict = {key: df / n for key, df in count_dict.items()}
882 |
883 | return count_dict
884 |
885 |
886 | class MPLCount(CountCommon, MPLCommon):
887 | pass
888 |
889 |
890 | class PlotlyCount(CountCommon, PlotlyCommon):
891 | pass
892 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dexplot
2 |
3 | [](https://pypi.org/project/dexplot)
4 | [](LICENSE)
5 |
6 | Dexplot is a Python library for delivering beautiful data visualizations with a simple and intuitive user experience.
7 |
8 | ## Goals
9 |
10 | The primary goals for dexplot are:
11 |
12 | * Maintain a very consistent API with as few functions as necessary to make the desired statistical plots
13 | * Allow the user tremendous power without using matplotlib
14 |
15 |
16 | ## Installation
17 |
18 | `pip install dexplot`
19 |
20 | ## Built for long and wide data
21 |
22 | Dexplot is primarily built for long data, which is a form of data where each row represents a single observation and each column represents a distinct quantity. It is often referred to as "tidy" data. Here, we have some long data.
23 |
24 | 
25 |
26 | Dexplot also has the ability to handle wide data, where multiple columns may contain values that represent the same kind of quantity. The same data above has been aggregated to show the mean for each combination of neighborhood and property type. It is now wide data as each column contains the same quantity (price).
27 |
28 | 
29 |
30 | ## Usage
31 |
32 | Dexplot provides a small number of powerful functions that all work similarly. Most plotting functions have the following signature:
33 |
34 | ```python
35 | dxp.plotting_func(x, y, data, aggfunc, split, row, col, orientation, ...)
36 | ```
37 |
38 | * `x` - Column name along the x-axis
39 | * `y` - Column name along the y-axis
40 | * `data` - Pandas DataFrame
41 | * `aggfunc` - String name of a pandas aggregation function, e.g. 'min', 'max', 'mean'
42 | * `split` - Column name to split data into distinct groups
43 | * `row` - Column name to split data into distinct subplots row-wise
44 | * `col` - Column name to split data into distinct subplots column-wise
45 | * `orientation` - Either vertical (`'v'`) or horizontal (`'h'`). Default for most plots is vertical.
46 |
47 | When `aggfunc` is provided, `x` will be the grouping variable and `y` will be aggregated when vertical and vice-versa when horizontal. The best way to learn how to use dexplot is with the examples below.
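48 | For example, with the airbnb dataset loaded in the examples below, the same aggregation can be drawn either way (both calls appear again later in this README):
49 |
50 | ```python
51 | # vertical: neighborhood along the x-axis, median price along the y-axis
52 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
53 | # horizontal: neighborhood along the y-axis, median price along the x-axis
54 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
55 | ```
56 |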
48 |
49 | ## Families of plots
50 |
51 | There are two primary families of plots, **aggregation** and **distribution**. Aggregation plots take a sequence of values and return a **single** value using the function provided to `aggfunc` to do so. Distribution plots take a sequence of values and depict the shape of the distribution in some manner.
52 |
53 | * Aggregation
54 | * bar
55 | * line
56 | * scatter
57 | * count
58 | * Distribution
59 | * box
60 | * violin
61 | * hist
62 | * kde
63 |
64 | ## Comparison with Seaborn
65 |
66 | If you have used the seaborn library, then you should notice a lot of similarities. Much of dexplot was inspired by seaborn. Below is a list of the extra features in dexplot not found in seaborn:
67 |
68 | * Ability to graph relative frequency and normalize over any number of variables
69 | * No need for multiple functions to do the same thing (far fewer public functions)
70 | * Ability to make grids with a single function instead of having to use a higher level function like `catplot`
71 | * Pandas `groupby` methods available as strings
72 | * Ability to sort by values
73 | * Ability to sort x/y labels lexicographically
74 | * Ability to select most/least frequent groups
75 | * x/y labels are wrapped so that they don't overlap
76 | * Figure size (plus several other options) and available to change without using matplotlib
77 | * A matplotlib figure object is returned
78 |
79 | ## Examples
80 |
81 | Most of the examples below use long data.
82 |
83 | ## Aggregating plots - bar, line and scatter
84 |
85 | We'll begin by covering the plots that **aggregate**. An aggregation is defined as a function that summarizes a sequence of numbers with a single value. The examples come from the Airbnb dataset, which contains many property rental listings from the Washington D.C. area.
86 |
87 |
88 | ```python
89 | import dexplot as dxp
90 | import pandas as pd
91 | airbnb = dxp.load_dataset('airbnb')
92 | airbnb.head()
93 | ```
94 |
95 |
96 |
97 | | | neighborhood | property_type | accommodates | bathrooms | bedrooms | price | cleaning_fee | rating | superhost | response_time | latitude | longitude |
98 | |--:|:-------------|:--------------|-------------:|----------:|---------:|------:|-------------:|-------:|:----------|:--------------|---------:|----------:|
99 | | 0 | Shaw | Townhouse | 16 | 3.5 | 4 | 433 | 250 | 95.0 | No | within an hour | 38.90982 | -77.02016 |
100 | | 1 | Brightwood Park | Townhouse | 4 | 3.5 | 4 | 154 | 50 | 97.0 | No | NaN | 38.95888 | -77.02554 |
101 | | 2 | Capitol Hill | House | 2 | 1.5 | 1 | 83 | 35 | 97.0 | Yes | within an hour | 38.88791 | -76.99668 |
102 | | 3 | Shaw | House | 2 | 2.5 | 1 | 475 | 0 | 98.0 | No | NaN | 38.91331 | -77.02436 |
103 | | 4 | Kalorama Heights | Apartment | 3 | 1.0 | 1 | 118 | 15 | 91.0 | No | within an hour | 38.91933 | -77.04124 |
193 |
194 |
195 |
196 | There are more than 4,000 listings in our dataset. We will use bar charts to aggregate the data.
197 |
198 |
199 | ```python
200 | airbnb.shape
201 | ```
202 |
203 |
204 |
205 |
206 | (4581, 12)
207 |
208 |
209 |
210 | ### Vertical bar charts
211 |
212 | In order to perform an aggregation, you must supply a value for `aggfunc`. Here, we find the median price per neighborhood. Notice that the tick labels automatically wrap.
213 |
214 |
215 | ```python
216 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median')
217 | ```
218 |
219 |
220 |
221 |
222 | 
223 |
224 |
225 |
226 | Line and scatter plots can be created with the same command, just substituting the name of the function. Neither is a good choice for this visualization, since the grouping variable (neighborhood) has no meaningful order.
227 |
228 |
229 | ```python
230 | dxp.line(x='neighborhood', y='price', data=airbnb, aggfunc='median')
231 | ```
232 |
233 |
234 |
235 |
236 | 
237 |
238 |
239 |
240 |
241 | ```python
242 | dxp.scatter(x='neighborhood', y='price', data=airbnb, aggfunc='median')
243 | ```
244 |
245 |
246 |
247 |
248 | 
249 |
250 |
251 |
252 | ### Components of the groupby aggregation
253 |
254 | Anytime the `aggfunc` parameter is set, you have performed a groupby aggregation, which always consists of three components:
255 |
256 | * Grouping column - unique values of this column form independent groups (neighborhood)
257 | * Aggregating column - the column that will get summarized with a single value (price)
258 | * Aggregating function - a function that returns a single value (median)
259 |
260 | The general format for doing this in pandas is:
261 |
262 | ```python
263 | df.groupby('grouping column').agg({'aggregating column': 'aggregating function'})
264 | ```
265 |
266 | Specifically, the following code is executed within dexplot.
267 |
268 |
269 | ```python
270 | airbnb.groupby('neighborhood').agg({'price': 'median'})
271 | ```
272 |
273 |
274 |
275 | | neighborhood | price |
276 | |:-------------|------:|
277 | | Brightwood Park | 87.0 |
278 | | Capitol Hill | 129.5 |
279 | | Columbia Heights | 95.0 |
280 | | Dupont Circle | 125.0 |
281 | | Edgewood | 100.0 |
282 | | Kalorama Heights | 118.0 |
283 | | Shaw | 133.5 |
284 | | Union Station | 120.0 |
324 |
325 |
326 |
327 | ### Number and percent of missing values with `'countna'` and `'percna'`
328 |
329 | In addition to all the common aggregating functions, you can use the strings `'countna'` and `'percna'` to get the number and percentage of missing values per group.
330 |
331 |
332 | ```python
333 | dxp.bar(x='neighborhood', y='response_time', data=airbnb, aggfunc='countna')
334 | ```
335 |
336 |
337 |
338 |
339 | 
340 |
341 |
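342 | Internally, `'countna'` simply maps to a function that counts missing values per group (see `get_aggfunc` in `_common_plot.py`). A rough pandas equivalent of the plot above:
343 |
344 | ```python
345 | airbnb.groupby('neighborhood')['response_time'].agg(lambda s: s.isna().sum())
346 | ```
347 |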
342 |
343 | ### Sorting the bars by values
344 |
345 | By default, the bars will be sorted by the grouping column (x-axis here) in alphabetical order. Use the `sort_values` parameter to sort the bars by value.
346 |
347 | * None - sort x/y axis labels alphabetically (default)
348 | * `asc` - sort values from least to greatest
349 | * `desc` - sort values from greatest to least
350 |
351 |
352 | ```python
353 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='asc')
354 | ```
355 |
356 |
357 |
358 |
359 | 
360 |
361 |
362 |
363 | Here, we sort the values from greatest to least.
364 |
365 |
366 | ```python
367 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', sort_values='desc')
368 | ```
369 |
370 |
371 |
372 |
373 | 
374 |
375 |
376 |
377 | ### Specify order with `x_order`
378 |
379 | Specify a specific order of the labels on the x-axis by passing a list of values to `x_order`. This can also act as a filter to limit the number of bars.
380 |
381 |
382 | ```python
383 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
384 | x_order=['Dupont Circle', 'Edgewood', 'Union Station'])
385 | ```
386 |
387 |
388 |
389 |
390 | 
391 |
392 |
393 |
394 | `x_order` and all of the other `_order` parameters default to `'asc'`, which orders the values alphabetically. Use the string `'desc'` to sort in the opposite direction.
395 |
396 |
397 | ```python
398 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', x_order='desc')
399 | ```
400 |
401 |
402 |
403 |
404 | 
405 |
406 |
407 |
408 | ### Filter for the neighborhoods with most/least frequency of occurrence
409 |
410 | You can use `x_order` again to filter for the x-values that appear the most/least often by setting it to the string `'top n'` or `'bottom n'` where `n` is an integer. Here, we filter for the top 4 most frequently occurring neighborhoods. This option is useful when there are dozens of unique values in the grouping column.
411 |
412 |
413 | ```python
414 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
415 | x_order='top 4')
416 | ```
417 |
418 |
419 |
420 |
421 | 
422 |
423 |
424 |
425 | We can verify that these four neighborhoods are the most common.
426 |
427 |
428 | ```python
429 | airbnb['neighborhood'].value_counts()
430 | ```
431 |
432 |
433 |
434 |
435 | Columbia Heights 773
436 | Union Station 713
437 | Capitol Hill 654
438 | Edgewood 610
439 | Dupont Circle 549
440 | Shaw 514
441 | Brightwood Park 406
442 | Kalorama Heights 362
443 | Name: neighborhood, dtype: int64
444 |
445 |
446 |
447 | ### Horizontal bars
448 |
449 | Set `orientation` to `'h'` for horizontal bars. When you do this, you'll need to switch `x` and `y` since the grouping column (neighborhood) will be along the y-axis and the aggregating column (price) will be along the x-axis.
450 |
451 |
452 | ```python
453 | dxp.bar(x='price', y='neighborhood', data=airbnb, aggfunc='median',
454 | orientation='h', sort_values='desc')
455 | ```
456 |
457 |
458 |
459 |
460 | 
461 |
462 |
463 |
464 | Switching orientation is possible for most other plots.
465 |
466 |
467 | ```python
468 | dxp.line(x='price', y='neighborhood', data=airbnb, aggfunc='median', orientation='h')
469 | ```
470 |
471 |
472 |
473 |
474 | 
475 |
476 |
477 |
478 | ### Split bars into groups
479 |
480 | You can split each bar into further groups by setting the `split` parameter to another column.
481 |
482 |
483 | ```python
484 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost')
485 | ```
486 |
487 |
488 |
489 |
490 | 
491 |
492 |
493 |
494 | We can use the `pivot_table` method to verify the results in pandas.
495 |
496 |
497 | ```python
498 | airbnb.pivot_table(index='superhost', columns='neighborhood',
499 | values='price', aggfunc='median')
500 | ```
501 |
502 |
503 |
504 | | superhost | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
505 | |:----------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
506 | | No | 85.0 | 129.0 | 90.5 | 120.0 | 100.0 | 110.0 | 130.0 | 120.0 |
507 | | Yes | 90.0 | 130.0 | 103.0 | 135.0 | 100.0 | 124.0 | 135.0 | 125.0 |
558 |
559 |
560 | Set the order of the unique split values with `split_order`, which can also act as a filter.
561 |
562 |
563 | ```python
564 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
565 | split='superhost', split_order=['Yes', 'No'])
566 | ```
567 |
568 |
569 |
570 |
571 | 
572 |
573 |
574 |
575 | Like all the `_order` parameters, `split_order` defaults to `'asc'` (alphabetical) order. Set it to `'desc'` for the opposite.
576 |
577 |
578 | ```python
579 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
580 | split='property_type', split_order='desc')
581 | ```
582 |
583 |
584 |
585 |
586 | 
587 |
588 |
589 |
590 | Filtering for the most/least frequent split categories is possible.
591 |
592 |
593 | ```python
594 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
595 | split='property_type', split_order='bottom 2')
596 | ```
597 |
598 |
599 |
600 |
601 | 
602 |
603 |
604 |
605 | Verifying that the least frequent property types are Townhouse and Condominium.
606 |
607 |
608 | ```python
609 | airbnb['property_type'].value_counts()
610 | ```
611 |
612 |
613 |
614 |
615 | Apartment 2403
616 | House 877
617 | Townhouse 824
618 | Condominium 477
619 | Name: property_type, dtype: int64
620 |
621 |
622 |
623 | ### Stacked bar charts
624 |
625 | Stack all the split groups one on top of the other by setting `stacked` to `True`.
626 |
627 |
628 | ```python
629 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
630 | split='superhost', split_order=['Yes', 'No'], stacked=True)
631 | ```
632 |
633 |
634 |
635 |
636 | 
637 |
638 |
639 |
640 | ### Split into multiple plots
641 |
642 | It's possible to split the data further into separate plots by the unique values in a different column with the `row` and `col` parameters. Here, each kind of `property_type` has its own plot.
643 |
644 |
645 | ```python
646 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
647 | split='superhost', col='property_type')
648 | ```
649 |
650 |
651 |
652 |
653 | 
654 |
655 |
656 |
657 | If there isn't room for all of the plots, set the `wrap` parameter to an integer to set the maximum number of plots per row/col. We also specify the `col_order` to be descending alphabetically.
658 |
659 |
660 | ```python
661 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
662 | split='superhost', col='property_type', wrap=2, col_order='desc')
663 | ```
664 |
665 |
666 |
667 |
668 | 
669 |
670 |
671 |
672 | Use `col_order` to both filter and set a specific order for the plots.
673 |
674 |
675 | ```python
676 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median',
677 | split='superhost', col='property_type', col_order=['House', 'Condominium'])
678 | ```
679 |
680 |
681 |
682 |
683 | 
684 |
685 |
686 |
687 | Splits can be made simultaneously along rows and columns.
688 |
689 |
690 | ```python
691 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
692 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
693 | row='bedrooms', row_order=[1, 2, 3])
694 | ```
695 |
696 |
697 |
698 |
699 | 
700 |
701 |
702 |
703 | By default, all axis limits are shared. Allow each plot to set its own limits by setting `sharex` and `sharey` to `False`.
704 |
705 |
706 | ```python
707 | dxp.bar(x='neighborhood', y='price', data=airbnb, aggfunc='median', split='superhost',
708 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
709 | row='bedrooms', row_order=[1, 2, 3], sharey=False)
710 | ```
711 |
712 |
713 |
714 |
715 | 
716 |
717 |
718 |
719 | ### Set the width of each bar with `size`
720 |
721 | The width (height when horizontal) of the bars is set with the `size` parameter. By default, this value is .9. Think of this number as the relative width of all the bars for a particular x/y value, where 1 is the distance between each x/y value.
722 |
723 |
724 | ```python
725 | dxp.bar(x='neighborhood', y='price', data=airbnb,
726 | aggfunc='median', split='property_type',
727 | split_order=['Apartment', 'House'],
728 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'], size=.5)
729 | ```
730 |
731 |
732 |
733 |
734 | 
735 |
736 |
737 |
738 | ### Splitting line plots
739 |
740 | All the other aggregating plots work similarly.
741 |
742 |
743 | ```python
744 | dxp.line(x='neighborhood', y='price', data=airbnb,
745 | aggfunc='median', split='property_type',
746 | split_order=['Apartment', 'House'],
747 | x_order=['Dupont Circle', 'Capitol Hill', 'Union Station'])
748 | ```
749 |
750 |
751 |
752 |
753 | 
754 |
755 |
756 |
757 | ## Distribution plots - box, violin, histogram, kde
758 |
759 | Distribution plots work similarly, but do not have an `aggfunc` since they do not aggregate. They take their group of values and draw some kind of shape that gives information on how that variable is distributed.
760 |
761 | ### Box plots
762 |
763 | Box plots have colored boxes with ends at the first and third quartiles and a line at the median. The whiskers extend to the furthest points within 1.5 times the interquartile range (IQR, the difference between the third and first quartiles) beyond the box. Fliers are points outside this range and are plotted individually. By default, both box and violin plots are plotted horizontally.
764 |
765 |
766 | ```python
767 | dxp.box(x='price', y='neighborhood', data=airbnb)
768 | ```
769 |
770 |
771 |
772 |
773 | 
774 |
775 |
776 |
777 | Split the groups in the same manner as with the aggregation plots.
778 |
779 |
780 | ```python
781 | dxp.box(x='price', y='neighborhood', data=airbnb,
782 | split='superhost', split_order=['Yes', 'No'])
783 | ```
784 |
785 |
786 |
787 |
788 | 
789 |
790 |
791 |
792 | Order the appearance of the splits alphabetically (in descending order here).
793 |
794 |
795 | ```python
796 | dxp.box(x='price', y='neighborhood', data=airbnb,
797 | split='property_type', split_order='desc')
798 | ```
799 |
800 |
801 |
802 |
803 | 
804 |
805 |
806 |
807 | ### Filter range of values with `x_order`
808 |
809 | It's possible to filter the range of possible values by passing in a list of the minimum and maximum to `x_order`.
810 |
811 |
812 | ```python
813 | dxp.box(x='price', y='neighborhood', data=airbnb,
814 | split='superhost', x_order=[50, 250])
815 | ```
816 |
817 |
818 |
819 |
820 | 
821 |
822 |
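823 | Under the hood this is a simple range filter on that column (a two-item list becomes a `between` filter in `filter_data`); a sketch of the equivalent pandas filtering:
824 |
825 | ```python
826 | airbnb[airbnb['price'].between(50, 250)]
827 | ```
828 |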
823 |
824 | Change the `x` and `y` while setting `orientation` to make vertical box plots.
825 |
826 |
827 | ```python
828 | dxp.box(x='neighborhood', y='price', data=airbnb, orientation='v',
829 | split='property_type', split_order='top 2')
830 | ```
831 |
832 |
833 |
834 |
835 | 
836 |
837 |
838 |
839 | Violin plots work identically to box plots, but show "violins": kernel density estimates mirrored on either side of a center line.
840 |
841 |
842 | ```python
843 | dxp.violin(x='price', y='neighborhood', data=airbnb,
844 | split='superhost', split_order=['Yes', 'No'])
845 | ```
846 |
847 |
848 |
849 |
850 | 
851 |
852 |
853 |
854 | Splitting by rows and columns is possible as well with distribution plots.
855 |
856 |
857 | ```python
858 | dxp.box(x='price', y='neighborhood', data=airbnb, split='superhost',
859 | col='property_type', col_order=['House', 'Condominium', 'Apartment'],
860 | row='bedrooms', row_order=[1, 2])
861 | ```
862 |
863 |
864 |
865 |
866 | 
867 |
868 |
869 |
870 | ### Histograms
871 |
872 | Histograms work in a slightly different manner. Instead of passing both `x` and `y`, you pass a single numeric column to `val`. A vertical histogram of the counts with 20 bins is created by default.
873 |
874 |
875 | ```python
876 | dxp.hist(val='price', data=airbnb)
877 | ```
878 |
879 |
880 |
881 |
882 | 
883 |
884 |
885 |
886 | We can use `split` just like we did above and also create horizontal histograms.
887 |
888 |
889 | ```python
890 | dxp.hist(val='price', data=airbnb, orientation='h', split='superhost', bins=15)
891 | ```
892 |
893 |
894 |
895 |
896 | 
897 |
898 |
899 |
900 | Here, we customize the histogram to plot the cumulative density instead of the raw frequency count, drawing only the outline of the bars with `histtype='step'`.
901 |
902 |
903 | ```python
904 | dxp.hist(val='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3],
905 | bins=30, density=True, histtype='step', cumulative=True)
906 | ```
907 |
908 |
909 |
910 |
911 | 
912 |
913 |
914 |
915 | ### KDE Plots
916 |
917 | Kernel density estimates provide an estimate of the probability distribution of a continuous variable. Here, we examine how price is distributed by number of bedrooms.
918 |
919 |
920 | ```python
921 | dxp.kde(x='price', data=airbnb, split='bedrooms', split_order=[1, 2, 3])
922 | ```
923 |
924 |
925 |
926 |
927 | 
928 |
929 |
930 |
931 | Graph the cumulative distribution instead, spread across multiple plots with `col` and `wrap`.
932 |
933 |
934 | ```python
935 | dxp.kde(x='price', data=airbnb, split='bedrooms',
936 | split_order=[1, 2, 3], cumulative=True, col='property_type', wrap=2)
937 | ```
938 |
939 |
940 |
941 |
942 | 
943 |
944 |
945 |
946 | ### Two-dimensional KDEs
947 |
948 | Provide two numeric columns to `x` and `y` to get a two-dimensional KDE.
949 |
950 |
951 | ```python
952 | dxp.kde(x='price', y='cleaning_fee', data=airbnb)
953 | ```
954 |
955 |
956 |
957 |
958 | 
959 |
960 |
961 |
962 | Create a grid of two-dimensional KDEs.
963 |
964 |
965 | ```python
966 | dxp.kde(x='price', y='cleaning_fee', data=airbnb, row='neighborhood', wrap=3)
967 | ```
968 |
969 |
970 |
971 |
972 | 
973 |
974 |
975 |
976 | ## Count plots
977 |
978 | The `count` function graphs the frequency of unique values as bars. By default, it plots the values in descending order.
979 |
980 |
981 | ```python
982 | dxp.count(val='neighborhood', data=airbnb)
983 | ```
984 |
985 |
986 |
987 |
988 | 
989 |
990 |
991 |
992 | In pandas, this is a straightforward call to the `value_counts` method.
993 |
994 |
995 | ```python
996 | airbnb['neighborhood'].value_counts()
997 | ```
998 |
999 |
1000 |
1001 |
1002 |     Columbia Heights    773
1003 |     Union Station       713
1004 |     Capitol Hill        654
1005 |     Edgewood            610
1006 |     Dupont Circle       549
1007 |     Shaw                514
1008 |     Brightwood Park     406
1009 |     Kalorama Heights    362
1010 |     Name: neighborhood, dtype: int64
1011 |
1012 |
1013 |
1014 | ### Relative frequency with `normalize`
1015 |
1016 | Instead of the raw counts, get the relative frequency by setting `normalize` to `True`.
1017 |
1018 |
1019 | ```python
1020 | dxp.count(val='neighborhood', data=airbnb, normalize=True)
1021 | ```
1022 |
1023 |
1024 |
1025 |
1026 | 
1027 |
1028 |
1029 |
1030 | Here, we split by property type.
1031 |
1032 |
1033 | ```python
1034 | dxp.count(val='neighborhood', data=airbnb, split='property_type')
1035 | ```
1036 |
1037 |
1038 |
1039 |
1040 | 
1041 |
1042 |
1043 |
1044 | In pandas, this is done with the `crosstab` function.
1045 |
1046 |
1047 | ```python
1048 | pd.crosstab(index=airbnb['property_type'], columns=airbnb['neighborhood'])
1049 | ```
1050 |
1051 |
1052 |
1053 |
1054 |
1055 |
1056 |
1057 |
1058 | | property_type | Brightwood Park | Capitol Hill | Columbia Heights | Dupont Circle | Edgewood | Kalorama Heights | Shaw | Union Station |
1059 | |:--------------|----------------:|-------------:|-----------------:|--------------:|---------:|-----------------:|-----:|--------------:|
1060 | | Apartment     | 167             | 299          | 374              | 397           | 244      | 284              | 315  | 323           |
1061 | | Condominium   | 35              | 70           | 97               | 62            | 65       | 42               | 52   | 54            |
1062 | | House         | 131             | 137          | 157              | 47            | 146      | 23               | 61   | 175           |
1063 | | Townhouse     | 73              | 148          | 145              | 43            | 155      | 13               | 86   | 161           |
1124 |
1125 |
1126 |
1127 |
1128 |
1129 |
1130 |
1131 | Create horizontal stacked count plots, faceted into columns by `superhost`.
1132 |
1133 |
1134 | ```python
1135 | dxp.count(val='neighborhood', data=airbnb, split='property_type',
1136 | orientation='h', stacked=True, col='superhost')
1137 | ```
1138 |
1139 |
1140 |
1141 |
1142 | 
1143 |
1144 |
1145 |
1146 | ### Normalize over different variables
1147 |
1148 | Setting `normalize` to `True` returns the relative frequency with respect to all of the data. You can also normalize over any of the variables provided.
1149 |
1150 |
1151 | ```python
1152 | dxp.count(val='neighborhood', data=airbnb, split='property_type', normalize='neighborhood',
1153 | title='Relative Frequency by Neighborhood')
1154 | ```
1155 |
1156 |
1157 |
1158 |
1159 | 
1160 |
1161 |
1162 |
1163 | Normalize over several variables at once with a list.
1164 |
1165 |
1166 | ```python
1167 | dxp.count(val='neighborhood', data=airbnb, split='superhost',
1168 | row='property_type', col='bedrooms', col_order=[1, 2],
1169 | normalize=['neighborhood', 'property_type', 'bedrooms'], stacked=True)
1170 | ```
1171 |
1172 |
1173 |
1174 |
1175 | 
1176 |
1177 |
1178 |
1179 | ## Wide data
1180 |
1181 | Dexplot can also plot wide data, or data where no aggregation happens. Here is a scatter plot of the location of each listing.
1182 |
1183 |
1184 | ```python
1185 | dxp.scatter(x='longitude', y='latitude', data=airbnb,
1186 | split='neighborhood', col='bedrooms', col_order=[2, 3])
1187 | ```
1188 |
1189 |
1190 |
1191 |
1192 | 
1193 |
1194 |
1195 |
1196 | If you've already aggregated your data, you can plot it directly without specifying `x` or `y`.
1197 |
1198 |
1199 | ```python
1200 | df = airbnb.pivot_table(index='neighborhood', columns='property_type',
1201 | values='price', aggfunc='mean')
1202 | df
1203 | ```
1204 |
1205 |
1206 |
1207 |
1208 |
1209 |
1210 |
1211 |
1212 |
1213 | | neighborhood     | Apartment  | Condominium | House      | Townhouse  |
1214 | |:-----------------|-----------:|------------:|-----------:|-----------:|
1215 | | Brightwood Park  | 96.119760  | 105.000000  | 121.671756 | 133.479452 |
1216 | | Capitol Hill     | 141.210702 | 104.200000  | 170.153285 | 184.459459 |
1217 | | Columbia Heights | 114.676471 | 126.773196  | 135.292994 | 124.358621 |
1218 | | Dupont Circle    | 146.858942 | 130.709677  | 179.574468 | 139.348837 |
1219 | | Edgewood         | 108.508197 | 112.846154  | 156.335616 | 147.503226 |
1220 | | Kalorama Heights | 122.542254 | 155.928571  | 92.695652  | 158.230769 |
1221 | | Shaw             | 153.888889 | 158.500000  | 202.114754 | 173.279070 |
1222 | | Union Station    | 128.458204 | 133.833333  | 162.748571 | 162.167702 |
1283 |
1284 |
1285 |
1286 |
1287 |
1288 |
1289 |
1290 |
1291 | ```python
1292 | dxp.bar(data=df, orientation='h')
1293 | ```
1294 |
1295 |
1296 |
1297 |
1298 | 
1299 |
1300 |
1301 |
1302 | ### Time series
1303 | 
1304 | Time series data is another form of wide data. Read it in with the dates as the index and pass the DataFrame directly to the plotting function, just as above.
1305 | ```python
1306 | stocks = pd.read_csv('../data/stocks10.csv', parse_dates=['date'], index_col='date')
1307 | stocks.head()
1308 | ```
1309 |
1310 |
1311 |
1312 |
1313 |
1314 |
1315 |
1316 |
1317 | | date       | MSFT  | AAPL | SLB   | AMZN  | TSLA | XOM   | WMT   | T     | FB  | V   |
1318 | |:-----------|------:|-----:|------:|------:|-----:|------:|------:|------:|----:|----:|
1319 | | 1999-10-25 | 29.84 | 2.32 | 17.02 | 82.75 | NaN  | 21.45 | 38.99 | 16.78 | NaN | NaN |
1320 | | 1999-10-26 | 29.82 | 2.34 | 16.65 | 81.25 | NaN  | 20.89 | 37.11 | 17.28 | NaN | NaN |
1321 | | 1999-10-27 | 29.33 | 2.38 | 16.52 | 75.94 | NaN  | 20.80 | 36.94 | 18.27 | NaN | NaN |
1322 | | 1999-10-28 | 29.01 | 2.43 | 16.59 | 71.00 | NaN  | 21.19 | 38.85 | 19.79 | NaN | NaN |
1323 | | 1999-10-29 | 29.88 | 2.50 | 17.21 | 70.62 | NaN  | 21.47 | 39.25 | 20.00 | NaN | NaN |
1408 |
1409 |
1410 |
1411 |
1412 |
1413 |
1414 |
1415 |
1416 | ```python
1417 | dxp.line(data=stocks.head(500))
1418 | ```
1419 |
1420 |
1421 |
1422 |
1423 | 
1424 |
1425 |
1426 |
--------------------------------------------------------------------------------