├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── environment.yml ├── requirements.txt └── source │ ├── api.rst │ ├── conf.py │ ├── explorer_api.rst │ ├── index.rst │ ├── room_occupancy_example.ipynb │ ├── sphinxext │ └── notebook_sphinxext.py │ ├── summarise_api.rst │ └── tutorial.rst ├── it ├── test_regression.py └── update_reference_reports.py ├── lens ├── __init__.py ├── bins.py ├── dask_graph.py ├── explorer.py ├── formatting.py ├── metrics.py ├── plotting.py ├── summarise.py ├── tdigest_utils.py ├── utils.py ├── version.py └── widget.py ├── pyproject.toml ├── readthedocs.yml ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── conftest.py ├── data │ └── test-artworks.csv ├── multivariate_kde.py ├── test_explorer.py ├── test_summarise.py └── test_summary_class.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled files 2 | *.py[co] 3 | *.a 4 | *.o 5 | *.so 6 | __pycache__ 7 | 8 | # Ignore .c files by default to avoid including generated code. If you want to 9 | # add a non-generated .c extension, use `git add -f filename.c`. 10 | *.c 11 | 12 | # Other generated files 13 | emr-bootstrap.sh 14 | MANIFEST 15 | */cython_version.py 16 | htmlcov 17 | .coverage 18 | .eggs 19 | .pytest_cache 20 | .mypy_cache/ 21 | 22 | # Sphinx 23 | _build 24 | 25 | # Packages/installer info 26 | *.egg 27 | .eggs 28 | *.egg-info 29 | dist 30 | build 31 | eggs 32 | parts 33 | bin 34 | var 35 | sdist 36 | develop-eggs 37 | .installed.cfg 38 | distribute-*.tar.gz 39 | .cache 40 | .tox 41 | .venv 42 | 43 | # Other 44 | .*.swp 45 | *~ 46 | 47 | # Mac OSX 48 | .DS_Store 49 | 50 | .project 51 | .pydevproject 52 | .settings 53 | docs/_generated/ 54 | docs/api/ 55 | 56 | .idea 57 | *.h5 58 | 59 | # Autogenerated file on build 60 | tests/test_results/ 61 | it/generated_reports/ 62 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: linux 3 | dist: xenial 4 | 5 | branches: 6 | only: 7 | - master 8 | 9 | env: 10 | global: 11 | # The following versions are the 'default' for tests, unless 12 | # overidden underneath. They are defined here in order to save having 13 | # to repeat them for all configurations. 
14 | - MPLBACKEND=Agg 15 | 16 | matrix: 17 | 18 | include: 19 | 20 | # Check for sphinx doc build warnings 21 | # - os: linux 22 | # env: CMD='sphinx-build' 23 | 24 | - python: 3.6 25 | env: TOXENV=py36 26 | - python: 3.7 27 | env: TOXENV=py37 28 | - python: 2.7 29 | env: TOXENV=py27 30 | 31 | - python: 2.7 32 | env: TOXENV=flake8 33 | - python: 3.7 34 | env: TOXENV=flake8 35 | - python: 3.7 36 | env: TOXENV=black 37 | 38 | install: 39 | - pip install tox 40 | 41 | script: 42 | - tox 43 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Contributors 2 | ============ 3 | 4 | The follow people have contributed directly to this project: 5 | 6 | - `Andrew Brookes `_ 7 | - `Andrew Crozier `_ 8 | - `Pascal Bugnion `_ 9 | - `Peter Foster `_ 10 | - `Scott Stevenson `_ 11 | - `Setrak Balian `_ 12 | - `Víctor Zabalza `_ 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.txt 3 | include *.yml 4 | include tox.ini 5 | recursive-include ci *.sh 6 | recursive-include docs *.ipynb 7 | recursive-include docs *.py 8 | recursive-include docs *.rst 9 | recursive-include docs *.txt 10 | recursive-include docs *.yml 11 | recursive-include docs Makefile 12 | recursive-include it *.py 13 | recursive-include tests *.csv 14 | recursive-include tests *.py 15 | recursive-include tests *.txt 16 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | lens 2 | ==== 3 | 4 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.2593336.svg 5 | :target: https://doi.org/10.5281/zenodo.2593336 6 | 7 | ``lens`` is a library for exploring data in Pandas DataFrames. It computes 8 | single column summary statistics and estimates the correlation between columns. 9 | We wrote ``lens`` when we realised that the initial steps of acquiring a new 10 | data set were almost formulaic: What data type is in this column? How many null 11 | values are there? Which columns are correlated? What's the distribution of this 12 | value? ``lens`` calculates all this for you. 13 | 14 | See the documentation_ for more details. 15 | 16 | .. _documentation: https://lens.readthedocs.io/en/latest 17 | 18 | Installation 19 | ------------ 20 | 21 | ``lens`` can be installed from PyPI with ``pip``: 22 | 23 | .. 
code-block:: bash 24 | 25 | pip install lens 26 | 27 | Testing 28 | ------- 29 | 30 | Tests can be run using `tox <https://tox.readthedocs.io>`_ (replace ``py37`` with 31 | the version of python you wish to use to run the tests): 32 | 33 | .. code-block:: bash 34 | 35 | pip install tox 36 | tox -e py37 37 | 38 | License 39 | ------- 40 | 41 | ``lens`` is licensed under the Apache License, see LICENSE.txt for details. 42 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | 27 | .PHONY: clean 28 | clean: 29 | rm -rf $(BUILDDIR)/* 30 | 31 | .PHONY: html 32 | html: 33 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 34 | @echo 35 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 36 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: lens-docs 2 | dependencies: 3 | - python=3.6 4 | - numpy 5 | - dask 6 | - ipywidgets 7 | - matplotlib 8 | - numpy>=1.11 9 | - numpydoc 10 | - pandas 11 | - plotly 12 | - scipy 13 | - pip: 14 | - tdigest 15 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx 3 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. toctree:: 5 | 6 | summarise_api 7 | 8 | explorer_api 9 | 10 | widget_api 11 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Lens documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Sep 27 10:49:51 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file.
11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.append(os.path.abspath('sphinxext')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.mathjax', 34 | 'sphinx.ext.viewcode', 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'notebook_sphinxext', 38 | 'numpydoc', 39 | ] 40 | 41 | numpydoc_show_class_members = False 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = u'Lens' 59 | copyright = u'2016-2019, Faculty Science Limited' 60 | author = u'Faculty' 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = u'0.0.1' 68 | # The full version, including alpha/beta/rc tags. 69 | release = u'0.0.1' 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 76 | language = None 77 | 78 | # There are two options for replacing |today|: either, you set today to some 79 | # non-false value, then it is used: 80 | #today = '' 81 | # Else, today_fmt is used as the format for a strftime call. 82 | #today_fmt = '%B %d, %Y' 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | # This patterns also effect to html_static_path and html_extra_path 87 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 88 | 89 | # The reST default role (used for this markup: `text`) to use for all 90 | # documents. 91 | #default_role = None 92 | 93 | # If true, '()' will be appended to :func: etc. cross-reference text. 94 | #add_function_parentheses = True 95 | 96 | # If true, the current module name will be prepended to all description 97 | # unit titles (such as .. function::). 98 | #add_module_names = True 99 | 100 | # If true, sectionauthor and moduleauthor directives will be shown in the 101 | # output. They are ignored by default. 102 | #show_authors = False 103 | 104 | # The name of the Pygments (syntax highlighting) style to use. 105 | pygments_style = 'sphinx' 106 | 107 | # A list of ignored prefixes for module index sorting. 
108 | #modindex_common_prefix = [] 109 | 110 | # If true, keep warnings as "system message" paragraphs in the built documents. 111 | #keep_warnings = False 112 | 113 | # If true, `todo` and `todoList` produce output, else they produce nothing. 114 | todo_include_todos = False 115 | 116 | 117 | # -- Options for HTML output ---------------------------------------------- 118 | 119 | # The theme to use for HTML and HTML Help pages. See the documentation for 120 | # a list of builtin themes. 121 | html_theme = 'alabaster' 122 | 123 | # Theme options are theme-specific and customize the look and feel of a theme 124 | # further. For a list of options available for each theme, see the 125 | # documentation. 126 | #html_theme_options = {} 127 | 128 | # Add any paths that contain custom themes here, relative to this directory. 129 | #html_theme_path = [] 130 | 131 | # The name for this set of Sphinx documents. 132 | # " v documentation" by default. 133 | #html_title = u'Lens v0.0.1' 134 | 135 | # A shorter title for the navigation bar. Default is the same as html_title. 136 | #html_short_title = None 137 | 138 | # The name of an image file (relative to this directory) to place at the top 139 | # of the sidebar. 140 | #html_logo = None 141 | 142 | # The name of an image file (relative to this directory) to use as a favicon of 143 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 144 | # pixels large. 145 | #html_favicon = None 146 | 147 | # Add any paths that contain custom static files (such as style sheets) here, 148 | # relative to this directory. They are copied after the builtin static files, 149 | # so a file named "default.css" will overwrite the builtin "default.css". 150 | html_static_path = ['_static'] 151 | 152 | # Add any extra paths that contain custom files (such as robots.txt or 153 | # .htaccess) here, relative to this directory. These files are copied 154 | # directly to the root of the documentation. 155 | #html_extra_path = [] 156 | 157 | # If not None, a 'Last updated on:' timestamp is inserted at every page 158 | # bottom, using the given strftime format. 159 | # The empty string is equivalent to '%b %d, %Y'. 160 | #html_last_updated_fmt = None 161 | 162 | # If true, SmartyPants will be used to convert quotes and dashes to 163 | # typographically correct entities. 164 | #html_use_smartypants = True 165 | 166 | # Custom sidebar templates, maps document names to template names. 167 | #html_sidebars = {} 168 | 169 | # Additional templates that should be rendered to pages, maps page names to 170 | # template names. 171 | #html_additional_pages = {} 172 | 173 | # If false, no module index is generated. 174 | #html_domain_indices = True 175 | 176 | # If false, no index is generated. 177 | #html_use_index = True 178 | 179 | # If true, the index is split into individual pages for each letter. 180 | #html_split_index = False 181 | 182 | # If true, links to the reST sources are added to the pages. 183 | #html_show_sourcelink = True 184 | 185 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 186 | #html_show_sphinx = True 187 | 188 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 189 | #html_show_copyright = True 190 | 191 | # If true, an OpenSearch description file will be output, and all pages will 192 | # contain a tag referring to it. The value of this option must be the 193 | # base URL from which the finished HTML is served. 
194 | #html_use_opensearch = '' 195 | 196 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 197 | #html_file_suffix = None 198 | 199 | # Language to be used for generating the HTML full-text search index. 200 | # Sphinx supports the following languages: 201 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 202 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 203 | #html_search_language = 'en' 204 | 205 | # A dictionary with options for the search language support, empty by default. 206 | # 'ja' uses this config value. 207 | # 'zh' user can custom change `jieba` dictionary path. 208 | #html_search_options = {'type': 'default'} 209 | 210 | # The name of a javascript file (relative to the configuration directory) that 211 | # implements a search results scorer. If empty, the default will be used. 212 | #html_search_scorer = 'scorer.js' 213 | 214 | # Output file base name for HTML help builder. 215 | htmlhelp_basename = 'Lensdoc' 216 | 217 | # -- Options for LaTeX output --------------------------------------------- 218 | 219 | latex_elements = { 220 | # The paper size ('letterpaper' or 'a4paper'). 221 | #'papersize': 'letterpaper', 222 | 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | 226 | # Additional stuff for the LaTeX preamble. 227 | #'preamble': '', 228 | 229 | # Latex figure (float) alignment 230 | #'figure_align': 'htbp', 231 | } 232 | 233 | # Grouping the document tree into LaTeX files. List of tuples 234 | # (source start file, target name, title, 235 | # author, documentclass [howto, manual, or own class]). 236 | latex_documents = [ 237 | (master_doc, 'Lens.tex', u'Lens Documentation', 238 | u'Faculty', 'manual'), 239 | ] 240 | 241 | # The name of an image file (relative to this directory) to place at the top of 242 | # the title page. 243 | #latex_logo = None 244 | 245 | # For "manual" documents, if this is true, then toplevel headings are parts, 246 | # not chapters. 247 | #latex_use_parts = False 248 | 249 | # If true, show page references after internal links. 250 | #latex_show_pagerefs = False 251 | 252 | # If true, show URL addresses after external links. 253 | #latex_show_urls = False 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #latex_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #latex_domain_indices = True 260 | 261 | 262 | # -- Options for manual page output --------------------------------------- 263 | 264 | # One entry per manual page. List of tuples 265 | # (source start file, name, description, authors, manual section). 266 | man_pages = [ 267 | (master_doc, 'lens', u'Lens Documentation', 268 | [author], 1) 269 | ] 270 | 271 | # If true, show URL addresses after external links. 272 | #man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | (master_doc, 'Lens', u'Lens Documentation', 282 | author, 'Lens', 'One line description of project.', 283 | 'Miscellaneous'), 284 | ] 285 | 286 | # Documents to append as an appendix to all manuals. 287 | #texinfo_appendices = [] 288 | 289 | # If false, no module index is generated. 290 | #texinfo_domain_indices = True 291 | 292 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
293 | #texinfo_show_urls = 'footnote' 294 | 295 | # If true, do not generate a @detailmenu in the "Top" node's menu. 296 | #texinfo_no_detailmenu = False 297 | -------------------------------------------------------------------------------- /docs/source/explorer_api.rst: -------------------------------------------------------------------------------- 1 | lens.explorer API 2 | ================= 3 | 4 | .. currentmodule:: lens.explorer 5 | 6 | .. automodule:: lens.explorer 7 | 8 | .. autoclass:: lens.explorer.Explorer 9 | :members: 10 | 11 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Lens documentation master file, created by 2 | sphinx-quickstart on Tue Sep 27 10:49:51 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Lens 7 | ==== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | tutorial 13 | api 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/room_occupancy_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lens Tutorial\n", 8 | "\n", 9 | "\n", 10 | "*Lens* is a library for exploring data in Pandas DataFrames. It computes single\n", 11 | "column summary statistics and estimates the correlation between columns.\n", 12 | "\n", 13 | "We wrote *Lens* when we realised that the initial steps of acquiring a new\n", 14 | "dataset were almost formulaic: what data type is in this column? How many null\n", 15 | "values are there? Which columns are correlated? What's the distribution of this\n", 16 | "value? Lens calculates all this for you, and provides convenient visualisation\n", 17 | "of this information.\n", 18 | "\n", 19 | "You can use *Lens* to analyse new datasets as well as using it to compare how\n", 20 | "DataFrames change over time.\n", 21 | "\n", 22 | "## Using lens\n", 23 | "\n", 24 | "To start using *Lens* you need to import the library:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import lens" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "*Lens* has two key functions; ``lens.summarise`` for generating a Lens Summary from a DataFrame and\n", 41 | "``lens.explore`` for visualising the results of a summary.\n", 42 | "\n", 43 | "For this tutorial we are going to use *Lens* to analyse the Room Occupancy\n", 44 | "dataset provided in the [Machine Learning Repository of UC Irvine](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+). It includes \n", 45 | "ambient information about a room such as Temperature, Humidity,\n", 46 | "Light, CO2 and whether it was occupied. 
The goal is to\n", 47 | "predict occupancy based on the room measurements.\n", 48 | "\n", 49 | "We read the training portion of the dataset into pandas directly from the UCI repository:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 24, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import pandas as pd\n", 59 | "from urllib.request import urlopen\n", 60 | "from io import BytesIO\n", 61 | "from zipfile import ZipFile\n", 62 | "\n", 63 | "remote_zip = urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip')\n", 64 | "df = pd.read_csv(BytesIO(ZipFile(BytesIO(remote_zip.read())).read('datatraining.txt')))\n", 65 | "\n", 66 | "# Split a numerical variable to have additional categorical variables\n", 67 | "df['Humidity_cat'] = pd.cut(df['Humidity'], 5,\n", 68 | " labels=['low', 'medium-low', 'medium',\n", 69 | " 'medium-high', 'high']).astype('str')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "scrolled": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "print('Number of rows in dataset: {}'.format(len(df.index)))\n", 81 | "df.head()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Creating the summary\n", 89 | "\n", 90 | "When you have a DataFrame that you'd like to analyse the first thing to do is\n", 91 | "to create a Lens ``Summary`` object." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "ls = lens.summarise(df)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "The `summarise` function takes a DataFrame and returns a Lens ``Summary`` object. The\n", 108 | "time this takes to run is dependent on both the number of rows and the number of\n", 109 | "columns in the DataFrame. It will use all cores available on the machine, so you \n", 110 | "might want to use a SherlockML instance with more cores to speed up the computation \n", 111 | "of the summary. There are additional optional parameters that can be\n", 112 | "passed in. Details of these can be found in the [summarise API docs](https://docs.sherlockml.com/lens/summarise_api.html#lens.summarise.summarise).\n", 113 | "\n", 114 | "Given that creating the summary is computationally intensive, *Lens* provides a way to save this summary to a JSON file on disk and recover a saved summary through the `to_json` and `from_json` methods of `lens.summary`. This allows to store it for future analysis or to share it with collaborators:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Saving to JSON\n", 124 | "ls.to_json('room_occupancy_lens_summary.json')\n", 125 | "\n", 126 | "# Reading from a file\n", 127 | "ls_from_json = lens.Summary.from_json('room_occupancy_lens_summary.json')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "The `LensSummary` object contains the information computed from the dataset and provides methods to access both column-wise and whole dataset information. It is designed to be used programatically, and information about the methods can be accessed in the [LensSummary API docs](https://docs.sherlockml.com/lens/summarise_api.html#lens.summarise.Summary)." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print(ls.columns)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Create explorer\n", 151 | "\n", 152 | "Lens provides a function that converts a Lens Summary into an `Explorer` object.\n", 153 | "This can be used to see the summary information in tabular form and to display\n", 154 | "plots." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "explorer = lens.explore(ls)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Coming back to our room occupancy dataset, the first thing that we'd like to\n", 171 | "know is a high-level overview of the data." 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Describe\n", 179 | "\n", 180 | "To show a general description of the DataFrame call the `describe` function.\n", 181 | "This is similar to Pandas' ``DataFrame.describe`` but also shows information for non-numeric columns." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "explorer.describe()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "We can see that our dataset has 8143 rows and all the rows are complete. This\n", 198 | "is a very clean dataset! It also tells us the columns and their types, including a `desc` field that explains how *Lens* will treat this column." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Column details\n", 206 | "\n", 207 | "To see type-specific column details, use the `column_details` method. Used on a numeric column such as `Temperature`, it provides summary statistics for the data in that column, including minimun, maximum, mean, median, and standard deviation." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "explorer.column_details('Temperature')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "We saw in the ouput of `explorer.describe()` that `Occupancy`, our target variable, is a categorical column with two unique values. With `explorer.column_details` we can obtain a frequency table for these two categories - empty (0) or occupied (1):" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "explorer.column_details('Occupancy')" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Correlation\n", 240 | "\n", 241 | "As a first step in exploring the relationships between the columns we can look at the correlation coefficients. `explorer.correlation()` returns a Spearman rank-order correlation coefficient matrix in tabular form." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "explorer.correlation()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "However, parsing a correlation table becomes difficult when there are many columns in the dataset. To get a better overview, we can plot the correlation matrix as a heatmap, which immediately highlights a group of columns correlated with `Occupancy`: `Temperature`, `Light`, and `CO2`." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "explorer.correlation_plot()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### Distribution and Cumulative Distribution" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "We can explore the distribution of numerical variables through the `distribution_plot` and `cdf_plot` functions:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "explorer.distribution_plot('Temperature')" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "explorer.cdf_plot('Temperature')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Pairwise plot\n", 306 | "\n", 307 | "Once we know that certain columns might be correlated, it is useful to visually explore that correlation. This would typically be done through a scatter plot, and *Lens* has computed a 2D Kernel Density Estimate of the scatter plot that can be accessed through `pairwise_density_plot`." 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": false 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "explorer.pairwise_density_plot('Temperature', 'Humidity')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "`pairwise_density_plot` can also show the relationship between a numeric column and a categorical column. In this case, a 1D KDE is computed for each of the categories in the categorical column." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "explorer.pairwise_density_plot('Temperature', 'Occupancy')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "source": [ 343 | "### Crosstab\n", 344 | "\n", 345 | "The pairwise relationship between two categorical variables can also be seen as a cross-tabulation: how many observations exist in the dataset of the combination of categories in the two variables. This can be seen as a table or as a plot, which can be useful when the number of categories is very large." 
346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "explorer.crosstab('Occupancy', 'Humidity_cat')" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "explorer.pairwise_density_plot('Occupancy', 'Humidity_cat')" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## Interactive widget\n", 371 | "\n", 372 | "An alternative way of quickly exploring the plots available in *Lens* is through a Jupyter widget provided by `lens.interactive_explore`. Creating it is as easy as running this function on a *Lens* `Summary`.\n", 373 | "\n", 374 | "Note that if you are reading this tutorial through the online docs the output of the following cell will not be interactive as it needs to run within a notebook. Download the notebook from the links below to try out the interactive explorer!" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "scrolled": false 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "lens.interactive_explore(ls)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.6.3" 413 | }, 414 | "widgets": { 415 | "state": { 416 | "496016afdc01477d9f515f2c073feb4f": { 417 | "views": [ 418 | { 419 | "cell_index": 36 420 | } 421 | ] 422 | } 423 | }, 424 | "version": "1.2.0" 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 1 429 | } 430 | -------------------------------------------------------------------------------- /docs/source/sphinxext/notebook_sphinxext.py: -------------------------------------------------------------------------------- 1 | """This is a modified version of ``notebook_sphinxext.py`` 2 | 3 | The original was from: https://github.com/ngoldbaum/RunNotebook as of commit 4 | 944f983 5 | 6 | This is the license for RunNotebook: 7 | 8 | Copyright (c) 2013 Nathan Goldbaum. All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are 12 | met: 13 | 14 | * Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | * Redistributions in binary form must reproduce the above 17 | copyright notice, this list of conditions and the following disclaimer 18 | in the documentation and/or other materials provided with the 19 | distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | """ 33 | 34 | import errno 35 | import nbformat 36 | import os 37 | import shutil 38 | import tempfile 39 | import uuid 40 | from sphinx.util.compat import Directive 41 | from docutils import nodes 42 | from docutils.parsers.rst import directives 43 | from traitlets.config import Config 44 | from nbconvert import html, python 45 | 46 | 47 | class NotebookDirective(Directive): 48 | """Insert an evaluated notebook into a document 49 | 50 | This uses runipy and nbconvert to transform a path to an unevaluated notebook 51 | into html suitable for embedding in a Sphinx document. 52 | """ 53 | required_arguments = 1 54 | optional_arguments = 1 55 | option_spec = {'skip_exceptions': directives.flag} 56 | final_argument_whitespace = True 57 | 58 | def run(self): # check if there are spaces in the notebook name 59 | nb_path = self.arguments[0] 60 | if ' ' in nb_path: 61 | raise ValueError( 62 | "Due to issues with docutils stripping spaces from links, white " 63 | "space is not allowed in notebook filenames '{0}'".format( 64 | nb_path)) 65 | # check if raw html is supported 66 | if not self.state.document.settings.raw_enabled: 67 | raise self.warning('"%s" directive disabled.' % self.name) 68 | 69 | cwd = os.getcwd() 70 | tmpdir = tempfile.mkdtemp() 71 | os.chdir(tmpdir) 72 | 73 | # get path to notebook 74 | nb_filename = self.arguments[0] 75 | nb_basename = os.path.basename(nb_filename) 76 | rst_file = self.state_machine.document.attributes['source'] 77 | rst_dir = os.path.abspath(os.path.dirname(rst_file)) 78 | nb_abs_path = os.path.abspath(os.path.join(rst_dir, nb_filename)) 79 | 80 | # Move files around. 
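# (The following block copies the unevaluated notebook into the Sphinx build output directory and derives destination paths for the evaluated copy and the exported Python script.)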
81 | rel_dir = os.path.relpath(rst_dir, setup.confdir) 82 | dest_dir = os.path.join(setup.app.builder.outdir, rel_dir) 83 | dest_path = os.path.join(dest_dir, nb_basename) 84 | 85 | image_dir, image_rel_dir = make_image_dir(setup, rst_dir) 86 | 87 | # Ensure desination build directory exists 88 | thread_safe_mkdir(os.path.dirname(dest_path)) 89 | 90 | # Copy unevaluated notebook 91 | shutil.copyfile(nb_abs_path, dest_path) 92 | 93 | # Construct paths to versions getting copied over 94 | dest_path_eval = dest_path.replace('.ipynb', '_evaluated.ipynb') 95 | dest_path_script = dest_path.replace('.ipynb', '.py') 96 | rel_path_eval = nb_basename.replace('.ipynb', '_evaluated.ipynb') 97 | rel_path_script = nb_basename.replace('.ipynb', '.py') 98 | 99 | # Create python script vesion 100 | script_text = nb_to_python(nb_abs_path) 101 | f = open(dest_path_script, 'wb') 102 | f.write(script_text.encode('utf8')) 103 | f.close() 104 | 105 | skip_exceptions = 'skip_exceptions' in self.options 106 | 107 | evaluated_text, resources = evaluate_notebook( 108 | nb_abs_path, dest_path_eval, skip_exceptions=skip_exceptions) 109 | 110 | evaluated_text = write_notebook_output( 111 | resources, image_dir, image_rel_dir, evaluated_text) 112 | 113 | # Create link to notebook and script files 114 | link_rst = "(" + \ 115 | formatted_link(nb_basename) + "; " + \ 116 | formatted_link(rel_path_eval) + "; " + \ 117 | formatted_link(rel_path_script) + \ 118 | ")" 119 | 120 | self.state_machine.insert_input([link_rst], rst_file) 121 | 122 | # create notebook node 123 | attributes = {'format': 'html', 'source': 'nb_path'} 124 | nb_node = notebook_node('', evaluated_text, **attributes) 125 | (nb_node.source, nb_node.line) = \ 126 | self.state_machine.get_source_and_line(self.lineno) 127 | 128 | # add dependency 129 | self.state.document.settings.record_dependencies.add(nb_abs_path) 130 | 131 | # clean up 132 | os.chdir(cwd) 133 | shutil.rmtree(tmpdir, True) 134 | 135 | return [nb_node] 136 | 137 | 138 | class notebook_node(nodes.raw): 139 | pass 140 | 141 | 142 | def nb_to_python(nb_path): 143 | """convert notebook to python script""" 144 | exporter = python.PythonExporter() 145 | output, resources = exporter.from_filename(nb_path) 146 | return output 147 | 148 | 149 | def nb_to_html(nb_path, skip_exceptions): 150 | """convert notebook to html""" 151 | 152 | nbconvert_config = Config({ 153 | 'ExtractOutputPreprocessor': {'enabled': True}, 154 | 'ExecutePreprocessor': { 155 | 'enabled': True, 156 | # make this configurable? 157 | 'timeout': 3600, 158 | } 159 | }) 160 | 161 | if skip_exceptions is False: 162 | nbconvert_config['ExecutePreprocessor']['allow_errors'] = True 163 | 164 | exporter = html.HTMLExporter(template_file='full', config=nbconvert_config) 165 | notebook = nbformat.read(nb_path, nbformat.NO_CONVERT) 166 | output, resources = exporter.from_notebook_node(notebook) 167 | header = output.split('', 1)[1].split('', 1)[0] 168 | body = output.split('', 1)[1].split('', 1)[0] 169 | 170 | # http://imgur.com/eR9bMRH 171 | header = header.replace(''] 208 | lines.append(header) 209 | lines.append(body) 210 | lines.append('') 211 | return '\n'.join(lines), resources, notebook 212 | 213 | 214 | def evaluate_notebook(nb_path, dest_path, skip_exceptions=True): 215 | # Create evaluated version and save it to the dest path. 
216 | lines, resources, notebook = nb_to_html(nb_path, skip_exceptions) 217 | nbformat.write(notebook, dest_path) 218 | return lines, resources 219 | 220 | 221 | def formatted_link(path): 222 | return "`%s <%s>`__" % (os.path.basename(path), path) 223 | 224 | 225 | def visit_notebook_node(self, node): 226 | self.visit_raw(node) 227 | 228 | 229 | def depart_notebook_node(self, node): 230 | self.depart_raw(node) 231 | 232 | 233 | def setup(app): 234 | setup.app = app 235 | setup.config = app.config 236 | setup.confdir = app.confdir 237 | 238 | app.add_node(notebook_node, 239 | html=(visit_notebook_node, depart_notebook_node)) 240 | 241 | app.add_directive('notebook', NotebookDirective) 242 | 243 | retdict = dict( 244 | version='0.1', 245 | parallel_read_safe=True, 246 | parallel_write_safe=True 247 | ) 248 | 249 | return retdict 250 | 251 | 252 | def make_image_dir(setup, rst_dir): 253 | image_dir = setup.app.builder.outdir + os.path.sep + '_images' 254 | rel_dir = os.path.relpath(setup.confdir, rst_dir) 255 | image_rel_dir = rel_dir + os.path.sep + '_images' 256 | thread_safe_mkdir(image_dir) 257 | return image_dir, image_rel_dir 258 | 259 | 260 | def write_notebook_output(resources, image_dir, image_rel_dir, evaluated_text): 261 | my_uuid = uuid.uuid4().hex 262 | 263 | for output in resources['outputs']: 264 | new_name = image_dir + os.path.sep + my_uuid + output 265 | new_relative_name = image_rel_dir + os.path.sep + my_uuid + output 266 | evaluated_text = evaluated_text.replace(output, new_relative_name) 267 | with open(new_name, 'wb') as f: 268 | f.write(resources['outputs'][output]) 269 | return evaluated_text 270 | 271 | 272 | def thread_safe_mkdir(dirname): 273 | try: 274 | os.makedirs(dirname) 275 | except OSError as e: 276 | if e.errno != errno.EEXIST: 277 | raise 278 | pass 279 | -------------------------------------------------------------------------------- /docs/source/summarise_api.rst: -------------------------------------------------------------------------------- 1 | lens.summarise API 2 | ================== 3 | 4 | .. currentmodule:: lens.summarise 5 | 6 | .. automodule:: lens.summarise 7 | 8 | .. autofunction:: lens.summarise.summarise 9 | 10 | .. autoclass:: lens.summarise.Summary 11 | :members: 12 | 13 | -------------------------------------------------------------------------------- /docs/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | Lens Tutorial 2 | ============= 3 | 4 | We have prepared a Lens tutorial in the form of a Jupyter notebook. A static 5 | version is reproduced below, but you can also execute it yourself by downloading 6 | :download:`the notebook file <./room_occupancy_example.ipynb>`. 7 | 8 | .. 
notebook:: room_occupancy_example.ipynb 9 | -------------------------------------------------------------------------------- /it/test_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | 5 | import boto3 6 | import lens 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | import os 11 | import inspect 12 | 13 | S3 = boto3.client("s3") 14 | BUCKET = "asi-lens-test-data" 15 | 16 | datasets = [ 17 | "room_occupancy.csv", 18 | "artworks-5k.csv", 19 | "air-quality-london-time-of-day.csv", 20 | "momaExhibitions-5k.csv", 21 | "noheader.csv", 22 | "monthly-milk-production.csv", 23 | "customer-data.csv", 24 | ] 25 | 26 | dirname = os.path.dirname( 27 | os.path.abspath(inspect.getfile(inspect.currentframe())) 28 | ) 29 | result_dir = os.path.join(dirname, "generated_reports") 30 | 31 | if not os.path.exists(result_dir): 32 | os.mkdir(result_dir) 33 | 34 | 35 | @pytest.mark.parametrize("input_", datasets) 36 | def test_summary_regression(input_): 37 | # load the input into a pandas dataframe 38 | df = pd.read_csv("s3://{}/input/{}".format(BUCKET, input_)) 39 | 40 | # run the lens summarise method 41 | summary = lens.summarise(df) 42 | 43 | # Save generated report 44 | summary.to_json(os.path.join(result_dir, input_.replace(".csv", ".json"))) 45 | 46 | # load the expected output file into a summary object 47 | output = input_.replace(".csv", ".json") 48 | s3_summary = read_s3_file(BUCKET, "output/{}".format(output))[ 49 | "Body" 50 | ].read() 51 | 52 | if isinstance(s3_summary, bytes): 53 | s3_summary = s3_summary.decode("utf-8") 54 | 55 | expected_summary = json.loads(s3_summary) 56 | 57 | # list of keys to ignore from the response because they are 58 | # probablistically generated 59 | exclude = [ 60 | "_run_time", 61 | "tdigest", 62 | "density", 63 | "bw", 64 | "logtrans_IQR", 65 | "kde", 66 | "_lens_version", 67 | ] 68 | 69 | diffs = find_diff( 70 | json.loads(json.dumps(summary._report)), expected_summary, exclude 71 | ) 72 | 73 | for diff in diffs: 74 | print(diff) 75 | 76 | if len(diffs): 77 | # Save expected report to check the differences manually if needed 78 | exp_name = os.path.join( 79 | result_dir, output.replace(".json", "-expected.json") 80 | ) 81 | with open(exp_name, "w") as f: 82 | f.write(s3_summary) 83 | 84 | # compare the input and output summary objects 85 | assert len(diffs) == 0 86 | 87 | 88 | def read_s3_file(bucket, key): 89 | return S3.get_object(Bucket=BUCKET, Key=key) 90 | 91 | 92 | def find_diff(d1, d2, exclude=[], path="", update_path=True): 93 | diffs = [] 94 | for k in d1.keys(): 95 | if k in exclude: 96 | continue 97 | 98 | if k not in d2: 99 | msg = "{} :\n {} as key not in d2".format(path, k) 100 | diffs.append(msg) 101 | else: 102 | new_path = path 103 | if update_path: 104 | if new_path == "": 105 | new_path = k 106 | else: 107 | new_path = new_path + "->" + k 108 | 109 | if isinstance(d1[k], dict): 110 | diffs = diffs + find_diff(d1[k], d2[k], exclude, new_path) 111 | elif isinstance(d1[k], list): 112 | # convert the list to a dict using the index as the key. 
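# This reuses the dict-based recursion for nested lists; update_path is passed as False below so that list indices are not appended to the reported path.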
113 | diffs = diffs + find_diff( 114 | list_to_dict(d1[k]), 115 | list_to_dict(d2[k]), 116 | exclude, 117 | new_path, 118 | False, 119 | ) 120 | else: 121 | a = d1[k] 122 | b = d2[k] 123 | if not isinstance(a, float) or not ( 124 | np.isnan(a) and np.isnan(b) 125 | ): 126 | if isinstance(a, float): 127 | if not np.allclose(a, b): 128 | msg = "{} :\n - {} : {}\n + {} : {}".format( 129 | path, k, a, k, b 130 | ) 131 | diffs.append(msg) 132 | elif a != b: 133 | msg = "{} :\n - {} : {}\n + {} : {}".format( 134 | path, k, a, k, b 135 | ) 136 | diffs.append(msg) 137 | 138 | return diffs 139 | 140 | 141 | def list_to_dict(list_): 142 | dict_ = {} 143 | for index, item in enumerate(list_): 144 | dict_[index] = item 145 | 146 | return dict_ 147 | -------------------------------------------------------------------------------- /it/update_reference_reports.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import boto3 4 | import os 5 | 6 | S3 = boto3.client("s3") 7 | BUCKET = "asi-lens-test-data" 8 | 9 | 10 | def check_report_bom(name): 11 | """Check whether the report has Byte Order Marks 12 | 13 | Parameters 14 | ---------- 15 | name : str 16 | Filename of the report. 17 | """ 18 | with open(name, "r") as f: 19 | report_json = f.read() 20 | if report_json.count("\\ufeff"): 21 | print( 22 | " WARNING: {} Byte-order-Marks found in report! This might" 23 | " provoke failures on codeship. Have you checked that the csv" 24 | " is encoded in UTF8 without BOM?".format( 25 | report_json.count("\\ufeff") 26 | ) 27 | ) 28 | 29 | 30 | if __name__ == "__main__": 31 | from test_regression import datasets 32 | 33 | for dataset in datasets: 34 | report_name = dataset.replace(".csv", ".json") 35 | report_path = os.path.join("generated_reports", report_name) 36 | check_report_bom(report_path) 37 | S3.upload_file( 38 | report_path, Bucket=BUCKET, Key="output/{}".format(report_name) 39 | ) 40 | -------------------------------------------------------------------------------- /lens/__init__.py: -------------------------------------------------------------------------------- 1 | """Summarise and explore Pandas DataFrames""" 2 | 3 | from lens.explorer import Explorer, explore 4 | from lens.summarise import Summary, summarise 5 | from lens.version import __version__ 6 | from lens.widget import interactive_explore 7 | 8 | __all__ = [ 9 | "Summary", 10 | "summarise", 11 | "Explorer", 12 | "explore", 13 | "interactive_explore", 14 | "__version__", 15 | ] 16 | -------------------------------------------------------------------------------- /lens/bins.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def freedman_diaconis_bins(data_range, iqr, number_samples): 5 | """ 6 | Calculate number of hist bins using Freedman-Diaconis rule. 
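    The rule sets the bin width to h = 2 * IQR / n**(1/3) and takes the number
    of bins as the data range divided by h (rounded up), falling back to
    sqrt(n) bins when the IQR, and therefore h, is zero.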
7 | 8 | Parameters 9 | ---------- 10 | 11 | data_range: float 12 | total range of the data 13 | 14 | iqr: float 15 | interquartile range of the data 16 | 17 | number_samples: int 18 | number of values in the data 19 | """ 20 | # From http://stats.stackexchange.com/questions/798/ 21 | # adapted from seaborn 22 | h = 2 * iqr / (float(number_samples) ** (1.0 / 3.0)) 23 | # fall back to sqrt(a) bins if iqr is 0 24 | if h == 0: 25 | return int(np.sqrt(number_samples)) 26 | else: 27 | return int(np.ceil(data_range) / h) 28 | -------------------------------------------------------------------------------- /lens/dask_graph.py: -------------------------------------------------------------------------------- 1 | """Build a Dask graph of Summary computation""" 2 | 3 | import itertools 4 | import pandas as pd 5 | from dask.delayed import delayed, Delayed 6 | from . import metrics 7 | 8 | 9 | def _nested_merge(first, second, path=None): 10 | """Merge two nested dictionaries into a single dictionary. 11 | 12 | Parameters 13 | ---------- 14 | first : dict 15 | The first dictionary. 16 | second : dict 17 | The second dictionary. 18 | path : TODO 19 | TODO 20 | 21 | Returns 22 | ------- 23 | dict 24 | The merged dictionary. 25 | """ 26 | if path is None: 27 | path = [] 28 | for key in second: 29 | if key in first: 30 | if isinstance(first[key], dict) and isinstance(second[key], dict): 31 | _nested_merge(first[key], second[key], path + [str(key)]) 32 | elif first[key] == second[key]: 33 | pass # Same leaf value. 34 | else: 35 | raise Exception( 36 | "Conflict at {}".format(".".join(path + [str(key)])) 37 | ) 38 | else: 39 | first[key] = second[key] 40 | return first 41 | 42 | 43 | @delayed(pure=True) 44 | def _join_dask_results(results): 45 | """Join a list of column-wise results into a single dictionary. 46 | 47 | The `_run_time` and `_columns` keys are appended to, whilst other 48 | keys are merged. 49 | 50 | Parameters 51 | ---------- 52 | results : list 53 | List of Dask results dictionaries to join. 54 | """ 55 | report = {"_run_time": 0.0, "_columns": []} 56 | 57 | for result in results: 58 | if isinstance(result, Delayed): 59 | result = result.compute() 60 | if result is not None: 61 | report["_run_time"] += result["_run_time"] 62 | report["_columns"] += result["_columns"] 63 | columns = result.keys() 64 | report = _nested_merge( 65 | report, 66 | { 67 | column: result[column] 68 | for column in columns 69 | if column not in ["_columns", "_run_time"] 70 | }, 71 | ) 72 | 73 | report["_columns"] = sorted(list(set(report["_columns"]))) 74 | 75 | return report 76 | 77 | 78 | def create_dask_graph(df, pairdensities=True): 79 | """Create a Dask graph for executing the summary generation. 80 | 81 | Parameters 82 | ---------- 83 | df : pd.DataFrame 84 | The DataFrame for which to generate the summary. 85 | 86 | pairdensities : bool, optional 87 | Whether to compute the pairdensity estimation between all pairs of 88 | numerical columns. For most datasets, this is the most expensive 89 | computation. Default is True. 90 | 91 | Returns 92 | ------- 93 | dict 94 | The generated data summary. 95 | """ 96 | # Create a series for each column in the DataFrame. 97 | columns = df.columns 98 | df = delayed(df) 99 | cols = {k: delayed(df.get)(k) for k in columns} 100 | 101 | # Create the delayed reports using Dask. 
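    # Illustrative usage sketch (with a made-up DataFrame): every call below is
    # wrapped in dask.delayed, so no metric runs until the returned delayed
    # dict is evaluated, e.g.
    #
    #     graph = create_dask_graph(pd.DataFrame({"a": [1, 2, 3]}))
    #     report = graph.compute()  # executes all metric tasks in one go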
102 | row_c = delayed(metrics.row_count)(df) 103 | 104 | cprops = {k: delayed(metrics.column_properties)(cols[k]) for k in columns} 105 | joined_cprops = _join_dask_results(list(cprops.values())) 106 | 107 | freqs = { 108 | k: delayed(metrics.frequencies)(cols[k], cprops[k]) for k in columns 109 | } 110 | joined_freqs = _join_dask_results(list(freqs.values())) 111 | 112 | csumms = { 113 | k: delayed(metrics.column_summary)(cols[k], cprops[k]) for k in columns 114 | } 115 | joined_csumms = _join_dask_results(list(csumms.values())) 116 | 117 | out = {k: delayed(metrics.outliers)(cols[k], csumms[k]) for k in columns} 118 | joined_outliers = _join_dask_results(list(out.values())) 119 | 120 | corr = delayed(metrics.correlation)(df, joined_cprops) 121 | 122 | pdens_results = [] 123 | if pairdensities: 124 | for col1, col2 in itertools.combinations(columns, 2): 125 | pdens_df = delayed(pd.concat)([cols[col1], cols[col2]], axis=1) 126 | pdens_cp = {k: cprops[k] for k in [col1, col2]} 127 | pdens_cs = {k: csumms[k] for k in [col1, col2]} 128 | pdens_fr = {k: freqs[k] for k in [col1, col2]} 129 | pdens = delayed(metrics.pairdensity)( 130 | pdens_df, pdens_cp, pdens_cs, pdens_fr 131 | ) 132 | pdens_results.append(pdens) 133 | 134 | joined_pairdensities = _join_dask_results(pdens_results) 135 | 136 | # Join the delayed reports per-metric into a dictionary. 137 | dask_dict = delayed(dict)( 138 | row_count=row_c, 139 | column_properties=joined_cprops, 140 | frequencies=joined_freqs, 141 | column_summary=joined_csumms, 142 | outliers=joined_outliers, 143 | correlation=corr, 144 | pairdensity=joined_pairdensities, 145 | _columns=list(columns), 146 | ) 147 | 148 | return dask_dict 149 | -------------------------------------------------------------------------------- /lens/explorer.py: -------------------------------------------------------------------------------- 1 | """Explore a Summary""" 2 | 3 | import sys 4 | import logging 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | import plotly.tools 10 | import plotly.offline as py 11 | 12 | from lens.summarise import Summary 13 | from lens.formatting import JupyterTable 14 | from lens.plotting import ( 15 | plot_distribution, 16 | plot_pairdensity, 17 | plot_correlation, 18 | plot_cdf, 19 | ) 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.StreamHandler()) 23 | 24 | # Check whether we are in a notebook environment 25 | # this is a false positive if we are in the Jupyter console 26 | IN_NOTEBOOK = "ipykernel" in sys.modules 27 | 28 | PLOTLY_TO_MPL_KWS = {"strip_style": True, "resize": True} 29 | 30 | PLOTLY_KWS = {"show_link": False} 31 | 32 | 33 | def _render(fig, showlegend=None): 34 | """Plot a matploltib or plotly figure""" 35 | if isinstance(fig, plt.Figure): 36 | fig = plotly.tools.mpl_to_plotly(fig, **PLOTLY_TO_MPL_KWS) 37 | 38 | if showlegend is not None: 39 | fig.layout["showlegend"] = showlegend 40 | 41 | if not IN_NOTEBOOK: 42 | message = "Lens explorer can only plot in a Jupyter notebook" 43 | logger.error(message) 44 | raise ValueError(message) 45 | else: 46 | if not py.offline.__PLOTLY_OFFLINE_INITIALIZED: 47 | py.init_notebook_mode() 48 | return py.iplot(fig, **PLOTLY_KWS) 49 | 50 | 51 | class Explorer(object): 52 | """An explorer to visualise a Lens Summary 53 | 54 | Once a Lens ``Summary`` has been generated with 55 | :func:`lens.summarise.summarise`, this class provides the methods necessary 56 | to explore the summary though tables and plots. 
It is best used from within 57 | a Jupyter notebook. 58 | """ 59 | 60 | # Number of points to show in the CDF plot 61 | _N_cdf = 1000 62 | 63 | def __init__(self, summary, plot_renderer=_render): 64 | if not isinstance(summary, Summary): 65 | raise TypeError("Can only explore a lens Summary") 66 | self.summary = summary 67 | self.plot_renderer = plot_renderer 68 | 69 | def describe(self): 70 | """General description of the dataset. 71 | 72 | Produces a table including the following information about each column: 73 | 74 | ``desc`` 75 | the type of data: currently ``categorical`` or ``numeric``. 76 | Lens will calculate different quantities for this column 77 | depending on the value of ``desc``. 78 | 79 | ``dtype`` 80 | the type of data in Pandas. 81 | 82 | ``name`` 83 | column name 84 | 85 | ``notnulls`` 86 | number of non-null values in the column 87 | 88 | ``nulls`` 89 | number of null-values in the column 90 | 91 | ``unique`` 92 | number of unique values in the column 93 | """ 94 | summary = self.summary 95 | columns = summary.columns 96 | 97 | header = [""] 98 | header.extend(columns) 99 | 100 | desc = ["desc"] 101 | desc.extend([summary._desc(column) for column in columns]) 102 | 103 | dtype = ["dtype"] 104 | dtype.extend([summary.summary(column)["dtype"] for column in columns]) 105 | 106 | notnulls = ["notnulls"] 107 | notnulls.extend( 108 | [summary.summary(column)["notnulls"] for column in columns] 109 | ) 110 | 111 | nulls = ["nulls"] 112 | nulls.extend([summary.summary(column)["nulls"] for column in columns]) 113 | 114 | unique = ["unique"] 115 | unique.extend( 116 | [summary.summary(column)["unique"] for column in columns] 117 | ) 118 | 119 | return JupyterTable([header, desc, dtype, notnulls, nulls, unique]) 120 | 121 | def column_details(self, column, sort=False): 122 | """Show type-specific column details. 123 | 124 | For numeric columns, this method produces a table with summary 125 | statistics, including minimum, maximum, mean, and median. For 126 | categorical columns, it produces a frequency table for each category 127 | sorted in descending order of frequency. 128 | 129 | Parameters 130 | ---------- 131 | column : str 132 | Name of the column. 133 | sort : boolean, optional 134 | Sort frequency tables in categorical variables by 135 | category name. 136 | """ 137 | details = self.summary.details(column) 138 | desc = details["desc"] 139 | 140 | if desc == "numeric": 141 | caption = "" 142 | data = [ 143 | ["", details["name"]], 144 | ["desc", details["desc"]], 145 | ["dtype", self.summary.summary(column)["dtype"]], 146 | ["min", details["min"]], 147 | ["max", details["max"]], 148 | ["mean", details["mean"]], 149 | ["median", details["median"]], 150 | ["std", details["std"]], 151 | ["sum", details["sum"]], 152 | ["IQR", details["iqr"]], 153 | ] 154 | return JupyterTable(data) 155 | elif desc == "categorical": 156 | caption = "
desc: {}, dtype: {}
".format( 157 | details["desc"], self.summary.summary(column)["dtype"] 158 | ) 159 | data = [["item", "frequency"]] 160 | frequencies = [] 161 | for item, frequency in details["frequencies"].items(): 162 | frequencies.append([item, frequency]) 163 | if sort: 164 | data.extend(sorted(frequencies, key=lambda x: x[0])) 165 | else: 166 | data.extend(sorted(frequencies, key=lambda x: -x[1])) 167 | else: 168 | caption = "" 169 | data = [ 170 | ["", details["name"]], 171 | ["desc", details["desc"]], 172 | ["dtype", self.summary.summary(column)["dtype"]], 173 | ] 174 | 175 | return JupyterTable(data, caption=caption) 176 | 177 | def distribution(self, column): 178 | """Show properties of the distribution of values in the column. 179 | 180 | Parameters 181 | ---------- 182 | column : str 183 | Name of the column. 184 | """ 185 | raise NotImplementedError 186 | 187 | def distribution_plot(self, column, bins=None): 188 | """Plot the distribution of a numeric column. 189 | 190 | Create a plotly plot with a histogram of the values in a column. The 191 | number of bin in the histogram is decided according to the 192 | Freedman-Diaconis rule unless given by the `bins` parameter. 193 | 194 | Parameters 195 | ---------- 196 | column : str 197 | Name of the column. 198 | bins : int, optional 199 | Number of bins to use for histogram. If not given, the 200 | Freedman-Diaconis rule will be used to estimate the best number of 201 | bins. This argument also accepts the formats taken by the `bins` 202 | parameter of matplotlib's :function:`~matplotlib.pyplot.hist`. 203 | """ 204 | ax = plot_distribution(self.summary, column, bins) 205 | self.plot_renderer(ax) 206 | 207 | def cdf_plot(self, column): 208 | """Plot the empirical cumulative distribution function of a column. 209 | 210 | Creates a plotly plot with the empirical CDF of a column. 211 | 212 | Parameters 213 | ---------- 214 | column : str 215 | Name of the column. 216 | """ 217 | ax = plot_cdf(self.summary, column, self._N_cdf) 218 | self.plot_renderer(ax) 219 | 220 | def crosstab(self, column1, column2): 221 | """Show a contingency table of two categorical columns. 222 | 223 | Print a contingency table for two categorical variables showing the 224 | multivariate frequancy distribution of the columns. 225 | 226 | Parameters 227 | ---------- 228 | column1 : str 229 | First column. 230 | column2 : str 231 | Second column. 
232 | """ 233 | pair_details = self.summary.pair_details(column1, column2) 234 | 235 | for column in [column1, column2]: 236 | column_details = self.summary.details(column) 237 | if column_details["desc"] != "categorical": 238 | raise ValueError( 239 | "Column `{}` is not categorical".format(column) 240 | ) 241 | 242 | pair_details = self.summary.pair_details(column1, column2) 243 | pairdensity = pair_details["pairdensity"] 244 | 245 | # Convert to numpy arrays for ease of reindexing 246 | x = np.array(pairdensity["x"]) 247 | y = np.array(pairdensity["y"]) 248 | crosstab = np.array(pairdensity["density"]) 249 | 250 | # Sort by first column category names 251 | idx = np.argsort(x) 252 | x = x[idx] 253 | crosstab = crosstab[:, idx] 254 | 255 | # Sort by second column category names 256 | idx = np.argsort(y) 257 | y = y[idx] 258 | crosstab = crosstab[idx] 259 | 260 | table = [[""] + x.tolist()] 261 | for y_category, crosstab_row in zip(y, crosstab): 262 | table.append([y_category] + crosstab_row.tolist()) 263 | 264 | return JupyterTable(table) 265 | 266 | def pairwise_density_plot(self, column1, column2): 267 | """Plot the pairwise density between two columns. 268 | 269 | This plot is an approximation of a scatterplot through a 2D Kernel 270 | Density Estimate for two numerical variables. When one of the variables 271 | is categorical, a 1D KDE for each of the categories is shown, 272 | normalised to the total number of non-null observations. For two 273 | categorical variables, the plot produced is a heatmap representation of 274 | the contingency table. 275 | 276 | Parameters 277 | ---------- 278 | column1 : str 279 | First column. 280 | column2 : str 281 | Second column. 282 | """ 283 | allowed_descriptions = ["numeric", "categorical"] 284 | for column in [column1, column2]: 285 | column_description = self.summary.summary(column)["desc"] 286 | if column_description not in allowed_descriptions: 287 | raise ValueError( 288 | "Column {} is not numeric or categorical".format(column) 289 | ) 290 | 291 | fig = plot_pairdensity(self.summary, column1, column2) 292 | self.plot_renderer(fig) 293 | 294 | def correlation_plot(self, include=None, exclude=None): 295 | """Plot the correlation matrix for numeric columns 296 | 297 | Plot a Spearman rank order correlation coefficient matrix showing the 298 | correlation between columns. The matrix is reordered to group together 299 | columns that have a higher correlation coefficient. The columns to be 300 | plotted in the correlation plot can be selected through either the 301 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 302 | given. 303 | 304 | Parameters 305 | ---------- 306 | 307 | include : list of str 308 | List of columns to include in the correlation plot. 309 | exclude : list of str 310 | List of columns to exclude from the correlation plot. 311 | """ 312 | fig = plot_correlation(self.summary, include, exclude) 313 | self.plot_renderer(fig) 314 | 315 | def correlation(self, include=None, exclude=None): 316 | """Show the correlation matrix for numeric columns. 317 | 318 | Print a Spearman rank order correlation coefficient matrix in tabular 319 | form, showing the correlation between columns. The matrix is reordered 320 | to group together columns that have a higher correlation coefficient. 321 | The columns to be shown in the table can be selected 322 | through either the ``include`` or ``exclude`` keyword arguments. Only 323 | one of them can be given. 
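        For example, ``correlation(include=["a", "b"])`` would restrict the
        table to the correlation between two (here hypothetical) columns named
        ``a`` and ``b``.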
324 | 325 | Parameters 326 | ---------- 327 | 328 | include : list of str 329 | List of columns to include in the correlation plot. 330 | exclude : list of str 331 | List of columns to exclude from the correlation plot. 332 | """ 333 | columns, correlation_matrix = self.summary.correlation_matrix( 334 | include, exclude 335 | ) 336 | headers = [""] + columns 337 | rows = [] 338 | for column, correlation_row in zip(columns, correlation_matrix): 339 | rows.append([column] + correlation_row.tolist()) 340 | return JupyterTable([headers] + rows) 341 | 342 | 343 | def explore(summary): 344 | """Create an Explorer instance from a Lens Summary""" 345 | return Explorer(summary) 346 | -------------------------------------------------------------------------------- /lens/formatting.py: -------------------------------------------------------------------------------- 1 | """Table formatting for Jupyter notebooks""" 2 | 3 | # Copyright (c) 2012-2013, Eric Moyer 4 | # Copyright (c) 2016-2019, Faculty Science Limited 5 | # 6 | # All rights reserved. 7 | # 8 | # Redistribution and use in source and binary forms, with or without 9 | # modification, are permitted provided that the following conditions are 10 | # met: 11 | # 12 | # Redistributions of source code must retain the above copyright notice, 13 | # this list of conditions and the following disclaimer. 14 | # 15 | # Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # 19 | # Neither the name of the ipy_table Development Team nor the names of 20 | # its contributors may be used to endorse or promote products derived 21 | # from this software without specific prior written permission. 22 | # 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | 35 | import sys 36 | 37 | 38 | class JupyterTable(object): 39 | 40 | _valid_borders = {"left", "right", "top", "bottom", "all"} 41 | 42 | def __init__(self, array, caption=""): 43 | self.array = array 44 | self.caption = caption 45 | 46 | self._num_rows = len(array) 47 | self._num_columns = len(array[0]) 48 | 49 | # Check that array is well formed 50 | for row in array: 51 | if len(row) != self._num_columns: 52 | raise ValueError("Array rows must all be of equal length.") 53 | 54 | self._cell_styles = [ 55 | [{"float_format": "%0.4f"} for dummy in range(self._num_columns)] 56 | for dummy2 in range(self._num_rows) 57 | ] 58 | 59 | def _repr_html_(self): 60 | """Jupyter display protocol: HTML representation. 61 | 62 | The Jupyter display protocol calls this method to get the HTML 63 | representation of this object. 
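        For example, when a ``JupyterTable`` instance is the result of a
        notebook cell, Jupyter invokes this method and renders the returned
        HTML string in place of the default ``repr``.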
64 | """ 65 | # Generate TABLE tag () 66 | html = ( 67 | self.caption 68 | + '' 70 | ) 71 | 72 | for row, row_data in enumerate(self.array): 73 | 74 | # Generate ROW tag () 75 | html += "" 76 | for (column, item) in enumerate(row_data): 77 | if not _key_is_valid( 78 | self._cell_styles[row][column], "suppress" 79 | ): 80 | 81 | # Generate CELL tag (" 101 | html += "" 102 | html += "
) 82 | # Apply floating point formatter to the cell contents 83 | # (if it is a float) 84 | item_html = self._formatter( 85 | item, self._cell_styles[row][column] 86 | ) 87 | 88 | # Add bold and italic tags if set 89 | if _key_is_valid(self._cell_styles[row][column], "bold"): 90 | item_html = "" + item_html + "" 91 | if _key_is_valid(self._cell_styles[row][column], "italic"): 92 | item_html = "" + item_html + "" 93 | 94 | # Get html style string 95 | style_html = self._get_style_html( 96 | self._cell_styles[row][column] 97 | ) 98 | 99 | # Append cell 100 | html += "" + item_html + "
" 103 | return html 104 | 105 | def _get_style_html(self, style_dict): 106 | """Parse the style dictionary and return equivalent html style text.""" 107 | style_html = "" 108 | if _key_is_valid(style_dict, "color"): 109 | style_html += "background-color:" + style_dict["color"] + ";" 110 | 111 | if _key_is_valid(style_dict, "thick_border"): 112 | for edge in self._split_by_comma(style_dict["thick_border"]): 113 | style_html += "border-%s: 3px solid black;" % edge 114 | 115 | if _key_is_valid(style_dict, "no_border"): 116 | for edge in self._split_by_comma(style_dict["no_border"]): 117 | style_html += "border-%s: 1px solid transparent;" % edge 118 | 119 | if _key_is_valid(style_dict, "align"): 120 | style_html += "text-align:" + str(style_dict["align"]) + ";" 121 | 122 | if _key_is_valid(style_dict, "width"): 123 | style_html += "width:" + str(style_dict["width"]) + "px;" 124 | 125 | if style_html: 126 | style_html = ' style="' + style_html + '"' 127 | 128 | if _key_is_valid(style_dict, "row_span"): 129 | style_html = ( 130 | 'rowspan="' + str(style_dict["row_span"]) + '";' + style_html 131 | ) 132 | 133 | if _key_is_valid(style_dict, "column_span"): 134 | style_html = ( 135 | 'colspan="' 136 | + str(style_dict["column_span"]) 137 | + '";' 138 | + style_html 139 | ) 140 | 141 | # Prepend a space if non-blank 142 | if style_html: 143 | return " " + style_html 144 | return "" 145 | 146 | def _formatter(self, item, cell_style): 147 | """Apply formatting to cell contents. 148 | 149 | Applies float format to item if item is a float or float64. 150 | Converts spaces to non-breaking if wrap is not enabled. 151 | Returns string. 152 | """ 153 | 154 | # The following check is performed as a string comparison 155 | # so that ipy_table does not need to require (import) numpy. 156 | if ( 157 | str(type(item)) in ["", ""] 158 | and "float_format" in cell_style 159 | ): 160 | text = cell_style["float_format"] % item 161 | else: 162 | if isinstance(item, str): 163 | text = item 164 | else: 165 | text = str(item) 166 | 167 | if sys.version_info.major < 3: 168 | # QA disabled as unicode is a NameError in Python 3. 
169 | text = unicode(text, encoding="utf-8") # noqa 170 | 171 | # If cell wrapping is not specified 172 | if not ("wrap" in cell_style and cell_style["wrap"]): 173 | # Convert all spaces to non-breaking and return 174 | text = text.replace(" ", " ") 175 | return text 176 | 177 | def _split_by_comma(self, comma_delimited_text): 178 | """Returns a list of the words in the comma delimited text.""" 179 | return comma_delimited_text.replace(" ", "").split(",") 180 | 181 | 182 | def _key_is_valid(dictionary, key): 183 | """Test that a dictionary key exists and that its value is not blank.""" 184 | if key in dictionary: 185 | if dictionary[key]: 186 | return True 187 | return False 188 | -------------------------------------------------------------------------------- /lens/metrics.py: -------------------------------------------------------------------------------- 1 | """Metrics for the computation of a Lens summary""" 2 | 3 | from __future__ import division 4 | 5 | import logging 6 | import time 7 | from functools import wraps 8 | 9 | from tdigest import TDigest 10 | import numpy as np 11 | from scipy import stats 12 | from scipy import signal 13 | import pandas as pd 14 | 15 | from .utils import hierarchical_ordering_indices 16 | 17 | DENSITY_N = 100 18 | LOGNORMALITY_P_THRESH = 0.05 19 | CAT_FRAC_THRESHOLD = 0.5 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.StreamHandler()) 23 | 24 | 25 | def timeit(func): 26 | """Decorator to time callable execution and add it to the report. 27 | 28 | Parameters 29 | ---------- 30 | func : callable 31 | The callable to execute. 32 | 33 | Returns 34 | ------- 35 | callable 36 | Decorated function. 37 | """ 38 | 39 | @wraps(func) 40 | def decorator(*args, **kwargs): 41 | tstart = time.time() 42 | report = func(*args, **kwargs) 43 | if report is not None: 44 | report["_run_time"] = time.time() - tstart 45 | return report 46 | 47 | return decorator 48 | 49 | 50 | @timeit 51 | def row_count(df): 52 | """Count number of total and unique rows. 53 | 54 | Parameters 55 | ---------- 56 | df : pd.DataFrame 57 | A DataFrame. 58 | 59 | Returns 60 | ------- 61 | dict 62 | Dictionary with `total` and `unique` keys. 63 | """ 64 | report = {} 65 | report["total"] = len(df.index) 66 | report["unique"] = len(df.drop_duplicates().index) 67 | return report 68 | 69 | 70 | @timeit 71 | def column_properties(series): 72 | """Infer properties of a Pandas Series. 73 | 74 | Parameters 75 | ---------- 76 | series : pd.Series 77 | Series to infer properties of. 78 | 79 | Returns 80 | ------- 81 | dict 82 | Dictionary of inferred properties. 83 | """ 84 | cat_N_threshold = {"object": 1000, "int64": 10, "float64": 10} 85 | 86 | name = series.name 87 | colresult = {} 88 | colresult["dtype"] = str(series.dtype) 89 | nulls = series.isnull().sum() 90 | colresult["nulls"] = int(nulls) if not np.isnan(nulls) else 0 91 | notnulls = series.dropna() 92 | 93 | colresult["notnulls"] = len(notnulls.index) 94 | colresult["numeric"] = ( 95 | series.dtype in [np.float64, np.int64] and colresult["notnulls"] > 0 96 | ) 97 | unique = notnulls.unique().size 98 | colresult["unique"] = unique 99 | colresult["is_categorical"] = False 100 | if ( 101 | colresult["dtype"] in {"object", "int64", "float64"} 102 | and colresult["notnulls"] > 0 103 | ): 104 | # In Pandas integers with nulls are cast as floats, so we have 105 | # to include floats as possible categoricals to detect 106 | # categorical integers. 
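        # (Illustrative example: a float64 column holding only the values 1.0
        # and 2.0 repeated over many rows has two unique values, so it can
        # still pass the categorical test below despite its numeric dtype.)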
107 | colresult["is_categorical"] = ( 108 | unique / colresult["notnulls"] <= CAT_FRAC_THRESHOLD 109 | ) and (unique <= cat_N_threshold[colresult["dtype"]]) 110 | logger.debug( 111 | "Column {:15}: {:6} unique, {:6} notnulls, {:6} total" 112 | " --> {}categorical".format( 113 | name, 114 | unique, 115 | colresult["notnulls"], 116 | colresult["notnulls"] + colresult["nulls"], 117 | "NOT " * (not colresult["is_categorical"]), 118 | ) 119 | ) 120 | 121 | # Don't use the is_ID field for now: 122 | # it's too prone to false positives. 123 | # If a columns is wrongly identified as ID-like, 124 | # it doesn't get analyzed 125 | colresult["is_ID"] = False 126 | 127 | return {name: colresult, "_columns": [name]} 128 | 129 | 130 | def _tdigest_mean(digest): 131 | """TODO 132 | 133 | Parameters 134 | ---------- 135 | digest : tdigest.TDigest 136 | t-digest data structure. 137 | 138 | Returns 139 | ------- 140 | TODO 141 | """ 142 | means = [c.mean for c in digest.C.values()] 143 | counts = [c.count for c in digest.C.values()] 144 | return np.average(means, weights=counts) 145 | 146 | 147 | def _tdigest_std(digest): 148 | """TODO 149 | 150 | Parameters 151 | ---------- 152 | digest : tdigest.TDigest 153 | t-digest data structure. 154 | 155 | Returns 156 | ------- 157 | TODO 158 | """ 159 | mean = _tdigest_mean(digest) 160 | sums = [(x.mean - mean) ** 2 * x.count for x in digest.C.values()] 161 | return np.sqrt(np.sum(sums) / digest.n) 162 | 163 | 164 | def _tdigest_normalise(digest): 165 | """TODO 166 | 167 | Parameters 168 | ---------- 169 | digest : tdigest.TDigest 170 | t-digest data structure. 171 | 172 | Returns 173 | ------- 174 | TODO 175 | """ 176 | m = _tdigest_mean(digest) 177 | s = _tdigest_std(digest) 178 | ndigest = TDigest() 179 | for x in digest.C.values(): 180 | ndigest.update((x.mean - m) / s, x.count) 181 | return ndigest 182 | 183 | 184 | def _tdigest_norm_kstest(digest): 185 | """TODO 186 | 187 | Parameters 188 | ---------- 189 | digest : tdigest.TDigest 190 | t-digest data structure. 191 | 192 | Returns 193 | ------- 194 | TODO 195 | """ 196 | normdigest = _tdigest_normalise(digest) 197 | 198 | x = np.linspace(-3, 3, 500) 199 | dig_q = np.array([normdigest.cdf(xx) for xx in x]) 200 | norm_q = stats.norm.cdf(x) 201 | 202 | D = np.max(np.abs(dig_q - norm_q)) 203 | 204 | if digest.n > 3000: 205 | return D, stats.distributions.kstwobign.sf(D * np.sqrt(digest.n)) 206 | else: 207 | return D, 2 * stats.distributions.ksone.sf(D, digest.n) 208 | 209 | 210 | def _test_logtrans(digest): 211 | """ 212 | Test if t-digest distribution is more normal when log-transformed. 213 | 214 | Test whether a log-transform improves normality of data with a 215 | simplified Kolmogorov-Smirnov two-sided test (the location and scale 216 | of the normal distribution are estimated from the median and 217 | standard deviation of the data). 218 | 219 | Parameters 220 | ---------- 221 | digest : tdigest.TDigest 222 | t-digest data structure. 
223 | 224 | Returns 225 | ------- 226 | TODO 227 | """ 228 | if digest.percentile(0) <= 0: 229 | return False 230 | 231 | logdigest = TDigest() 232 | for c in digest.C.values(): 233 | logdigest.update(np.log(c.mean), c.count) 234 | 235 | lKS, lp = _tdigest_norm_kstest(logdigest) 236 | KS, p = _tdigest_norm_kstest(digest) 237 | logger.debug( 238 | "KSnorm: log: {:.2g}, {:.2g}; linear: {:.2g}, {:.2g}".format( 239 | lKS, lp, KS, p 240 | ) 241 | ) 242 | 243 | return ( 244 | (lKS < KS) 245 | and (lp > p) 246 | and (lp > LOGNORMALITY_P_THRESH) 247 | and (p < LOGNORMALITY_P_THRESH) 248 | ) 249 | 250 | 251 | @timeit 252 | def column_summary(series, column_props, delta=0.01): 253 | """Summarise a numeric column. 254 | 255 | Parameters 256 | ---------- 257 | series : pd.Series 258 | Numeric column. 259 | column_props : TODO 260 | TODO 261 | delta : float 262 | TODO 263 | 264 | Returns 265 | ------- 266 | TODO 267 | """ 268 | col = series.name 269 | if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0: 270 | # Series is not numeric or is all NaNs. 271 | return None 272 | 273 | logger.debug("column_summary - " + col) 274 | 275 | # select non-nulls from column 276 | data = series.dropna() 277 | 278 | colresult = {} 279 | for m in ["mean", "min", "max", "std", "sum"]: 280 | val = getattr(data, m)() 281 | if type(val) is np.int64: 282 | colresult[m] = int(val) 283 | else: 284 | colresult[m] = val 285 | 286 | colresult["n"] = column_props[col]["notnulls"] 287 | 288 | percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9] 289 | colresult["percentiles"] = { 290 | perc: np.nanpercentile(series, perc) for perc in percentiles 291 | } 292 | colresult["median"] = colresult["percentiles"][50] 293 | colresult["iqr"] = ( 294 | colresult["percentiles"][75] - colresult["percentiles"][25] 295 | ) 296 | 297 | # Compute the t-digest. 298 | logger.debug("column_summary - {} - creating TDigest...".format(col)) 299 | digest = TDigest(delta) 300 | digest.batch_update(data) 301 | 302 | logger.debug("column_summary - {} - testing log trans...".format(col)) 303 | try: 304 | colresult["logtrans"] = bool(_test_logtrans(digest)) 305 | except Exception as e: 306 | # Hard to pinpoint problems with the logtrans TDigest. 307 | logger.warning( 308 | "test_logtrans has failed for column `{}`: {}".format(col, e) 309 | ) 310 | colresult["logtrans"] = False 311 | 312 | if colresult["logtrans"]: 313 | logdigest = TDigest() 314 | for c in digest.C.values(): 315 | logdigest.update(np.log(c.mean), c.count) 316 | colresult["logtrans_mean"] = _tdigest_mean(logdigest) 317 | colresult["logtrans_std"] = _tdigest_std(logdigest) 318 | colresult["logtrans_IQR"] = logdigest.percentile( 319 | 75 320 | ) - logdigest.percentile(25) 321 | 322 | logger.debug( 323 | "column_summary - {} - should {}be log-transformed".format( 324 | col, "NOT " if not colresult["logtrans"] else "" 325 | ) 326 | ) 327 | 328 | # Compress and store the t-digest. 
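    # (Compression merges neighbouring centroids, keeping the list of
    # (mean, count) pairs stored below small enough to embed in the serialised
    # report while still approximating the full distribution.)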
329 | digest.delta = delta 330 | digest.compress() 331 | colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()] 332 | 333 | # Compute histogram 334 | logger.debug("column_summary - {} - computing histogram...".format(col)) 335 | 336 | if column_props[col]["is_categorical"]: 337 | # Compute frequency table and store as histogram 338 | counts, edges = _compute_histogram_from_frequencies(data) 339 | else: 340 | if colresult["logtrans"]: 341 | counts, log_edges = np.histogram( 342 | np.log10(data), density=False, bins="fd" 343 | ) 344 | edges = 10 ** log_edges 345 | else: 346 | counts, edges = np.histogram(data, density=False, bins="fd") 347 | 348 | colresult["histogram"] = { 349 | "counts": counts.tolist(), 350 | "bin_edges": edges.tolist(), 351 | } 352 | 353 | # Compute KDE 354 | logger.debug("column_summary - {} - computing KDE...".format(col)) 355 | bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1) 356 | 357 | logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw)) 358 | 359 | if column_props[col]["is_categorical"]: 360 | kde_x, kde_y = np.zeros(1), np.zeros(1) 361 | else: 362 | coord_range = colresult["min"], colresult["max"] 363 | kde_x, kde_y = _compute_smoothed_histogram( 364 | data, bw, coord_range, logtrans=colresult["logtrans"] 365 | ) 366 | 367 | colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()} 368 | 369 | return {col: colresult, "_columns": [col]} 370 | 371 | 372 | def _compute_histogram_from_frequencies(series): 373 | """Compute histogram from frequencies 374 | 375 | This method uses the frequencies dict to produce a histogram data structure 376 | with emtpy bins where the difference between the category values is larger 377 | than 1 378 | 379 | Parameters 380 | ---------- 381 | series : pd.Series 382 | Categorical column.a 383 | 384 | Returns 385 | ------- 386 | counts, edges: 387 | Histogram bin edges and counts in each bin. 388 | """ 389 | freqs = _compute_frequencies(series) 390 | categories = sorted(freqs.keys()) 391 | diffs = list(np.diff(categories)) + [1] 392 | edges = [categories[0] - 0.5] 393 | counts = [] 394 | for cat, diff in zip(categories, diffs): 395 | if diff <= 1: 396 | edges.append(cat + diff / 2.0) 397 | counts.append(freqs[cat]) 398 | else: 399 | edges += [cat + 0.5, cat + diff - 0.5] 400 | counts += [freqs[cat], 0] 401 | 402 | return np.array(counts), np.array(edges) 403 | 404 | 405 | def _compute_frequencies(series): 406 | """Helper to compute frequencies of a categorical column 407 | 408 | Parameters 409 | ---------- 410 | series : pd.Series 411 | Categorical column.a 412 | 413 | Returns 414 | ------- 415 | dict: 416 | Dictionary from category name to count. 417 | """ 418 | freqs = series.value_counts() 419 | if freqs.index.dtype == np.int64: 420 | categories = [int(index) for index in freqs.index] 421 | elif freqs.index.dtype == np.float64: 422 | categories = [float(index) for index in freqs.index] 423 | else: 424 | categories = freqs.index 425 | return dict(zip(categories, freqs.values.tolist())) 426 | 427 | 428 | @timeit 429 | def frequencies(series, column_props): 430 | """Compute frequencies for categorical columns. 431 | 432 | Parameters 433 | ---------- 434 | series : pd.Series 435 | Categorical column. 
436 | column_props : dict 437 | Dictionary as returned by `column_properties` 438 | 439 | Returns 440 | ------- 441 | TODO 442 | """ 443 | name = series.name 444 | 445 | if column_props[name]["is_categorical"]: 446 | logger.debug("frequencies - " + series.name) 447 | freqs = _compute_frequencies(series) 448 | return {name: freqs, "_columns": [name]} 449 | else: 450 | return None 451 | 452 | 453 | @timeit 454 | def outliers(series, column_summ): 455 | """Count outliers for numeric columns. 456 | 457 | Parameters 458 | ---------- 459 | series : pd.Series 460 | Numeric column. 461 | column_summ : TODO 462 | TODO 463 | 464 | Returns 465 | ------- 466 | TODO 467 | """ 468 | name = series.name 469 | if column_summ is None: 470 | # Not a numeric column. 471 | return None 472 | else: 473 | column_summ = column_summ[name] 474 | 475 | Q1, Q3 = [column_summ["percentiles"][p] for p in [25, 75]] 476 | IQR = Q3 - Q1 477 | # Mild outlier limits. 478 | lom = Q1 - 1.5 * IQR 479 | him = Q3 + 1.5 * IQR 480 | # Extreme outlier limits. 481 | lox = Q1 - 3.0 * IQR 482 | hix = Q3 + 3.0 * IQR 483 | 484 | nn = series.dropna() 485 | 486 | Nmildlo = len(nn[(nn < lom) & (nn > lox)].index) 487 | Nmildhi = len(nn[(nn > him) & (nn < hix)].index) 488 | Nextrlo = len(nn[nn < lox].index) 489 | Nextrhi = len(nn[nn > hix].index) 490 | 491 | return { 492 | name: {"mild": [Nmildlo, Nmildhi], "extreme": [Nextrlo, Nextrhi]}, 493 | "_columns": [name], 494 | } 495 | 496 | 497 | @timeit 498 | def correlation(df, column_props): 499 | """Compute correlation table between non-ID numeric variables. 500 | 501 | Parameters 502 | ---------- 503 | df : pd.DataFrame 504 | DataFrame. 505 | column_props : TODO 506 | TODO 507 | 508 | Returns 509 | ------- 510 | dict 511 | Dictionary containing correlation coefficients. 512 | """ 513 | 514 | cols = [ 515 | col 516 | for col in df.columns 517 | if (column_props[col]["numeric"] and not column_props[col]["is_ID"]) 518 | ] 519 | 520 | numdf = df[cols] 521 | pcorr = numdf.corr(method="pearson", min_periods=5) 522 | scorr = numdf.corr(method="spearman", min_periods=5) 523 | 524 | report = {} 525 | report["_columns"] = list(numdf.columns) 526 | report["pearson"] = np.array(pcorr).tolist() 527 | report["spearman"] = np.array(scorr).tolist() 528 | 529 | report["order"] = hierarchical_ordering_indices( 530 | numdf.columns, scorr.values 531 | ) 532 | 533 | return report 534 | 535 | 536 | def _compute_smoothed_histogram( 537 | values, bandwidth, coord_range, logtrans=False 538 | ): 539 | """Approximate 1-D density estimation. 540 | 541 | Estimate 1-D probability densities at evenly-spaced grid points, 542 | for specified data. This method is based on creating a 1-D histogram of 543 | data points quantised with respect to evenly-spaced grid points. 544 | Probability densities are then estimated at the grid points by convolving 545 | the obtained histogram with a Gaussian kernel. 546 | 547 | Parameters 548 | ---------- 549 | values : np.array (N,) 550 | A vector containing the data for which to perform density estimation. 551 | Successive data points are indexed by the first axis in the array. 552 | bandwidth : float 553 | The desired KDE bandwidth. (When log-transformation 554 | of data is desired, bandwidth should be specified in log-space.) 555 | coord_range: (2,) 556 | Minimum and maximum values of coordinate on which to evaluate the 557 | smoothed histogram. 558 | logtrans : boolean 559 | Whether or not to log-transform the data before performing density 560 | estimation. 
561 | 562 | Returns 563 | ------- 564 | np.array (M-1,) 565 | An array of estimated probability densities at specified grid points. 566 | """ 567 | if logtrans: 568 | ber = [np.log10(extreme) for extreme in coord_range] 569 | bin_edges = np.logspace(*ber, num=DENSITY_N + 1) 570 | bin_edge_range = ber[1] - ber[0] 571 | else: 572 | bin_edges = np.linspace(*coord_range, num=DENSITY_N + 1) 573 | bin_edge_range = coord_range[1] - coord_range[0] 574 | 575 | if values.size < 2: 576 | # Return zeros if there are too few points to do anything useful. 577 | return bin_edges[:-1], np.zeros(bin_edges.shape[0] - 1) 578 | 579 | # Bin the values 580 | H = np.histogram(values, bin_edges)[0] 581 | 582 | relative_bw = bandwidth / bin_edge_range 583 | K = _compute_gaussian_kernel(H.shape, relative_bw) 584 | 585 | pdf = signal.fftconvolve(H, K, mode="same") 586 | 587 | # Return lower edges of bins and normalized pdf 588 | return bin_edges[:-1], pdf / np.trapz(pdf, bin_edges[:-1]) 589 | 590 | 591 | def _compute_smoothed_histogram2d( 592 | values, bandwidth, coord_ranges, logtrans=False 593 | ): 594 | """Approximate 2-D density estimation. 595 | 596 | Estimate 2-D probability densities at evenly-spaced grid points, 597 | for specified data. This method is based on creating a 2-D histogram of 598 | data points quantised with respect to evenly-spaced grid points. 599 | Probability densities are then estimated at the grid points by convolving 600 | the obtained histogram with a Gaussian kernel. 601 | 602 | Parameters 603 | ---------- 604 | values : np.array (N,2) 605 | A 2-D array containing the data for which to perform density 606 | estimation. Successive data points are indexed by the first axis in the 607 | array. The second axis indexes x and y coordinates of data points 608 | (values[:,0] and values[:,1] respectively). 609 | bandwidth : array-like (2,) 610 | The desired KDE bandwidths for x and y axes. (When log-transformation 611 | of data is desired, bandwidths should be specified in log-space.) 612 | coord_range: (2,2) 613 | Minimum and maximum values of coordinates on which to evaluate the 614 | smoothed histogram. 615 | logtrans : array-like (2,) 616 | A 2-element boolean array specifying whether or not to log-transform 617 | the x or y coordinates of the data before performing density 618 | estimation. 619 | 620 | Returns 621 | ------- 622 | np.array (M-1, M-1) 623 | An array of estimated probability densities at specified grid points. 
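
    Notes
    -----
    The lower bin edges of both axes are returned alongside the density array,
    mirroring the 1-D variant above.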
624 | """ 625 | bin_edges = [] 626 | bedge_range = [] 627 | for minmax, lt in zip(coord_ranges, logtrans): 628 | if lt: 629 | ber = [np.log10(extreme) for extreme in minmax] 630 | bin_edges.append(np.logspace(*ber, num=DENSITY_N + 1)) 631 | bedge_range.append(ber[1] - ber[0]) 632 | else: 633 | bin_edges.append(np.linspace(*minmax, num=DENSITY_N + 1)) 634 | bedge_range.append(minmax[1] - minmax[0]) 635 | 636 | # Bin the observations 637 | H = np.histogram2d(values[:, 0], values[:, 1], bins=bin_edges)[0] 638 | 639 | relative_bw = [bw / berange for bw, berange in zip(bandwidth, bedge_range)] 640 | K = _compute_gaussian_kernel(H.shape, relative_bw) 641 | 642 | pdf = signal.fftconvolve(H.T, K, mode="same") 643 | 644 | # Normalize pdf 645 | bin_centers = [edges[:-1] + np.diff(edges) / 2.0 for edges in bin_edges] 646 | pdf /= np.trapz(np.trapz(pdf, bin_centers[1]), bin_centers[0]) 647 | 648 | # Return lower bin edges and density 649 | return bin_edges[0][:-1], bin_edges[1][:-1], pdf 650 | 651 | 652 | def _compute_gaussian_kernel(histogram_shape, relative_bw): 653 | """Compute a gaussian kernel double the size of the histogram matrix""" 654 | if len(histogram_shape) == 2: 655 | kernel_shape = [2 * n for n in histogram_shape] 656 | # Create a scaled grid in which the kernel is symmetric to avoid matrix 657 | # inversion problems when the bandwiths are very different 658 | bw_ratio = relative_bw[0] / relative_bw[1] 659 | bw = relative_bw[0] 660 | X, Y = np.mgrid[ 661 | -bw_ratio : bw_ratio : kernel_shape[0] * 1j, 662 | -1 : 1 : kernel_shape[1] * 1j, 663 | ] 664 | grid_points = np.vstack([X.ravel(), Y.ravel()]).T 665 | Cov = np.array(((bw, 0), (0, bw))) ** 2 666 | K = stats.multivariate_normal.pdf(grid_points, mean=(0, 0), cov=Cov) 667 | 668 | return K.reshape(kernel_shape) 669 | else: 670 | grid = np.mgrid[-1 : 1 : histogram_shape[0] * 2j] 671 | return stats.norm.pdf(grid, loc=0, scale=relative_bw) 672 | 673 | 674 | def _bw_scott(column_summ, N, logtrans, d): 675 | """Scott's rule of thumb for KDE kernel bandwidth. 676 | 677 | Parameters 678 | ---------- 679 | column_summ : dict 680 | Dictionary as returned by `column_summary`. 681 | N : int 682 | Number of elements in the series for which the KDE is to be 683 | evaluated. 684 | logtrans : bool 685 | Whether the series is assumed to be 'exponential' (True) or 686 | 'linear' (False). An 'exponential' series (representing, e.g. 687 | income) is log-transformed before the KDE. The bandwidth 688 | therefore needs to be estimated for the log transformed series. 689 | d : int 690 | Dimension of the KDE. 691 | 692 | Returns 693 | ------- 694 | float 695 | Estimate of the kernel bandwidth for the KDE. 696 | """ 697 | if N == 0: 698 | return 0 699 | 700 | norm = 1.349 # norm.ppf(0.75) - norm.ppf(0.25) 701 | if logtrans: 702 | std, IQR = column_summ["logtrans_std"], column_summ["logtrans_IQR"] 703 | factor = 2 704 | else: 705 | std, IQR = column_summ["std"], column_summ["iqr"] 706 | factor = 1.4 707 | 708 | if IQR > 0: 709 | iqr_estimate = min(IQR / norm, std) 710 | elif std > 0: 711 | iqr_estimate = std 712 | else: 713 | iqr_estimate = 1.0 714 | 715 | bandwidth = 1.06 * iqr_estimate * N ** (-1.0 / (4.0 + d)) 716 | 717 | return bandwidth / factor 718 | 719 | 720 | @timeit 721 | def pairdensity(df, column_props, column_summ, freq, log_transform=True): 722 | """Compute a variable pair heatmap. 723 | 724 | Parameters 725 | ---------- 726 | df : pd.DataFrame 727 | DataFrame with the columns for which the pair density is 728 | computed. 
729 | column_props : dict 730 | Column properties dictionary with at least col1 and col2, as 731 | returned by `column_properties`. 732 | column_summ : dict 733 | Column summary dictionary with at least col1 and col2, as 734 | returned by `column_summary`. 735 | freq : dict 736 | Frequencies dictionary with at least col1 and col2. 737 | log_transform : bool 738 | Whether to compute the KDE in log-space when needed. 739 | 740 | Returns 741 | ------- 742 | TODO 743 | """ 744 | col1, col2 = df.columns 745 | 746 | # Test that both columns have valid entries and are either 747 | # categorical or numeric, returning None if not. 748 | column_props = {col: column_props[col][col] for col in [col1, col2]} 749 | for col in [col1, col2]: 750 | if ( 751 | not ( 752 | column_props[col]["is_categorical"] 753 | or column_props[col]["numeric"] 754 | ) 755 | or column_props[col]["notnulls"] == 0 756 | ): 757 | return None 758 | 759 | report = {"_columns": [col1, col2], col1: {}} 760 | 761 | log_string = "pairdensity - {} - {}".format(col1, col2) 762 | logger.debug("{}".format(log_string)) 763 | 764 | data = df.dropna() 765 | N = len(data.index) 766 | 767 | coord_ranges, scales, categories = [], [], [] 768 | bandwidths = [None, None] 769 | for col in [col1, col2]: 770 | if column_props[col]["is_categorical"]: 771 | scales.append("category") 772 | coord_ranges.append(None) 773 | categories.append(sorted(list(freq[col][col].keys()))) 774 | else: 775 | scales.append( 776 | "log" if column_summ[col][col]["logtrans"] else "linear" 777 | ) 778 | coord_ranges.append( 779 | [column_summ[col][col][extreme] for extreme in ["min", "max"]] 780 | ) 781 | categories.append(None) 782 | 783 | Ncat = np.sum([scale == "category" for scale in scales]) 784 | 785 | if N == 0: 786 | logger.warning("{}: No valid pairs found!".format(log_string)) 787 | 788 | if Ncat == 0: 789 | # 2D pair density is not useful with very few observations 790 | if N > 3: 791 | logtrans = [scale == "log" for scale in scales] 792 | 793 | bandwidths = [ 794 | _bw_scott(column_summ[col][col], N, lt, 2 - Ncat) 795 | for col, lt in zip([col1, col2], logtrans) 796 | ] 797 | 798 | x, y, density = _compute_smoothed_histogram2d( 799 | np.array(data), bandwidths, coord_ranges, logtrans=logtrans 800 | ) 801 | 802 | x, y = x.tolist(), y.tolist() 803 | else: 804 | x, y = coord_ranges 805 | density = np.zeros((2, 2)) 806 | 807 | elif Ncat == 1: 808 | # Split into categories and do a univariate KDE on each. 809 | if column_props[col1]["is_categorical"]: 810 | cats = categories[0] 811 | coord_range = coord_ranges[1] 812 | catcol, numcol, numcolsum = col1, col2, column_summ[col2][col2] 813 | logtrans = scales[1] == "log" 814 | else: 815 | cats = categories[1] 816 | coord_range = coord_ranges[0] 817 | catcol, numcol, numcolsum = col2, col1, column_summ[col1][col1] 818 | logtrans = scales[0] == "log" 819 | 820 | density = [] 821 | for cat in cats: 822 | # Filter data for this category. 823 | datacat = data[data[catcol] == cat][numcol] 824 | Nincat = datacat.count() 825 | 826 | # Recompute the bandwidth because the number of pairs in 827 | # this category might be lower than the total number of 828 | # pairs. 829 | num_bw = _bw_scott(numcolsum, Nincat, logtrans, 1) 830 | grid, catdensity = _compute_smoothed_histogram( 831 | datacat, num_bw, coord_range, logtrans=logtrans 832 | ) 833 | 834 | # Remove normalisation to normalise it later to the total 835 | # number of pairs. 
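            # (Each category's KDE is scaled by its own pair count here and
            # the whole array is divided by the grand total N a few lines
            # below, so frequent categories carry proportionally more weight.)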
836 | density.append(catdensity * Nincat) 837 | 838 | density = np.array(density) / N 839 | 840 | if column_props[col1]["is_categorical"]: 841 | density = density.T 842 | x, y = cats, grid.tolist() 843 | else: 844 | x, y = grid.tolist(), cats 845 | 846 | elif Ncat == 2: 847 | if N > 0: 848 | # Crosstab frequencies. 849 | dfcs = ( 850 | pd.crosstab(data[col2], data[col1]) 851 | .sort_index(axis=0) 852 | .sort_index(axis=1) 853 | ) 854 | 855 | x = [str(column) for column in dfcs.columns] 856 | if "" in x: 857 | x[x.index("")] = " Null" 858 | 859 | y = [str(index) for index in dfcs.index] 860 | if "" in y: 861 | y[y.index("")] = " Null" 862 | 863 | density = dfcs.get_values() 864 | else: 865 | x, y = categories 866 | density = np.zeros((len(x), len(y))) 867 | 868 | report[col1][col2] = { 869 | "density": density.tolist(), 870 | "axes": {col1: x, col2: y}, 871 | "bw": bandwidths, 872 | "scales": scales, 873 | } 874 | 875 | return report 876 | -------------------------------------------------------------------------------- /lens/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.ticker import FuncFormatter, MaxNLocator 3 | import numpy as np 4 | import plotly.graph_objs as go 5 | import seaborn as sns 6 | import plotly.figure_factory as pff 7 | 8 | DEFAULT_COLORSCALE = "Viridis" 9 | 10 | 11 | def plot_distribution(ls, column, bins=None): 12 | """Plot the distribution of numerical columns. 13 | 14 | Create a plotly plot with a histogram of the values in a column. The 15 | number of bin in the histogram is decided according to the 16 | Freedman-Diaconis rule unless given by the `bins` parameter. 17 | 18 | Parameters 19 | ---------- 20 | ls : :class:`~lens.Summary` 21 | Lens `Summary`. 22 | column : str 23 | Name of the column. 24 | bins : int, optional 25 | Number of bins to use for histogram. If not given, the 26 | Freedman-Diaconis rule will be used to estimate the best number of 27 | bins. This argument also accepts the formats taken by the `bins` 28 | parameter of matplotlib's :function:`~matplotlib.pyplot.hist`. 29 | 30 | Returns 31 | ------- 32 | :class:`~matplotlib.Axes` 33 | Matplotlib axes containing the distribution plot. 34 | """ 35 | column_summary = ls.summary(column) 36 | if column_summary["notnulls"] <= 2: 37 | # Plotly refuses to plot histograms if 38 | # the tdigest has too few values 39 | raise ValueError( 40 | "There are fewer than two non-null values in this column" 41 | ) 42 | 43 | if bins is None: 44 | counts, edges = ls.histogram(column) 45 | else: 46 | xs, counts = ls.tdigest_centroids(column) 47 | counts, edges = np.histogram(xs, weights=counts, bins=bins) 48 | 49 | fig, ax = plt.subplots() 50 | 51 | ax.bar( 52 | edges[:-1], counts, width=np.diff(edges), label=column, align="edge" 53 | ) 54 | 55 | ax.set_ylim(bottom=0) 56 | 57 | ax.set_xlabel(column) 58 | ax.set_title('Distribution of column "{}"'.format(column)) 59 | 60 | ax.figure.tight_layout() 61 | 62 | return fig 63 | 64 | 65 | def _set_integer_tick_labels(axis, labels): 66 | """Use labels dict to set labels on axis""" 67 | axis.set_major_formatter(FuncFormatter(lambda x, _: labels.get(x, ""))) 68 | axis.set_major_locator(MaxNLocator(integer=True)) 69 | 70 | 71 | def plot_pairdensity_mpl(ls, column1, column2): 72 | """Plot the pairwise density between two columns. 73 | 74 | This plot is an approximation of a scatterplot through a 2D Kernel 75 | Density Estimate for two numerical variables. 
When one of the variables 76 | is categorical, a 1D KDE for each of the categories is shown, 77 | normalised to the total number of non-null observations. For two 78 | categorical variables, the plot produced is a heatmap representation of 79 | the contingency table. 80 | 81 | Parameters 82 | ---------- 83 | ls : :class:`~lens.Summary` 84 | Lens `Summary`. 85 | column1 : str 86 | First column. 87 | column2 : str 88 | Second column. 89 | 90 | Returns 91 | ------- 92 | :class:`plt.Figure` 93 | Matplotlib figure containing the pairwise density plot. 94 | """ 95 | pair_details = ls.pair_details(column1, column2) 96 | pairdensity = pair_details["pairdensity"] 97 | 98 | x = np.array(pairdensity["x"]) 99 | y = np.array(pairdensity["y"]) 100 | Z = np.array(pairdensity["density"]) 101 | 102 | fig, ax = plt.subplots() 103 | 104 | if ls.summary(column1)["desc"] == "categorical": 105 | idx = np.argsort(x) 106 | x = x[idx] 107 | Z = Z[:, idx] 108 | # Create labels and positions for categorical axis 109 | x_labels = dict(enumerate(x)) 110 | _set_integer_tick_labels(ax.xaxis, x_labels) 111 | x = np.arange(-0.5, len(x), 1.0) 112 | 113 | if ls.summary(column2)["desc"] == "categorical": 114 | idx = np.argsort(y) 115 | y = y[idx] 116 | Z = Z[idx] 117 | y_labels = dict(enumerate(y)) 118 | _set_integer_tick_labels(ax.yaxis, y_labels) 119 | y = np.arange(-0.5, len(y), 1.0) 120 | 121 | X, Y = np.meshgrid(x, y) 122 | 123 | ax.pcolormesh(X, Y, Z, cmap=DEFAULT_COLORSCALE.lower()) 124 | 125 | ax.set_xlabel(column1) 126 | ax.set_ylabel(column2) 127 | 128 | ax.set_title(r"$\it{{ {} }}$ vs $\it{{ {} }}$".format(column1, column2)) 129 | 130 | return fig 131 | 132 | 133 | def plot_correlation_mpl(ls, include=None, exclude=None): 134 | """Plot the correlation matrix for numeric columns 135 | 136 | Plot a Spearman rank order correlation coefficient matrix showing the 137 | correlation between columns. The matrix is reordered to group together 138 | columns that have a higher correlation coefficient. The columns to be 139 | plotted in the correlation plot can be selected through either the 140 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 141 | given. 142 | 143 | Parameters 144 | ---------- 145 | ls : :class:`~lens.Summary` 146 | Lens `Summary`. 147 | include : list of str 148 | List of columns to include in the correlation plot. 149 | exclude : list of str 150 | List of columns to exclude from the correlation plot. 151 | 152 | Returns 153 | ------- 154 | :class:`plt.Figure` 155 | Matplotlib figure containing the pairwise density plot. 156 | """ 157 | 158 | columns, correlation_matrix = ls.correlation_matrix(include, exclude) 159 | num_cols = len(columns) 160 | 161 | if num_cols > 10: 162 | annotate = False 163 | else: 164 | annotate = True 165 | 166 | fig, ax = plt.subplots() 167 | sns.heatmap( 168 | correlation_matrix, 169 | annot=annotate, 170 | fmt=".2f", 171 | ax=ax, 172 | xticklabels=columns, 173 | yticklabels=columns, 174 | vmin=-1, 175 | vmax=1, 176 | cmap="RdBu_r", 177 | square=True, 178 | ) 179 | 180 | ax.xaxis.tick_top() 181 | 182 | # Enforces a width of 2.5 inches per cell in the plot, 183 | # unless this exceeds 10 inches. 184 | width_inches = len(columns) * 2.5 185 | while width_inches > 10: 186 | width_inches = 10 187 | 188 | fig.set_size_inches(width_inches, width_inches) 189 | 190 | return fig 191 | 192 | 193 | def plot_cdf(ls, column, N_cdf=100): 194 | """Plot the empirical cumulative distribution function of a column. 
195 | 196 | Creates a plotly plot with the empirical CDF of a column. 197 | 198 | Parameters 199 | ---------- 200 | ls : :class:`~lens.Summary` 201 | Lens `Summary`. 202 | column : str 203 | Name of the column. 204 | N_cdf : int 205 | Number of points in the CDF plot. 206 | 207 | Returns 208 | ------- 209 | :class:`~matplotlib.Axes` 210 | Matplotlib axes containing the distribution plot. 211 | """ 212 | tdigest = ls.tdigest(column) 213 | 214 | cdfs = np.linspace(0, 100, N_cdf) 215 | xs = [tdigest.percentile(p) for p in cdfs] 216 | 217 | fig, ax = plt.subplots() 218 | 219 | ax.set_ylabel("Percentile") 220 | ax.set_xlabel(column) 221 | ax.plot(xs, cdfs) 222 | 223 | if ls._report["column_summary"][column]["logtrans"]: 224 | ax.set_xscale("log") 225 | 226 | ax.set_title("Empirical Cumulative Distribution Function") 227 | 228 | return fig 229 | 230 | 231 | def plot_pairdensity(ls, column1, column2): 232 | """Plot the pairwise density between two columns. 233 | 234 | This plot is an approximation of a scatterplot through a 2D Kernel 235 | Density Estimate for two numerical variables. When one of the variables 236 | is categorical, a 1D KDE for each of the categories is shown, 237 | normalised to the total number of non-null observations. For two 238 | categorical variables, the plot produced is a heatmap representation of 239 | the contingency table. 240 | 241 | Parameters 242 | ---------- 243 | ls : :class:`~lens.Summary` 244 | Lens `Summary`. 245 | column1 : str 246 | First column. 247 | column2 : str 248 | Second column. 249 | 250 | Returns 251 | ------- 252 | :class:`plotly.Figure` 253 | Plotly figure containing the pairwise density plot. 254 | """ 255 | pair_details = ls.pair_details(column1, column2) 256 | pairdensity = pair_details["pairdensity"] 257 | 258 | x = np.array(pairdensity["x"]) 259 | y = np.array(pairdensity["y"]) 260 | Z = np.array(pairdensity["density"]) 261 | 262 | if ls.summary(column1)["desc"] == "categorical": 263 | idx = np.argsort(x) 264 | x = x[idx] 265 | Z = Z[:, idx] 266 | 267 | if ls.summary(column2)["desc"] == "categorical": 268 | idx = np.argsort(y) 269 | y = y[idx] 270 | Z = Z[idx] 271 | 272 | data = [go.Heatmap(z=Z, x=x, y=y, colorscale=DEFAULT_COLORSCALE)] 273 | layout = go.Layout(title="{} vs {}".format(column1, column2)) 274 | layout["xaxis"] = { 275 | "type": pairdensity["x_scale"], 276 | "autorange": True, 277 | "title": column1, 278 | } 279 | layout["yaxis"] = { 280 | "type": pairdensity["y_scale"], 281 | "autorange": True, 282 | "title": column2, 283 | } 284 | fig = go.Figure(data=data, layout=layout) 285 | fig.data[0]["showscale"] = False 286 | 287 | return fig 288 | 289 | 290 | def plot_correlation(ls, include=None, exclude=None): 291 | """Plot the correlation matrix for numeric columns 292 | 293 | Plot a Spearman rank order correlation coefficient matrix showing the 294 | correlation between columns. The matrix is reordered to group together 295 | columns that have a higher correlation coefficient. The columns to be 296 | plotted in the correlation plot can be selected through either the 297 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 298 | given. 299 | 300 | Parameters 301 | ---------- 302 | ls : :class:`~lens.Summary` 303 | Lens `Summary`. 304 | include : list of str 305 | List of columns to include in the correlation plot. 306 | exclude : list of str 307 | List of columns to exclude from the correlation plot. 
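plot_cdf above simply samples the column's t-digest at evenly spaced percentiles; the same curve can be traced by hand from the Summary API. A sketch, assuming `summary` is a lens Summary of the wine data and "chlorides" is one of its numeric columns:

    import numpy as np
    import matplotlib.pyplot as plt

    tdigest = summary.tdigest("chlorides")
    percentiles = np.linspace(0, 100, 100)
    values = [tdigest.percentile(p) for p in percentiles]

    fig, ax = plt.subplots()
    ax.plot(values, percentiles)  # value on x, cumulative percentile on y
    ax.set_xlabel("chlorides")
    ax.set_ylabel("Percentile")
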
308 | 309 | Returns 310 | ------- 311 | :class:`plotly.Figure` 312 | Plotly figure containing the pairwise density plot. 313 | """ 314 | 315 | columns, correlation_matrix = ls.correlation_matrix(include, exclude) 316 | num_cols = len(columns) 317 | 318 | if num_cols > 10: 319 | annotate = False 320 | else: 321 | annotate = True 322 | 323 | hover_text = [] 324 | for i in range(num_cols): 325 | hover_text.append( 326 | [ 327 | "Corr({}, {}) = {:.2g}".format( 328 | columns[i], columns[j], correlation_matrix[i, j] 329 | ) 330 | for j in range(num_cols) 331 | ] 332 | ) 333 | 334 | if annotate: 335 | t = np.reshape( 336 | ["{:.2g}".format(x) for x in correlation_matrix.flatten()], 337 | correlation_matrix.shape, 338 | )[::-1].tolist() 339 | else: 340 | nrows, ncolumns = correlation_matrix.shape 341 | t = [["" for i in range(nrows)] for j in range(ncolumns)] 342 | 343 | fig = pff.create_annotated_heatmap( 344 | z=correlation_matrix.tolist()[::-1], 345 | colorscale="RdBu", 346 | x=columns, 347 | y=columns[::-1], 348 | zmin=-1.0, 349 | zmax=1.0, 350 | annotation_text=t, 351 | text=hover_text[::-1], 352 | hoverinfo="text", 353 | ) 354 | w = len(columns) * 2.5 * 72 355 | while w > 600: 356 | w /= np.sqrt(1.4) 357 | fig.layout["width"] = w 358 | fig.layout["height"] = w 359 | fig.data[0]["showscale"] = True 360 | 361 | return fig 362 | -------------------------------------------------------------------------------- /lens/summarise.py: -------------------------------------------------------------------------------- 1 | """Summarise a Pandas DataFrame""" 2 | 3 | import json 4 | import logging 5 | import os 6 | import time 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import scipy 11 | 12 | from .dask_graph import create_dask_graph 13 | from .tdigest_utils import tdigest_from_centroids 14 | from .utils import hierarchical_ordering 15 | from .version import __version__ 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.addHandler(logging.StreamHandler()) 19 | 20 | 21 | class LensSummaryError(Exception): 22 | pass 23 | 24 | 25 | class EmptyDataFrameError(Exception): 26 | pass 27 | 28 | 29 | def _validate_report(report, schema_version): 30 | """Validates a dict report""" 31 | report_schema_version = report.get("_schema_version") 32 | if ( 33 | report_schema_version is not None 34 | and report_schema_version != schema_version 35 | ): 36 | raise LensSummaryError( 37 | "The version of the report schema `{}` does " 38 | "not match the schema version `{}` supported " 39 | "by this version of lens {}.".format( 40 | report_schema_version, schema_version, __version__ 41 | ) 42 | ) 43 | 44 | columns = report["_columns"] 45 | column_props = report["column_properties"] 46 | num_cols = [col for col in columns if (column_props[col]["numeric"])] 47 | for num_col in num_cols: 48 | if ( 49 | num_col not in report["column_summary"].keys() 50 | or num_col not in report["correlation"]["_columns"] 51 | or num_col not in report["outliers"].keys() 52 | ): 53 | raise LensSummaryError( 54 | "Column `{}` is marked as numeric but " 55 | "the report lacks its numeric summary" 56 | " and correlation".format(num_col) 57 | ) 58 | 59 | cat_cols = [col for col in columns if column_props[col]["is_categorical"]] 60 | for cat_col in cat_cols: 61 | if cat_col not in report["frequencies"].keys(): 62 | raise LensSummaryError( 63 | "Column `{}` is marked as categorical but " 64 | "the report lacks its frequency analysis".format(cat_col) 65 | ) 66 | 67 | 68 | class NumpyEncoder(json.JSONEncoder): 69 | def default(self, obj): 70 
| if isinstance(obj, np.integer): 71 | return int(obj) 72 | elif isinstance(obj, np.floating): 73 | return float(obj) 74 | elif isinstance(obj, np.ndarray): 75 | return obj.tolist() 76 | else: 77 | return super(NumpyEncoder, self).default(obj) 78 | 79 | 80 | class Summary(object): 81 | """A summary of a pandas DataFrame. 82 | 83 | Create a summary instance by calling :func:`lens.summarise.summarise` on a 84 | DataFrame. This calculates several quantities of interest to data 85 | scientists. 86 | 87 | The Summary object is designed for programmatic use. For more direct 88 | visual inspection, use the :class:`lens.explorer.Explorer` class 89 | in a Jupyter notebook. 90 | 91 | """ 92 | 93 | schema_version = 1 94 | 95 | def __init__(self, report): 96 | if not isinstance(report, dict): 97 | raise TypeError("report argument must be a dict") 98 | 99 | if "_schema_version" not in report.keys(): 100 | report["_schema_version"] = self.schema_version 101 | 102 | _validate_report(report, schema_version=self.schema_version) 103 | self._report = report 104 | 105 | @staticmethod 106 | def from_json(file): 107 | """Create a Summary from a report saved in JSON format. 108 | 109 | Parameters 110 | ---------- 111 | file : str or buffer 112 | Path to file containing the JSON report or buffer from which the 113 | report can be read. 114 | 115 | Returns 116 | ------- 117 | :class:`~lens.summarise.Summary` 118 | ``Summary`` object containing the summary in the JSON file. 119 | """ 120 | if hasattr(file, "read"): 121 | report = json.load(file) 122 | else: 123 | with open(file, "r") as f: 124 | report = json.load(f) 125 | 126 | return Summary(report) 127 | 128 | def to_json(self, file=None): 129 | """Produce a JSON serialization of the report. 130 | 131 | Parameters 132 | ---------- 133 | file : str or buffer, optional 134 | File name or writeable buffer to save the JSON report. If omitted, 135 | a string containing the report will be returned. 136 | 137 | Returns 138 | ------- 139 | str 140 | JSON serialization of the summary report 141 | """ 142 | if file is None: 143 | return json.dumps( 144 | self._report, separators=(",", ":"), cls=NumpyEncoder 145 | ) 146 | else: 147 | if hasattr(file, "write"): 148 | json.dump( 149 | self._report, file, separators=(",", ":"), cls=NumpyEncoder 150 | ) 151 | else: 152 | with open(file, "w") as f: 153 | json.dump( 154 | self._report, 155 | f, 156 | separators=(",", ":"), 157 | cls=NumpyEncoder, 158 | ) 159 | 160 | @property 161 | def columns(self): 162 | """Get a list of column names of the dataset. 163 | 164 | Returns 165 | ------- 166 | list 167 | Column names 168 | 169 | Examples 170 | -------- 171 | 172 | >>> summary.columns 173 | ['fixed acidity', 174 | 'volatile acidity', 175 | 'citric acid', 176 | 'residual sugar', 177 | 'chlorides', 178 | 'free sulfur dioxide', 179 | 'total sulfur dioxide', 180 | 'density', 181 | 'pH', 182 | 'sulphates', 183 | 'alcohol', 184 | 'quality'] 185 | """ 186 | return self._report["_columns"] 187 | 188 | @property 189 | def rows(self): 190 | """Get the number of rows in the dataset. 191 | 192 | Returns 193 | ------- 194 | int 195 | Number of rows 196 | 197 | Examples 198 | -------- 199 | 200 | >>> summary.rows 201 | 4898 202 | """ 203 | return self._report["row_count"]["total"] 204 | 205 | @property 206 | def rows_unique(self): 207 | """Get the number of unique rows in the dataset. 208 | 209 | Returns 210 | ------- 211 | int 212 | Number of unique rows. 
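Reports survive a JSON round trip via the methods above; a minimal sketch, assuming `summary` is an existing Summary and that `Summary` is importable from the top-level package as the docstrings suggest:

    import io
    import lens

    buf = io.StringIO()
    summary.to_json(buf)                  # NumpyEncoder converts numpy scalars/arrays
    buf.seek(0)
    restored = lens.Summary.from_json(buf)
    assert restored.columns == summary.columns
    assert restored.rows == summary.rows
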
213 | """ 214 | return self._report["row_count"]["unique"] 215 | 216 | def _desc(self, column): 217 | """Return the inferred description of a column. 218 | 219 | Parameters 220 | ---------- 221 | column : str 222 | Column name. 223 | 224 | Returns 225 | ------- 226 | str 227 | Description of the column. 228 | """ 229 | 230 | column_props = self._report["column_properties"][column] 231 | 232 | if column_props["is_categorical"]: 233 | return "categorical" 234 | elif column_props["numeric"]: 235 | return "numeric" 236 | elif column_props["is_ID"]: 237 | return "ID_like" 238 | else: 239 | return None 240 | 241 | def summary(self, column): 242 | """Basic information about the column 243 | 244 | This returns information about the number of nulls and unique 245 | values in ``column`` as well as which type this column is. 246 | This is guaranteed to return a dictionary with the same keys 247 | for every column. 248 | 249 | The dictionary contains the following keys: 250 | 251 | ``desc`` 252 | the type of data: currently ``categorical`` or ``numeric``. 253 | Lens will calculate different quantities for this column 254 | depending on the value of ``desc``. 255 | 256 | ``dtype`` 257 | the type of data in Pandas. 258 | 259 | ``name`` 260 | column name 261 | 262 | ``notnulls`` 263 | number of non-null values in the column 264 | 265 | ``nulls`` 266 | number of null-values in the column 267 | 268 | ``unique`` 269 | number of unique values in the column 270 | 271 | 272 | Examples 273 | -------- 274 | 275 | >>> summary.summary('quality') 276 | {'desc': 'categorical', 277 | 'dtype': 'int64', 278 | 'name': 'quality', 279 | 'notnulls': 4898, 280 | 'nulls': 0, 281 | 'unique': 7} 282 | 283 | >>> summary.summary('chlorides') 284 | {'desc': 'numeric', 285 | 'dtype': 'float64', 286 | 'name': 'chlorides', 287 | 'notnulls': 4898, 288 | 'nulls': 0, 289 | 'unique': 160} 290 | 291 | Parameters 292 | ---------- 293 | column : str 294 | Column name 295 | 296 | Returns 297 | ------- 298 | dict 299 | Dictionary of summary information. 300 | """ 301 | if column not in self._report["_columns"]: 302 | raise LensSummaryError( 303 | "The data summary does not contain" 304 | " information about column `{}`.".format(column) 305 | ) 306 | 307 | column_props = self._report["column_properties"][column] 308 | 309 | summary = {"name": column, "desc": self._desc(column)} 310 | 311 | for key in ["nulls", "notnulls", "unique", "dtype"]: 312 | summary[key] = column_props[key] 313 | 314 | return summary 315 | 316 | def details(self, column): 317 | """Type-specific information for a column 318 | 319 | The `details` method returns additional information on ``column``, 320 | beyond that provided by the ``summary`` method. If ``column`` is 321 | numeric, this returns summary statistics. If it is categorical, 322 | it returns a dictionary of how often each category occurs. 
323 | 324 | Examples 325 | -------- 326 | 327 | >>> summary.details('alcohol') 328 | {'desc': 'numeric', 329 | 'iqr': 1.9000000000000004, 330 | 'max': 14.199999999999999, 331 | 'mean': 10.514267047774602, 332 | 'median': 10.4, 333 | 'min': 8.0, 334 | 'name': 'alcohol', 335 | 'std': 1.2306205677573181, 336 | 'sum': 51498.880000000005} 337 | 338 | >>> summary.details('quality') 339 | {'desc': 'categorical', 340 | 'frequencies': 341 | {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 342 | 'iqr': 1.0, 343 | 'max': 9, 344 | 'mean': 5.8779093507554103, 345 | 'median': 6.0, 346 | 'min': 3, 347 | 'name': 'quality', 348 | 'std': 0.88563857496783116, 349 | 'sum': 28790} 350 | 351 | Parameters 352 | ---------- 353 | column : str 354 | Column name 355 | 356 | Returns 357 | ------- 358 | dict 359 | Dictionary of detailed information. 360 | """ 361 | if column not in self._report["_columns"]: 362 | raise LensSummaryError( 363 | "The data summary does not contain" 364 | " information about column `{}`.".format(column) 365 | ) 366 | 367 | column_props = self._report["column_properties"][column] 368 | 369 | details = {"name": column, "desc": self._desc(column)} 370 | 371 | if column_props["is_categorical"]: 372 | details["frequencies"] = self._report["frequencies"][column] 373 | 374 | if column_props["numeric"]: 375 | column_summ = self._report["column_summary"][column] 376 | for k in ["min", "max", "mean", "median", "std", "sum", "iqr"]: 377 | details[k] = column_summ[k] 378 | return details 379 | 380 | def pair_details(self, first, second): 381 | """Get pairwise information for a column pair. 382 | 383 | The information returned depends on the types of the two columns. 384 | It may contain the following keys. 385 | 386 | correlation 387 | dictionary with the Spearman rank correlation 388 | coefficient and Pearson product-moment correlation coefficient 389 | between the columns. This is returned when both columns are 390 | numeric. 391 | 392 | pairdensity 393 | dictionary with an estimate of the pairwise 394 | density between the columns. The density is either 395 | a 2D KDE estimate if both columns are numerical, or 396 | several 1D KDE estimates if one of the columns is categorical 397 | and the other numerical (grouped by the categorical column) 398 | or a cross-tabuluation. 399 | 400 | Examples 401 | -------- 402 | 403 | >>> summary.pair_details('chlorides', 'quality') 404 | {'correlation': { 405 | 'pearson': -0.20993441094675602, 406 | 'spearman': -0.31448847828244203}, 407 | {'pairdensity': { 408 | 'density': <2d numpy array> 409 | 'x': <1d numpy array of x-values> 410 | 'y': <1d numpy array of y-values> 411 | 'x_scale': 'linear', 412 | 'y_scale': 'cat'} 413 | } 414 | 415 | >>> summary.pair_details('alcohol', 'chlorides') 416 | {'correlation': { 417 | 'pearson': -0.36018871210816106, 418 | 'spearman': -0.5708064071153713}, 419 | {'pairdensity': { 420 | 'density': <2d numpy array> 421 | 'x': <1d numpy array of x-values> 422 | 'y': <1d numpy array of y-values> 423 | 'x_scale': 'linear', 424 | 'y_scale': 'linear'} 425 | } 426 | 427 | Parameters 428 | ---------- 429 | first : str 430 | Name of the first column. 431 | second : str 432 | Name of the second column. 433 | 434 | Returns 435 | ------- 436 | dict 437 | Dictionary of pairwise information. 
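A sketch of consuming the pair_details output directly, again assuming the wine-quality summary; both columns here are numeric, so the density entry is a 2D KDE grid:

    import numpy as np
    import matplotlib.pyplot as plt

    details = summary.pair_details("alcohol", "chlorides")
    print(details["correlation"])         # {'pearson': ..., 'spearman': ...}

    density = np.asarray(details["pairdensity"]["density"])
    fig, ax = plt.subplots()
    ax.imshow(density, origin="lower", aspect="auto")  # rows follow y, columns follow x
    ax.set_xlabel("alcohol")
    ax.set_ylabel("chlorides")
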
438 | """ 439 | if first == second: 440 | raise ValueError( 441 | "Can only return the pair details of two different columns: " 442 | "received {} twice.".format(first) 443 | ) 444 | 445 | pair_details = {} 446 | 447 | # Correlation 448 | 449 | corr_report = self._report["correlation"] 450 | try: 451 | idx = [ 452 | corr_report["_columns"].index(col) for col in [first, second] 453 | ] 454 | except ValueError as e: 455 | logger.info( 456 | "No correlation information for column `{}`".format( 457 | e.args[0].split()[0] 458 | ) 459 | ) 460 | else: 461 | correlation = { 462 | k: corr_report[k][idx[0]][idx[1]] 463 | for k in ["spearman", "pearson"] 464 | } 465 | pair_details["correlation"] = correlation 466 | 467 | # Pair density / Crosstab 468 | 469 | pairdensity_report = self._report["pairdensity"] 470 | 471 | # We store pairdensity information for both first/second and 472 | # second/first in a single key in the report, so we check for both 473 | # report[first][second] and report[second][first] to find it and 474 | # transpose if necessary. 475 | try: 476 | pairdensity = pairdensity_report[first][second] 477 | scales = pairdensity["scales"] 478 | density = np.array(pairdensity["density"]) 479 | except KeyError: 480 | try: 481 | pairdensity = pairdensity_report[second][first] 482 | # Invert scale information and transpose matrix 483 | scales = pairdensity["scales"][::-1] 484 | density = np.array(pairdensity["density"]).T 485 | except KeyError: 486 | logger.info( 487 | "No pairdensity information for columns `{}`" 488 | " and `{}`".format(first, second) 489 | ) 490 | pairdensity = None 491 | 492 | if pairdensity is not None: 493 | pairdensity = { 494 | "density": density, 495 | "x": pairdensity["axes"][first], 496 | "y": pairdensity["axes"][second], 497 | "x_scale": scales[0], 498 | "y_scale": scales[1], 499 | } 500 | 501 | pair_details["pairdensity"] = pairdensity 502 | 503 | return pair_details 504 | 505 | def histogram(self, column): 506 | """ 507 | Return the histogram for `column`. 508 | 509 | This function returns a histogram for the column. The number of bins is 510 | estimated through the Freedman-Diaconis rule. 511 | 512 | Parameters 513 | ---------- 514 | 515 | column: str 516 | Name of the column 517 | 518 | Returns 519 | ------- 520 | 521 | counts: array 522 | Counts for each of the bins of the histogram. 523 | bin_edges : array 524 | Edges of the bins in the histogram. Length is ``length(counts)+1``. 525 | """ 526 | self._check_column_name(column) 527 | try: 528 | histogram = self._report["column_summary"][column]["histogram"] 529 | except KeyError: 530 | raise ValueError("{} is not a numeric column".format(column)) 531 | 532 | return [np.array(histogram[key]) for key in ["counts", "bin_edges"]] 533 | 534 | def kde(self, column): 535 | """ 536 | Return a Kernel Density Estimate for `column`. 537 | 538 | This function returns a KDE for the column. It is computed between the 539 | minimum and maximum values of the column and uses Scott's rule to 540 | compute the bandwith. 541 | 542 | Parameters 543 | ---------- 544 | 545 | column: str 546 | Name of the column 547 | 548 | Returns 549 | ------- 550 | 551 | x: array 552 | Values at which the KDE has been evaluated. 553 | y : array 554 | Values of the KDE. 
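The precomputed histogram and KDE can be pulled out and plotted without the helpers in lens.plotting; a short sketch for a numeric column, with the column name taken from the wine example:

    import matplotlib.pyplot as plt

    counts, edges = summary.histogram("chlorides")   # len(edges) == len(counts) + 1
    x, y = summary.kde("chlorides")                  # KDE evaluated on a fixed grid

    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_xlabel("chlorides")
    ax.set_ylabel("estimated density")
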
555 | """ 556 | self._check_column_name(column) 557 | try: 558 | kde = self._report["column_summary"][column]["kde"] 559 | except KeyError: 560 | raise ValueError("{} is not a numeric column".format(column)) 561 | 562 | return [np.array(kde[key]) for key in ["x", "y"]] 563 | 564 | def _tdigest_report(self, column): 565 | """ Return the list of tdigest centroids and means from report 566 | """ 567 | self._check_column_name(column) 568 | try: 569 | tdigest_list = self._report["column_summary"][column]["tdigest"] 570 | except KeyError: 571 | raise ValueError("{} is not a numeric column".format(column)) 572 | return tdigest_list 573 | 574 | def tdigest_centroids(self, column): 575 | """Get TDigest centroids and counts for column. 576 | 577 | Parameters 578 | ---------- 579 | column : str 580 | Name of the column. 581 | 582 | Returns 583 | ------- 584 | :class:`numpy.array` 585 | Means of the TDigest centroids. 586 | :class:`numpy.array` 587 | Counts for each of the TDigest centroids. 588 | """ 589 | 590 | tdigest_list = self._tdigest_report(column) 591 | xs, counts = zip(*tdigest_list) 592 | return np.array(xs), np.array(counts) 593 | 594 | def pdf(self, column): 595 | """ Approximate pdf for `column` 596 | 597 | This returns a function representing the pdf of a numeric column. 598 | 599 | Examples 600 | -------- 601 | 602 | >>> pdf = summary.pdf('chlorides') 603 | >>> min_value = summary.details('chlorides')['min'] 604 | >>> max_value = summary.details('chlorides')['max'] 605 | >>> xs = np.linspace(min_value, max_value, 200) 606 | >>> plt.plot(xs, pdf(xs)) 607 | 608 | Parameters 609 | ---------- 610 | 611 | column : str 612 | Name of the column. 613 | 614 | Returns 615 | ------- 616 | pdf: function 617 | Function representing the pdf. 618 | """ 619 | xs, counts = self.tdigest_centroids(column) 620 | return scipy.interpolate.interp1d(xs, counts) 621 | 622 | def tdigest(self, column): 623 | """Return a TDigest object approximating the distribution of a column 624 | 625 | Documentation for the TDigest class can be found at 626 | https://github.com/CamDavidsonPilon/tdigest. 627 | 628 | Parameters 629 | ---------- 630 | column : str 631 | Name of the column. 632 | 633 | Returns 634 | ------- 635 | :class:`tdigest.TDigest` 636 | TDigest instance computed from the values of the column. 637 | """ 638 | return tdigest_from_centroids(self._tdigest_report(column)) 639 | 640 | def cdf(self, column): 641 | """ Approximate cdf for `column` 642 | 643 | This returns a function representing the cdf of a numeric column. 644 | 645 | Examples 646 | -------- 647 | 648 | >>> cdf = summary.cdf('chlorides') 649 | >>> min_value = summary.details('chlorides')['min'] 650 | >>> max_value = summary.details('chlorides')['max'] 651 | >>> xs = np.linspace(min_value, max_value, 200) 652 | >>> plt.plot(xs, cdf(xs)) 653 | 654 | Parameters 655 | ---------- 656 | 657 | column : str 658 | Name of the column. 659 | 660 | Returns 661 | ------- 662 | cdf: function 663 | Function representing the cdf. 664 | """ 665 | tdigest = self.tdigest(column) 666 | return tdigest.cdf 667 | 668 | def correlation_matrix(self, include=None, exclude=None): 669 | """ Correlation matrix for numeric columns 670 | 671 | Parameters 672 | ---------- 673 | 674 | include: list of strings, optional 675 | List of numeric columns to include. Includes all columns 676 | by default. 677 | 678 | exclude: list of strings, optional 679 | List of numeric columns to exclude. Includes all columns 680 | by default. 
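The centroid representation returned by tdigest_centroids is enough to re-bin the distribution at any resolution, which is what plot_distribution does when an explicit `bins` argument is passed; a short sketch for a numeric column:

    import numpy as np

    xs, weights = summary.tdigest_centroids("chlorides")
    counts, edges = np.histogram(xs, weights=weights, bins=25)
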
681 | 682 | Returns 683 | ------- 684 | 685 | columns: list of strings 686 | List of column names 687 | 688 | correlation_matrix: 2D array of floats 689 | The correlation matrix, ordered such that 690 | ``correlation_matrix[i, j]`` is the correlation between 691 | ``columns[i]`` and ``columns[j]`` 692 | 693 | Notes 694 | ----- 695 | 696 | The columns are ordered through hierarchical clustering. Thus, 697 | neighbouring columns in the output will be more correlated. 698 | """ 699 | if include is not None and exclude is not None: 700 | raise ValueError( 701 | "Either 'include' or 'exclude' should be defined, " 702 | "but not both" 703 | ) 704 | 705 | available_columns = self._report["correlation"]["_columns"] 706 | if include is not None: 707 | non_numeric_includes = set(include) - set(available_columns) 708 | if non_numeric_includes: 709 | raise ValueError( 710 | "Only numeric columns can be included in the " 711 | "correlation plot. Columns {} are not " 712 | "numeric".format(non_numeric_includes) 713 | ) 714 | columns = include 715 | elif exclude is not None: 716 | columns = set(available_columns) - set(exclude) 717 | else: 718 | columns = available_columns 719 | columns = list(columns) 720 | 721 | # Filter the correlation matrix to select only the above columns 722 | correlation_report = self._report["correlation"] 723 | idx = [correlation_report["_columns"].index(col) for col in columns] 724 | correlation_matrix = np.array(correlation_report["spearman"])[idx][ 725 | :, idx 726 | ] 727 | 728 | return hierarchical_ordering(columns, correlation_matrix) 729 | 730 | def _check_column_name(self, column): 731 | if column not in self.columns: 732 | raise KeyError(column) 733 | 734 | 735 | def summarise( 736 | df, 737 | scheduler="multiprocessing", 738 | num_workers=None, 739 | size=None, 740 | pairdensities=True, 741 | ): 742 | """Create a Lens Summary for a Pandas DataFrame. 743 | 744 | This creates a :class:`~lens.Summary` instance containing 745 | many quantities of interest to a data scientist. 746 | 747 | Examples 748 | -------- 749 | 750 | Let's explore the wine quality dataset. 751 | 752 | >>> import pandas as pd 753 | >>> import lens 754 | >>> url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" # noqa 755 | >>> wines_df = pd.read_csv(url, sep=';') 756 | >>> summary = lens.summarise(wines_df) 757 | 758 | Now that we have a :class:`~lens.Summary` instance we can inspect 759 | the shape of the dataset 760 | 761 | >>> summary.columns 762 | ['fixed acidity', 763 | 'volatile acidity', 764 | 'citric acid', 765 | 'residual sugar', 766 | 'chlorides', 767 | 'free sulfur dioxide', 768 | 'total sulfur dioxide', 769 | 'density', 770 | 'pH', 771 | 'sulphates', 772 | 'alcohol', 773 | 'quality'] 774 | >>> summary.rows 775 | 4898 776 | 777 | So far, nothing groundbreaking. Let's look at the ``quality`` column: 778 | 779 | >>> summary.summary('quality') 780 | {'desc': 'categorical', 781 | 'dtype': 'int64', 782 | 'name': 'quality', 783 | 'notnulls': 4898, 784 | 'nulls': 0, 785 | 'unique': 7} 786 | 787 | This tells us that there are seven unique values in the quality columns, 788 | and zero null values. It also tells us that lens will treat this 789 | column as categorical. 
Let's look at this in more details: 790 | 791 | >>> summary.details('quality') 792 | {'desc': 'categorical', 793 | 'frequencies': {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 794 | 'iqr': 1.0, 795 | 'max': 9, 796 | 'mean': 5.8779093507554103, 797 | 'median': 6.0, 798 | 'min': 3, 799 | 'name': 'quality', 800 | 'std': 0.88563857496783116, 801 | 'sum': 28790} 802 | 803 | This tells us that the median wine quality is 6 and the standard deviation 804 | is less than one. Let's now get the correlation between the ``quality`` 805 | column and the ``alcohol`` column: 806 | 807 | >>> summary.pair_detail('quality', 'alcohol')['correlation'] 808 | {'pearson': 0.4355747154613688, 'spearman': 0.4403691816246831} 809 | 810 | Thus, the Spearman Rank Correlation coefficient between these two columns 811 | is 0.44. 812 | 813 | Parameters 814 | ---------- 815 | df : pd.DataFrame 816 | DataFrame to be analysed. 817 | scheduler : str, optional 818 | Dask scheduler to use. Must be one of [distributed, multiprocessing, 819 | processes, single-threaded, sync, synchronous, threading, threads]. 820 | num_workers : int or None, optional 821 | Number of workers in the pool. If the environment variable `NUM_CPUS` 822 | is set that number will be used, otherwise it will use as many workers 823 | as CPUs available in the machine. 824 | size : int, optional 825 | DataFrame size on disk, which will be added to the report. 826 | pairdensities : bool, optional 827 | Whether to compute the pairdensity estimation between all pairs of 828 | numerical columns. For most datasets, this is the most expensive 829 | computation. Default is True. 830 | 831 | Returns 832 | ------- 833 | summary : :class:`~lens.Summary` 834 | The computed data summary. 835 | """ 836 | if not isinstance(df, pd.DataFrame): 837 | raise TypeError("Can only summarise a Pandas DataFrame") 838 | 839 | if len(df.columns) == 0: 840 | raise EmptyDataFrameError("The DataFrame has no columns") 841 | 842 | if num_workers is None: 843 | try: 844 | num_workers = int(os.environ["NUM_CPUS"]) 845 | logger.debug( 846 | "Number of workers read from environment: {}".format( 847 | num_workers 848 | ) 849 | ) 850 | except ValueError: 851 | # Set to None if NUM_CPUS cannot be cast to an integer 852 | logger.warning( 853 | "Environment variable NUM_CPUS={} cannot be" 854 | " interpreted as an integer, defaulting to" 855 | " number of cores in system".format(os.environ.get("NUM_CPUS")) 856 | ) 857 | num_workers = None 858 | except KeyError: 859 | # NUM_CPUS not in environment 860 | num_workers = None 861 | 862 | kwargs = {"scheduler": scheduler} 863 | if num_workers is not None: 864 | kwargs["num_workers"] = num_workers 865 | 866 | tstart = time.time() 867 | report = create_dask_graph(df, pairdensities=pairdensities).compute( 868 | **kwargs 869 | ) 870 | report["_run_time"] = time.time() - tstart 871 | 872 | report["_lens_version"] = __version__ 873 | 874 | if size is not None: 875 | report["size"] = size 876 | 877 | return Summary(report) 878 | -------------------------------------------------------------------------------- /lens/tdigest_utils.py: -------------------------------------------------------------------------------- 1 | from tdigest.tdigest import TDigest, Centroid 2 | 3 | 4 | def tdigest_from_centroids(seq): 5 | """Create a TDigest from a list of centroid means and weights tuples 6 | 7 | Parameters 8 | ---------- 9 | 10 | seq : iterable 11 | List of tuples of length 2 that contain the centroid mean and weight 12 | from a TDigest. 
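A round-trip sketch for this helper together with centroids_from_tdigest defined just below; the sample values are arbitrary and the batch_update call assumes the tdigest package's usual API:

    from tdigest.tdigest import TDigest

    digest = TDigest()
    digest.batch_update([1.0, 2.0, 2.0, 3.0, 5.0])

    means, weights = centroids_from_tdigest(digest)
    clone = tdigest_from_centroids(zip(means, weights))
    assert abs(clone.percentile(50) - digest.percentile(50)) < 1e-9
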
13 | """ 14 | 15 | tdigest = TDigest() 16 | 17 | for mean, weight in seq: 18 | tdigest.C.insert(mean, Centroid(mean, weight)) 19 | tdigest.n += weight 20 | 21 | return tdigest 22 | 23 | 24 | def centroids_from_tdigest(tdigest): 25 | """Return centroid means and weights from a TDigest instance""" 26 | 27 | if not isinstance(tdigest, TDigest): 28 | raise ValueError("Argument must be a TDigest instance") 29 | 30 | means = [c.mean for c in tdigest.C.values()] 31 | counts = [c.count for c in tdigest.C.values()] 32 | 33 | return means, counts 34 | -------------------------------------------------------------------------------- /lens/utils.py: -------------------------------------------------------------------------------- 1 | """Plotting utils, mostly adapted from seaborn for use with TDigests.""" 2 | import numpy as np 3 | from scipy import stats 4 | from six import string_types 5 | 6 | import scipy.spatial.distance as distance 7 | import scipy.cluster.hierarchy as hierarchy 8 | 9 | 10 | def _kde_support(data, bw, gridsize, cut, clip): 11 | """Establish support for a kernel density estimate.""" 12 | support_min = max(data.min() - bw * cut, clip[0]) 13 | support_max = min(data.max() + bw * cut, clip[1]) 14 | return np.linspace(support_min, support_max, gridsize) 15 | 16 | 17 | def _scipy_univariate_kde(data, bw, gridsize, cut, clip): 18 | """Compute a univariate kernel density estimate using scipy.""" 19 | kde = stats.gaussian_kde(data, bw_method=bw) 20 | if isinstance(bw, string_types): 21 | bw = "scotts" if bw == "scott" else bw 22 | bw = getattr(kde, "%s_factor" % bw)() * np.std(data) 23 | grid = _kde_support(data, bw, gridsize, cut, clip) 24 | y = kde(grid) 25 | return grid, y 26 | 27 | 28 | def _scipy_bivariate_kde(x, y, bw, gridsize, cut, clip): 29 | """Compute a bivariate kde using scipy.""" 30 | data = np.c_[x, y] 31 | kde = stats.gaussian_kde(data.T) 32 | data_std = data.std(axis=0, ddof=1) 33 | if isinstance(bw, string_types): 34 | bw = "scotts" if bw == "scott" else bw 35 | bw_x = getattr(kde, "%s_factor" % bw)() * data_std[0] 36 | bw_y = getattr(kde, "%s_factor" % bw)() * data_std[1] 37 | elif np.isscalar(bw): 38 | bw_x, bw_y = bw, bw 39 | else: 40 | msg = ( 41 | "Cannot specify a different bandwidth for each dimension " 42 | "with the scipy backend. You should install statsmodels." 43 | ) 44 | raise ValueError(msg) 45 | x_support = _kde_support(data[:, 0], bw_x, gridsize, cut, clip[0]) 46 | y_support = _kde_support(data[:, 1], bw_y, gridsize, cut, clip[1]) 47 | xx, yy = np.meshgrid(x_support, y_support) 48 | z = kde([xx.ravel(), yy.ravel()]).reshape(xx.shape) 49 | return xx, yy, z 50 | 51 | 52 | def axis_ticklabels_overlap(labels): 53 | """Return a boolean for whether the list of ticklabels have overlaps. 54 | 55 | Parameters 56 | ---------- 57 | labels : list of ticklabels 58 | 59 | Returns 60 | ------- 61 | overlap : boolean 62 | True if any of the labels overlap. 63 | """ 64 | if not labels: 65 | return False 66 | try: 67 | bboxes = [l.get_window_extent() for l in labels] 68 | overlaps = [b.count_overlaps(bboxes) for b in bboxes] 69 | return max(overlaps) > 1 70 | except RuntimeError: 71 | # Issue on macosx backend rasies an error in the above code 72 | return False 73 | 74 | 75 | def hierarchical_ordering_indices(columns, correlation_matrix): 76 | """Return array with hierarchical cluster ordering of columns 77 | 78 | Parameters 79 | ---------- 80 | columns: iterable of str 81 | Names of columns. 
82 | correlation_matrix: np.ndarray 83 | Matrix of correlation coefficients between columns. 84 | 85 | Returns 86 | ------- 87 | indices: iterable of int 88 | Indices with order of columns 89 | """ 90 | if len(columns) > 2: 91 | pairwise_dists = distance.pdist( 92 | np.where(np.isnan(correlation_matrix), 0, correlation_matrix), 93 | metric="euclidean", 94 | ) 95 | linkage = hierarchy.linkage(pairwise_dists, method="average") 96 | dendogram = hierarchy.dendrogram( 97 | linkage, no_plot=True, color_threshold=-np.inf 98 | ) 99 | idx = dendogram["leaves"] 100 | else: 101 | idx = list(range(len(columns))) 102 | 103 | return idx 104 | 105 | 106 | def hierarchical_ordering(columns, correlation_matrix): 107 | """Reorder matrix by hierarchical clustering of columns 108 | 109 | Parameters 110 | ---------- 111 | columns: iterable of str 112 | Names of columns. 113 | correlation_matrix: np.ndarray 114 | Matrix of correlation coefficients between columns. 115 | 116 | Returns 117 | ------ 118 | columns: iterable of str 119 | Reorderd names of columns. 120 | correlation_matrix: np.ndarray 121 | Reordered matrix of correlation coefficients between columns. 122 | """ 123 | if len(columns) > 2: 124 | idx = hierarchical_ordering_indices(columns, correlation_matrix) 125 | correlation_matrix = correlation_matrix[idx, :][:, idx] 126 | columns = [columns[i] for i in idx] 127 | 128 | return columns, correlation_matrix 129 | -------------------------------------------------------------------------------- /lens/version.py: -------------------------------------------------------------------------------- 1 | """Lens version""" 2 | 3 | __version__ = "0.4.5.dev0" 4 | -------------------------------------------------------------------------------- /lens/widget.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import logging 4 | from ipywidgets import widgets 5 | from IPython.display import display 6 | from lens.plotting import ( 7 | plot_distribution, 8 | plot_cdf, 9 | plot_pairdensity_mpl, 10 | plot_correlation_mpl, 11 | ) 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.addHandler(logging.StreamHandler()) 15 | 16 | # Check whether we are in a notebook environment 17 | # this is a false positive if we are in the Jupyter console 18 | IN_NOTEBOOK = "ipykernel" in sys.modules 19 | 20 | PADDING = "10px" 21 | PLOT_HEIGHT = 400 22 | PLOT_WIDTH = 600 23 | DPI = 72 24 | 25 | 26 | def update_plot(f, args, plot_area, **kwargs): 27 | """Updates the content of an output widget with rendered function""" 28 | 29 | fig = f(*args) 30 | plot_area.clear_output() 31 | 32 | height = kwargs.get("height", PLOT_HEIGHT) 33 | width = kwargs.get("width", PLOT_WIDTH) 34 | dpi = kwargs.get("dpi", DPI) 35 | 36 | fig.set_size_inches(width / dpi, height / dpi) 37 | 38 | plot_area.layout.height = "{:.0f}px".format(height) 39 | plot_area.layout.width = "{:.0f}px".format(width) 40 | 41 | with plot_area: 42 | display(fig) 43 | 44 | 45 | def create_correlation_plot_widget(ls): 46 | """Return a widget with correlation plot. 47 | 48 | Parameters 49 | ---------- 50 | ls : :class:`~lens.Summary` 51 | Lens `Summary`. 52 | 53 | Returns 54 | ------- 55 | :class:`ipywidgets.Widget` 56 | Jupyter widget to explore correlation matrix plot. 
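The column reordering used by both correlation plots comes from hierarchical_ordering above; a small worked example with a made-up 3x3 correlation matrix:

    import numpy as np
    from lens.utils import hierarchical_ordering

    columns = ["a", "b", "c"]
    corr = np.array([[1.0, 0.1, 0.9],
                     [0.1, 1.0, 0.2],
                     [0.9, 0.2, 1.0]])

    ordered_columns, ordered_corr = hierarchical_ordering(columns, corr)
    # "a" and "c" end up adjacent because their correlation profiles are similar
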
57 | """ 58 | 59 | plot_area = widgets.Output() 60 | 61 | update_plot( 62 | plot_correlation_mpl, 63 | [ls], 64 | plot_area, 65 | height=PLOT_WIDTH, 66 | width=PLOT_WIDTH * 1.3, 67 | ) 68 | 69 | return plot_area 70 | 71 | 72 | def _update_pairdensity_plot(ls, dd1, dd2, plot_area): 73 | if dd1.value != dd2.value: 74 | update_plot( 75 | plot_pairdensity_mpl, 76 | [ls, dd1.value, dd2.value], 77 | plot_area, 78 | height=600, 79 | width=600, 80 | ) 81 | 82 | 83 | def create_pairdensity_plot_widget(ls): 84 | """Create a pairwise density widget. 85 | 86 | Parameters 87 | ---------- 88 | ls : :class:`~lens.Summary` 89 | Lens `Summary`. 90 | 91 | Returns 92 | ------- 93 | :class:`ipywidgets.Widget` 94 | Jupyter widget to explore pairdensity plots. 95 | """ 96 | numeric_columns = ls._report["column_summary"]["_columns"] 97 | dropdown1 = widgets.Dropdown(options=numeric_columns, description="First:") 98 | dropdown2 = widgets.Dropdown( 99 | options=numeric_columns, description="Second:" 100 | ) 101 | if len(numeric_columns) > 1: 102 | dropdown1.value, dropdown2.value = numeric_columns[:2] 103 | 104 | plot_area = widgets.Output() 105 | 106 | for dropdown in [dropdown1, dropdown2]: 107 | dropdown.observe( 108 | lambda x: _update_pairdensity_plot( 109 | ls, dropdown1, dropdown2, plot_area 110 | ), 111 | names="value", 112 | type="change", 113 | ) 114 | 115 | _update_pairdensity_plot(ls, dropdown1, dropdown2, plot_area) 116 | 117 | return widgets.VBox([dropdown1, dropdown2, plot_area], padding=PADDING) 118 | 119 | 120 | def _simple_columnwise_widget(ls, plot_function, columns): 121 | """Basic column-wise plot widget""" 122 | 123 | dropdown = widgets.Dropdown(options=columns, description="Column:") 124 | plot_area = widgets.Output() 125 | update_plot(plot_function, [ls, columns[0]], plot_area, height=PLOT_HEIGHT) 126 | 127 | dropdown.observe( 128 | lambda x: update_plot( 129 | plot_function, [ls, x["new"]], plot_area, height=PLOT_HEIGHT 130 | ), 131 | names="value", 132 | type="change", 133 | ) 134 | 135 | return widgets.VBox([dropdown, plot_area], padding=PADDING) 136 | 137 | 138 | def create_distribution_plot_widget(ls): 139 | """Create a distribution plot widget. 140 | 141 | Parameters 142 | ---------- 143 | ls : :class:`~lens.Summary` 144 | Lens `Summary`. 145 | 146 | Returns 147 | ------- 148 | :class:`ipywidgets.Widget` 149 | Jupyter widget to explore distribution plots. 150 | """ 151 | numeric_columns = ls._report["column_summary"]["_columns"] 152 | return _simple_columnwise_widget(ls, plot_distribution, numeric_columns) 153 | 154 | 155 | def create_cdf_plot_widget(ls): 156 | """Create a CDF plot widget. 157 | 158 | Parameters 159 | ---------- 160 | ls : :class:`~lens.Summary` 161 | Lens `Summary`. 162 | 163 | Returns 164 | ------- 165 | :class:`ipywidgets.Widget` 166 | Jupyter widget to explore CDF plots. 167 | """ 168 | numeric_columns = ls._report["column_summary"]["_columns"] 169 | return _simple_columnwise_widget(ls, plot_cdf, numeric_columns) 170 | 171 | 172 | def interactive_explore(ls): 173 | """Create a widget to visually explore a dataset summary. 174 | 175 | Note that the widget will only work when created within a Jupyter notebook. 176 | 177 | Parameters 178 | ---------- 179 | ls : :class:`~lens.Summary` 180 | Lens `Summary`. 181 | 182 | Returns 183 | ------- 184 | :class:`ipywidgets.Widget` 185 | Jupyter widget with summary plots. 
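Typical Jupyter-notebook usage for the explorer widget described above; this assumes interactive_explore is re-exported at the package level (otherwise import it from lens.widget) and that `df` is any pandas DataFrame:

    import lens

    summary = lens.summarise(df)
    lens.interactive_explore(summary)   # last expression in a cell renders the Tab widget
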
186 | """ 187 | if not IN_NOTEBOOK: 188 | message = ( 189 | "Lens interactive_explore can only be used in a" 190 | " Jupyter notebook" 191 | ) 192 | logger.error(message) 193 | raise ValueError(message) 194 | 195 | tabs = widgets.Tab() 196 | tabs.children = [ 197 | create_distribution_plot_widget(ls), 198 | create_cdf_plot_widget(ls), 199 | create_pairdensity_plot_widget(ls), 200 | create_correlation_plot_widget(ls), 201 | ] 202 | 203 | tabs.set_title(0, "Distribution") 204 | tabs.set_title(1, "CDF") 205 | tabs.set_title(2, "Pairwise density") 206 | tabs.set_title(3, "Correlation matrix") 207 | 208 | return tabs 209 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | exclude = ''' 4 | /( 5 | \.git 6 | | \.mypy_cache 7 | | \.tox 8 | | \.venv 9 | | _build 10 | | build 11 | | dist 12 | | \.eggs 13 | )/ 14 | ''' 15 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/environment.yml 3 | python: 4 | version: 3 5 | setup_py_install: true 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Placeholder file so that `pip install -r requirements.txt` uses dependencies 2 | # from setup.py 3 | . 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = LICENSE.txt 6 | 7 | [flake8] 8 | ignore = E203,W503 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from setuptools import setup 5 | 6 | 7 | def source_root_dir(): 8 | """Return the path to the root of the source distribution""" 9 | return os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def read_version(): 13 | """Read the version from the ``lens.version`` module""" 14 | filename = os.path.join(source_root_dir(), "lens/version.py") 15 | with open(filename) as fin: 16 | namespace = {} 17 | exec(fin.read(), namespace) # pylint: disable=exec-used 18 | return namespace["__version__"] 19 | 20 | 21 | with open("README.rst") as file: 22 | LONG_DESCRIPTION = file.read() 23 | 24 | setup( 25 | name="lens", 26 | version=read_version(), 27 | description="Summarise and explore Pandas DataFrames", 28 | copyright="Copyright 2017-2019, Faculty", 29 | license="Apache 2.0", 30 | url="https://github.com/facultyai/lens", 31 | author="Faculty", 32 | author_email="opensource@faculty.ai", 33 | packages=["lens"], 34 | zip_safe=False, 35 | long_description=LONG_DESCRIPTION, 36 | install_requires=[ 37 | "dask[dataframe,delayed]>=0.18.0", 38 | "ipywidgets>=6.0.0", 39 | "matplotlib", 40 | "numpy>=1.11", 41 | "pandas", 42 | "plotly>=3.0.0", 43 | "scipy", 44 | "tdigest>=0.5.0", 45 | "seaborn", 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import os 3 | import inspect 4 | 
import random 5 | import string 6 | 7 | from lens.dask_graph import create_dask_graph 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import stats 12 | import pytest 13 | 14 | np.random.seed(4712) 15 | 16 | dirname = os.path.dirname( 17 | os.path.abspath(inspect.getfile(inspect.currentframe())) 18 | ) 19 | 20 | 21 | @pytest.fixture(scope="module", params=[10, 60, 500, 2000]) 22 | def df(request): 23 | nrows = request.param 24 | n1 = np.random.randn(nrows) * 3 + 20.0 25 | n2 = np.random.randn(nrows) * 5 + 30.0 26 | poisson = np.random.poisson(10, nrows) 27 | items = [ 28 | ("normal", n1 + n2), 29 | ("normal2", n1 - n2), 30 | ("uniform", np.random.random(nrows)), 31 | ("lognormal", stats.lognorm.rvs(5, scale=10, size=nrows)), 32 | ("poisson", poisson), 33 | ("categorical13", gen_poisson_distributed_categorical_data(13, nrows)), 34 | ("categorical5", gen_uniformly_distributed_categorical_data(5, nrows)), 35 | ("categorical2", np.random.randint(0, 2, nrows)), 36 | ("categoricalint", gen_categoricalint_with_no_twos(nrows)), 37 | ("ID", ["ID{}".format(x) for x in range(int(1e3), int(1e3 + nrows))]), 38 | ("datetimes", gen_datetime_strings(nrows)), 39 | ("dates", gen_date_strings(nrows)), 40 | ("times", gen_time_strings(nrows)), 41 | ("nulls", [np.nan] * nrows), 42 | ] 43 | 44 | df = pd.DataFrame.from_dict(dict(items)) 45 | 46 | # sprinkle nrows/50 nulls 47 | ncols = len(df.columns) 48 | ii = np.random.randint(0, ncols, int((nrows * ncols) / 50)) 49 | jj = np.random.randint(0, nrows, int((nrows * ncols) / 50)) 50 | for i, j in zip(ii, jj): 51 | df.loc[j, list(df.columns)[i]] = None 52 | 53 | # No nulls in poissonint to avoid casting as floats 54 | df["poissonint"] = poisson 55 | # Add column that is strictly correlated with a float column 56 | df["normalcorr"] = n1 + n2 57 | # Add a column that has values where normal has nulls 58 | df["antinormal"] = np.where(df.normal.isnull(), n1 + n2, np.nan) 59 | 60 | df.to_csv(dirname + "/test_results/test_data.csv", index=False) 61 | 62 | return df 63 | 64 | 65 | def gen_categoricalint_with_no_twos(nrows): 66 | values = np.random.randint(0, 6, nrows) 67 | values[values == 2] = 5 68 | return values 69 | 70 | 71 | def gen_poisson_distributed_categorical_data(ncategories, size): 72 | categories = [ 73 | str(i) + "".join(random.sample(string.ascii_letters, 4)) 74 | for i in range(ncategories) 75 | ] 76 | random_samples = [ 77 | np.random.poisson(ncategories / 2.0) for i in range(size) 78 | ] 79 | truncated_random_samples = [ 80 | max(min(0, sample), ncategories - 1) for sample in random_samples 81 | ] 82 | sampled_categories = [ 83 | categories[sample] for sample in truncated_random_samples 84 | ] 85 | return sampled_categories 86 | 87 | 88 | def gen_uniformly_distributed_categorical_data(ncategories, size): 89 | categories = [ 90 | str(i) + "".join(random.sample(string.ascii_letters, 4)) 91 | for i in range(ncategories) 92 | ] 93 | random_samples = np.random.randint(0, len(categories), size=size) 94 | truncated_random_samples = [ 95 | max(min(0, sample), ncategories - 1) for sample in random_samples 96 | ] 97 | sampled_categories = [ 98 | categories[sample] for sample in truncated_random_samples 99 | ] 100 | return sampled_categories 101 | 102 | 103 | def gen_date_strings(size): 104 | datetimes = gen_datetimes(size) 105 | date_strings = [datetime.date().isoformat() for datetime in datetimes] 106 | return date_strings 107 | 108 | 109 | def gen_time_strings(size): 110 | datetimes = gen_datetimes(size) 111 | date_strings = 
[datetime.time().isoformat() for datetime in datetimes] 112 | return date_strings 113 | 114 | 115 | def gen_datetime_strings(size): 116 | datetimes = gen_datetimes(size) 117 | datetime_strings = [datetime.isoformat() for datetime in datetimes] 118 | return datetime_strings 119 | 120 | 121 | def gen_datetimes(size): 122 | timestamps = np.linspace(0, 86400 * 365 * 40, size) 123 | datetimes = [dt.datetime.fromtimestamp(ts) for ts in timestamps] 124 | return datetimes 125 | 126 | 127 | @pytest.fixture(scope="module") 128 | def report(df): 129 | # Get a dict report by not calling summarise 130 | report = create_dask_graph(df).compute(scheduler="multiprocessing") 131 | 132 | return report 133 | -------------------------------------------------------------------------------- /tests/data/test-artworks.csv: -------------------------------------------------------------------------------- 1 | Artist,Nationality,Gender,Date,Classification,Width (cm),Height (cm),Diameter (cm),Depth (cm) 2 | Otto Wagner,(Austrian),(Male),1896,Architecture,168.9,48.6,, 3 | Christian de Portzamparc,(French),(Male),1987,Architecture,29.8451,40.6401,, 4 | Emil Hoppe,(Austrian),(Male),1903,Architecture,31.8,34.3,, 5 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 6 | Emil Hoppe,(Austrian),(Male),1903,Architecture,19.1,38.4,, 7 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 8 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 9 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 10 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 11 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 12 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 13 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 14 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 15 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 16 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 17 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 18 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 19 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 20 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 21 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 22 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 23 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 24 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 25 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 26 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 27 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 28 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 29 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 30 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 31 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 32 | Louis I. 
Kahn,(American),(Male),1968,Architecture,167.6,113.0,, 33 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 34 | Marcel Kammerer,(Austrian),(Male),1900,Architecture,31.4,47.9,, 35 | Bernard Tschumi,(),(Male),1978,Architecture,817.8816,60.9601,, 36 | Otto Schönthal,(Austrian),(Male),1905,Architecture,21.6,30.5,, 37 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 38 | Otto Schönthal,(Austrian),(Male),1906,Architecture,35.8,29.6,, 39 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 40 | Bernard Tschumi,(),(Male),1979,Architecture,60.9601,121.9202,, 41 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 42 | Bernard Tschumi,(),(Male),1980,Architecture,61.0,121.9,, 43 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 44 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 45 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 46 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 47 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 48 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 49 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 50 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 51 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.2,, 52 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 53 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 54 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 55 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 56 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 57 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 58 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 59 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 60 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 61 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 62 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 63 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 64 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 65 | Hans Poelzig,(German),(Male),1918,Architecture,41.0,37.1,, 66 | Raimund Abraham,(American),(Male),1970,Architecture,135.8903,87.3127,, 67 | "Peter Eisenman, Robert Cole",(American) (),(Male) (Male),1975,Architecture,113.3477,34.9251,, 68 | "Rem Koolhaas, Madelon Vriesendorp",(Dutch) (Dutch),(Male) (Female),1987,Architecture,99.0602,63.5001,, 69 | Roger C. Ferri,(American),(Male),1979,Architecture,110.8,141.6,, 70 | Bernard Tschumi,(),(Male),1984,Architecture,94.7739,94.4564,, 71 | Roger C. Ferri,(American),(Male),1979,Architecture,110.8077,141.6053,, 72 | Bernard Tschumi,(),(Male),1986,Architecture,99.6952,69.2151,, 73 | Roger C. Ferri,(American),(Male),1979,Architecture,113.3477,112.7127,, 74 | Michael Graves,(American),(Male),1978,Architecture,30.2,30.2,, 75 | Michael Graves,(American),(Male),1978,Architecture,60.0,59.7,, 76 | Michael Graves,(American),(Male),1978,Architecture,27.3,27.6,, 77 | Michael Graves,(American),(Male),1978,Architecture,20.3,20.3,, 78 | "Aldo Rossi, Gianni Braghieri, M. Bosshard",(Italian) (Italian) (Italian),(Male) (Male) (Male),1974,Architecture,91.4,72.4,, 79 | Ludwig Mies van der Rohe,(American),(Male),n.d.,Mies van der Rohe Archive,,,, 80 | Erik Gunnar Asplund,(Swedish),(Male),c. 
1917,Architecture,34.9,23.2,, 81 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,61.9,31.1,, 82 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,61.6,31.8,, 83 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,34.0,39.4,, 84 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,62.5,57.2,, 85 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,62.2,56.5,, 86 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,79.4,56.8,, 87 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,79.1,57.2,, 88 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,55.2,47.9,, 89 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,55.5,47.9,, 90 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,29.8,33.0,, 91 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,26.7,21.0,, 92 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.3,91.4,, 93 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.0,90.8,, 94 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.7,90.2,, 95 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,90.8,90.8,, 96 | Erik Gunnar Asplund,(Swedish),(Male),1930,Architecture,33.0,52.7,, 97 | Erik Gunnar Asplund,(Swedish),(Male),1936,Architecture,70.5,30.5,, 98 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,51.1,39.4,, 99 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.5,, 100 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.8,30.8,, 101 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.8,, 102 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.3,30.8,, 103 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.8,30.8,, 104 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.5,, 105 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,62.9,26.0,, 106 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,46.7,19.7,, 107 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,41.9,41.9,, 108 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,101.6,65.4,, 109 | "Erik Gunnar Asplund, Sigurd Lewerentz",(Swedish) (Swedish),(Male) (Male),1937,Architecture,96.2,41.3,, 110 | Erik Gunnar Asplund,(Swedish),(Male),1937,Architecture,83.8,29.5,, 111 | Erik Gunnar Asplund,(Swedish),(Male),1937,Architecture,158.8,49.2,, 112 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,95.3,37.5,,1.3 113 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,95.9,37.5,,1.3 114 | "Paul Nelson, Oscar Nitzchke, Frantz Jourdain",(American) (American) (French),(Male) (Male) (Male),1938,Architecture,71.1,71.1,,1.3 115 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,127.6,71.0,, 116 | Steven Holl,(American),(Male),1977,Architecture,30.2,33.3,, 117 | Steven Holl,(American),(Male),1977,Architecture,55.9,34.3,, 118 | Erich Mendelsohn,(American),(Male),1935,Architecture,29.2,35.6,, 119 | Steven Holl,(American),(Male),1977,Architecture,75.6,55.9,, 120 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,35.9,21.6,, 121 | Paolo Soleri,(American),(Male),1958,Architecture,150.2,55.2,, 122 | Steven Holl,(American),(Male),1980,Architecture,33.0,8.3,, 123 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,35.9,20.3,, 124 | Steven Holl,(American),(Male),1984,Architecture,103.5,52.1,, 125 | Erich 
Mendelsohn,(American),(Male),Unknown,Architecture,17.8,34.9,, 126 | Steven Holl,(American),(Male),1985,Architecture,107.9502,52.0701,, 127 | Erich Mendelsohn,(American),(Male),1936,Architecture,13.6525,21.59,, 128 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 129 | Erich Mendelsohn,(American),(Male),1936,Architecture,35.8776,27.3051,, 130 | Steven Holl,(American),(Male),1986,Architecture,56.8,75.6,, 131 | Erich Mendelsohn,(American),(Male),1936,Architecture,29.2,36.2,, 132 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 133 | Erich Mendelsohn,(American),(Male),1936,Architecture,29.2101,36.8301,, 134 | Steven Holl,(American),(Male),1986,Architecture,56.5,76.2,, 135 | Erich Mendelsohn,(American),(Male),1936,Architecture,40.6401,21.9075,, 136 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 137 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,28.7338,34.2901,, 138 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 139 | Erich Mendelsohn,(American),(Male),1936,Architecture,24.1,35.2,, 140 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 141 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,36.8,30.2,, 142 | Paul Rudolph,(American),(Male),1989,Architecture,164.5,37.1,, 143 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,37.5,29.8,, 144 | Paul Rudolph,(American),(Male),1949,Architecture,47.9,64.8,, 145 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,29.2,41.9,, 146 | Paul Rudolph,(American),(Male),1958–1964,Architecture,87.0,69.2,, 147 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,28.5,37.5,, 148 | Erich Mendelsohn,(American),(Male),c. 1935,Architecture,25.2,21.0,, 149 | Erich Mendelsohn,(American),(Male),1935,Architecture,35.6,12.7,, 150 | Erich Mendelsohn,(American),(Male),1935,Architecture,38.0,64.0,, 151 | Erich Mendelsohn,(American),(Male),1935,Architecture,60.9601,54.6101,, 152 | Peter Cook,(British),(Male),1979,Architecture,87.0,27.9,, 153 | "Diller + Scofidio, Elizabeth Diller, Ricardo Scofidio",(American) (American) (American),() (Female) (Male),1989,Architecture,92.7,121.0,,3.8 154 | Zaha Hadid,(British),(Female),1991,Architecture,182.8804,129.5403,, 155 | Fumihiko Maki,(Japanese),(Male),1980,Architecture,14.6,20.3,, 156 | "Rem Koolhaas, Zoe Zenghelis, Elia Zenghelis, Madelon Vriesendorp",(Dutch) (British) (British) (Dutch),(Male) (Female) (Female) (Female),1975,Architecture,68.6,113.0,, 157 | Frank Lloyd Wright,(American),(Male),1941,Architecture,51.1176,78.1052,, 158 | Frank Lloyd Wright,(American),(Male),1941,Architecture,78.5,53.0,, 159 | Frank Lloyd Wright,(American),(Male),1941,Architecture,48.8951,43.1,, 160 | Frank Lloyd Wright,(American),(Male),1941,Architecture,78.8,52.7051,, 161 | Mario Bellini,(Italian),(Male),1987,Architecture,77.5,49.5,, 162 | "Venturi and Rauch, Robert Venturi, John Rauch",(American) (American) (American),() (Male) (Male),1965,Architecture,91.4,52.1,, 163 | "Venturi and Rauch, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1978,Architecture,59.6901,30.4801,, 164 | "Venturi, Rauch and Scott Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1981,Architecture,38.4176,30.4801,, 165 | "Venturi, Rauch and Scott Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1981,Architecture,34.2901,30.4801,, 166 | "Venturi, Rauch and Scott 
Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1983,Architecture,71.1201,30.4801,, 167 | Emilio Ambasz,(Argentine),(Male),1975,Architecture,95.6,95.3,, 168 | Raimund Abraham,(American),(Male),1979,Architecture,83.1852,69.2151,, 169 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,400.3683,105.0927,, 170 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,44.5,30.8,, 171 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,32.3851,30.4801,, 172 | Tadao Ando,(Japanese),(Male),c. 1989-91,Architecture,84.1,29.5,, 173 | Arata Isozaki,(Japanese),(Male),1992,Architecture,102.2,62.2,, 174 | Arata Isozaki,(Japanese),(Male),1992,Architecture,94.9327,54.9276,, 175 | Arata Isozaki,(Japanese),(Male),1992,Architecture,102.2,62.2,, 176 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.0,65.0,, 177 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.1,65.1,, 178 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.1,65.1,, 179 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.0,65.0,, 180 | "Theo van Doesburg (Christian Emil Marie Küpper), Cornelis van Eesteren",(Dutch) (Dutch),(Male) (Male),1923,Architecture,57.2,57.2,, 181 | Arata Isozaki,(Japanese),(Male),1992,Architecture,84.1,30.5,, 182 | Arata Isozaki,(Japanese),(Male),1992,Architecture,84.5,30.5,, 183 | Arata Isozaki,(Japanese),(Male),1992,Architecture,49.5,30.5,, 184 | Arata Isozaki,(Japanese),(Male),1992,Architecture,49.9,30.5,, 185 | Arata Isozaki,(Japanese),(Male),1992,Architecture,85.1,30.5,, 186 | Bernard Tschumi,(),(Male),1983,Architecture,41.9101,29.5276,, 187 | Frank Lloyd Wright,(American),(Male),1915-17,Architecture,,,, 188 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 189 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.9,27.9,, 190 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,28.3,21.6,, 191 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,27.9,21.9,, 192 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 193 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 194 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.9,28.3,, 195 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 196 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 197 | "Roberto Burle Marx, Oscar Niemeyer",(Brazilian) (Brazilian),(Male) (Male),1953,Architecture,151.1,100.3,, 198 | "Roberto Burle Marx, Oscar Niemeyer",(Brazilian) (Brazilian),(Male) (Male),1953,Architecture,151.1,99.1,, 199 | Neil M. Denari,(American),(Male),1992,Architecture,84.0,62.2,, 200 | Ludwig Mies van der Rohe,(American),(Male),1910,Architecture,240.0,141.0,, 201 | David Jacob,(American),(Male),1970,Architecture,87.3127,71.1201,,87.3127 202 | -------------------------------------------------------------------------------- /tests/multivariate_kde.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import statsmodels.api as sm 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def _KDEMultivariate( 7 | values, bandwidth=None, grid_points=None, grid_shape=None, logtrans=None 8 | ): 9 | """Multivariate kernel density estimation.""" 10 | 11 | if values.shape[0] < 3: 12 | # Return zeroes if there are too few points to do anything 13 | # useful. 
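# (statsmodels' KDEMultivariate needs a few samples before it can estimate a
# sensible bandwidth, so an all-zero density is the safest stand-in here.)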
14 | return np.zeros(grid_shape) 15 | 16 | for i, lt in enumerate(logtrans): 17 | if lt: 18 | values[:, i] = np.log10(values[:, i]) 19 | grid_points[i] = np.log10(grid_points[i]) 20 | 21 | kernel = sm.nonparametric.KDEMultivariate( 22 | data=values, var_type="cc", bw=bandwidth 23 | ) 24 | 25 | pdf = np.reshape(kernel.pdf(grid_points), grid_shape) 26 | 27 | return pdf 28 | 29 | 30 | def plot_pd_difference(hist_pd, kde_pd, filename): 31 | max_dev = np.max(np.abs(kde_pd - hist_pd)) 32 | max_dev_s = "Max dev: {:.3g}".format(max_dev) 33 | 34 | norm = np.sum((hist_pd > 1e-3) + (kde_pd > 1e-3)) / 2.0 35 | mean_dev = np.sum(np.abs(kde_pd - hist_pd)) / norm 36 | mean_dev_s = "Mean dev: {:.3g}".format(mean_dev) 37 | 38 | corr = 1 - np.corrcoef(kde_pd, hist_pd)[0][1] 39 | corr_s = "1 - Corr: {:.3g}".format(corr) 40 | 41 | f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4)) 42 | 43 | ax1.pcolormesh(hist_pd, cmap="viridis") 44 | ax1.set_aspect("equal") 45 | ax1.set_title("Smoothed Hist") 46 | 47 | ax2.pcolormesh(kde_pd, cmap="viridis") 48 | ax2.set_aspect("equal") 49 | ax2.set_title("2D KDE") 50 | 51 | diff = hist_pd - kde_pd 52 | 53 | diff_img = ax3.pcolormesh(diff * 100.0, cmap="RdBu", vmin=-10, vmax=10) 54 | ax3.set_aspect("equal") 55 | ax3.set_title("Difference") 56 | 57 | f.tight_layout() 58 | 59 | f.colorbar(diff_img, ax=[ax1, ax2, ax3], label="% difference") 60 | f.text(0.9, 0.5, "\n".join([max_dev_s, mean_dev_s, corr_s]), va="center") 61 | 62 | f.savefig(filename) 63 | plt.close(f) 64 | 65 | 66 | def _normalise_range(X): 67 | return X / np.max(X) 68 | 69 | 70 | def compute_deviation_with_kde(df, pd, filename): 71 | """ Compute mean deviation of smoothed histogram with respect to KDE """ 72 | columns = pd["_columns"] 73 | pd = pd[columns[0]][columns[1]] 74 | bw = pd["bw"] 75 | logtrans = [scale == "log" for scale in pd["scales"]] 76 | x = pd["axes"][columns[0]] 77 | y = pd["axes"][columns[1]] 78 | X, Y = np.meshgrid(x, y) 79 | grid_shape = X.shape 80 | grid_points = np.vstack([X.ravel(), Y.ravel()]) 81 | kde_pd = _KDEMultivariate( 82 | np.array(df.dropna()), 83 | bandwidth=bw, 84 | grid_points=grid_points, 85 | grid_shape=grid_shape, 86 | logtrans=logtrans, 87 | ) 88 | hist_pd = np.array(pd["density"]) 89 | 90 | # hist_pd[50] = hist_pd[50] + np.mean(hist_pd) * 0.1 91 | 92 | kde_pd = _normalise_range(kde_pd) 93 | hist_pd = _normalise_range(hist_pd) 94 | 95 | norm = np.sum((hist_pd > 1e-3) + (kde_pd > 1e-3)) / 2.0 96 | mean_dev = np.sum(np.abs(kde_pd - hist_pd)) / norm 97 | 98 | if mean_dev > 0.01: 99 | plot_pd_difference(hist_pd, kde_pd, filename) 100 | 101 | return mean_dev 102 | -------------------------------------------------------------------------------- /tests/test_explorer.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | import pytest 8 | 9 | import lens 10 | from lens.explorer import Explorer 11 | 12 | dirname = os.path.dirname( 13 | os.path.abspath(inspect.getfile(inspect.currentframe())) 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def artworks_df(): 19 | df = pd.read_csv(os.path.join(dirname, "data/test-artworks.csv")) 20 | return df 21 | 22 | 23 | @pytest.fixture(scope="module") 24 | def artworks_summary(artworks_df): 25 | summary = lens.summarise(artworks_df) 26 | return summary 27 | 28 | 29 | def test_distribution_plot(artworks_df, artworks_summary): 30 | def mock_render(fig): 31 | # check that this draws a histogram 32 | 
assert len(fig.axes[0].patches) > 0 33 | 34 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 35 | explorer.distribution_plot("Height (cm)") 36 | 37 | 38 | def test_distribution_plot_bins(artworks_df, artworks_summary): 39 | Nbins = 13 40 | 41 | def mock_render(fig): 42 | # check that this draws a histogram with Nbins bars 43 | assert len(fig.axes[0].patches) == Nbins 44 | 45 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 46 | explorer.distribution_plot("Height (cm)", bins=Nbins) 47 | 48 | 49 | def test_cdf_plot(artworks_df, artworks_summary): 50 | column = "Height (cm)" 51 | plt.cla() 52 | 53 | def mock_render(fig): 54 | ax = fig.axes[0] 55 | assert len(ax.lines) == 1 56 | line = ax.lines[0] 57 | 58 | tdigest = artworks_summary.tdigest(column) 59 | xs = [tdigest.percentile(p) for p in [0, 100]] 60 | 61 | assert line.get_xdata()[0] == xs[0] 62 | assert line.get_xdata()[-1] == xs[-1] 63 | assert line.get_ydata()[0] == 0 64 | assert line.get_ydata()[-1] == 100 65 | 66 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 67 | explorer.cdf_plot("Height (cm)") 68 | 69 | 70 | def test_cdf_plot_log_transformed(artworks_df, artworks_summary): 71 | plt.cla() 72 | 73 | def mock_render(fig): 74 | ax = fig.axes[0] 75 | assert len(ax.lines) == 1 76 | assert ax.get_xaxis().get_scale() == "log" 77 | 78 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 79 | explorer.cdf_plot("Width (cm)") 80 | 81 | 82 | def test_cdf_plot_non_numeric(artworks_summary): 83 | def mock_render(fig): 84 | assert False 85 | 86 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 87 | with pytest.raises(ValueError): 88 | explorer.cdf_plot("Nationality") 89 | 90 | 91 | def test_pairwise_density_plot(artworks_df, artworks_summary): 92 | plt.cla() 93 | 94 | def mock_render(fig): 95 | # currently pairwise_density_plot returns a plotly figure 96 | assert len(fig["data"]) == 1 97 | data = fig["data"][0] 98 | assert data["type"] == "heatmap" 99 | 100 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 101 | explorer.pairwise_density_plot("Width (cm)", "Height (cm)") 102 | 103 | 104 | def test_pairwise_density_plot_one_categorical(artworks_df, artworks_summary): 105 | plt.cla() 106 | 107 | def mock_render(fig): 108 | # currently pairwise_density_plot returns a plotly figure 109 | assert len(fig["data"]) == 1 110 | data = fig["data"][0] 111 | assert data["type"] == "heatmap" 112 | 113 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 114 | explorer.pairwise_density_plot("Nationality", "Height (cm)") 115 | explorer.pairwise_density_plot("Height (cm)", "Nationality") 116 | 117 | 118 | def test_pairwise_density_plot_both_categorical(artworks_df, artworks_summary): 119 | plt.cla() 120 | 121 | def mock_render(fig): 122 | # currently pairwise_density_plot returns a plotly figure 123 | assert len(fig["data"]) == 1 124 | data = fig["data"][0] 125 | assert data["type"] == "heatmap" 126 | 127 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 128 | explorer.pairwise_density_plot("Nationality", "Gender") 129 | 130 | 131 | def test_pairwise_density_plot_not_numeric(artworks_df, artworks_summary): 132 | plt.cla() 133 | 134 | def mock_render(fig): 135 | assert False 136 | 137 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 138 | with pytest.raises(ValueError): 139 | explorer.pairwise_density_plot("Diameter (cm)", "Nationality") 140 | 141 | 142 | def test_correlation_plot(artworks_df, artworks_summary): 143 
| plt.cla() 144 | 145 | def mock_render(fig): 146 | assert len(fig["data"]) == 1 147 | data = fig["data"][0] 148 | assert data["type"] == "heatmap" 149 | expected_columns = {"Height (cm)", "Width (cm)", "Depth (cm)"} 150 | assert set(data["y"]) == set(data["x"]) == expected_columns 151 | 152 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 153 | explorer.correlation_plot() 154 | 155 | 156 | def test_correlation_plot_annotations(artworks_df, artworks_summary): 157 | plt.cla() 158 | 159 | def mock_render(fig): 160 | assert len(fig["data"]) == 1 161 | corr = [item for row in fig["data"][0]["z"] for item in row] 162 | labels = [l["text"] for l in fig["layout"]["annotations"]] 163 | for c, l in zip(corr, labels): 164 | assert "{:.2g}".format(c) == l 165 | 166 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 167 | explorer.correlation_plot( 168 | include=["Height (cm)", "Width (cm)", "Depth (cm)"] 169 | ) 170 | explorer.correlation_plot(include=["Height (cm)", "Width (cm)"]) 171 | 172 | 173 | def test_correlation_plot_include(artworks_df, artworks_summary): 174 | plt.cla() 175 | 176 | def mock_render(fig): 177 | assert len(fig["data"]) == 1 178 | data = fig["data"][0] 179 | assert data["type"] == "heatmap" 180 | assert set(data["y"]) == set(data["x"]) == {"Height (cm)"} 181 | 182 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 183 | explorer.correlation_plot(include=["Height (cm)"]) 184 | 185 | 186 | def test_correlation_plot_exclude(artworks_df, artworks_summary): 187 | plt.cla() 188 | 189 | def mock_render(fig): 190 | assert len(fig["data"]) == 1 191 | data = fig["data"][0] 192 | assert data["type"] == "heatmap" 193 | expected_columns = {"Height (cm)", "Depth (cm)"} 194 | assert set(data["y"]) == set(data["x"]) == expected_columns 195 | 196 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 197 | explorer.correlation_plot(exclude=["Width (cm)"]) 198 | -------------------------------------------------------------------------------- /tests/test_summarise.py: -------------------------------------------------------------------------------- 1 | import json 2 | import inspect 3 | import os 4 | import itertools 5 | import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import pytest 11 | 12 | from lens import summarise, metrics, __version__ 13 | from lens.summarise import EmptyDataFrameError, NumpyEncoder 14 | from lens.dask_graph import _join_dask_results 15 | from lens.metrics import CAT_FRAC_THRESHOLD 16 | 17 | from multivariate_kde import compute_deviation_with_kde 18 | 19 | dirname = os.path.dirname( 20 | os.path.abspath(inspect.getfile(inspect.currentframe())) 21 | ) 22 | 23 | test_results_dir = dirname + "/test_results" 24 | 25 | if not os.path.exists(test_results_dir): 26 | os.mkdir(test_results_dir) 27 | 28 | 29 | def test_dask_row_count(df): 30 | rc_report = metrics.row_count(df) 31 | assert rc_report["total"] == len(df) 32 | assert rc_report["unique"] == len(df.drop_duplicates().index) 33 | 34 | # test serialization 35 | json.dumps({"row_count": rc_report}, cls=NumpyEncoder) 36 | 37 | 38 | def test_zero_rows_dataframe(): 39 | columns = sorted(["a", "b", "c", "d"]) 40 | df = pd.DataFrame(columns=columns) 41 | report = summarise(df)._report 42 | assert sorted(report["_columns"]) == columns 43 | for column in columns: 44 | props = report["column_properties"][column] 45 | assert props["nulls"] == 0 46 | assert props["notnulls"] == 0 47 | assert props["unique"] == 0 48 | 49 | 50 | def 
test_one_row_dataframe(): 51 | items = [ 52 | ("a", [1]), 53 | ("b", [-0.5]), 54 | ("c", ["hello"]), 55 | ("d", [datetime.datetime.now()]), 56 | ] 57 | columns = sorted([item[0] for item in items]) 58 | df = pd.DataFrame.from_dict(dict(items)) 59 | report = summarise(df)._report 60 | assert sorted(report["_columns"]) == columns 61 | column_properties = report["column_properties"] 62 | for column in columns: 63 | props = column_properties[column] 64 | assert props["nulls"] == 0 65 | assert props["notnulls"] == 1 66 | assert props["unique"] == 1 67 | assert column_properties["a"]["dtype"] == "int64" 68 | assert column_properties["b"]["dtype"] == "float64" 69 | assert column_properties["c"]["dtype"] == "object" 70 | assert column_properties["d"]["dtype"] == "datetime64[ns]" 71 | column_summary = report["column_summary"] 72 | assert column_summary["a"]["max"] == 1 73 | assert column_summary["a"]["min"] == 1 74 | assert column_summary["a"]["mean"] == 1.0 75 | assert column_summary["a"]["median"] == 1.0 76 | assert column_summary["a"]["iqr"] == 0.0 77 | 78 | assert column_summary["b"]["max"] == -0.5 79 | assert column_summary["b"]["min"] == -0.5 80 | assert column_summary["b"]["median"] == -0.5 81 | assert column_summary["b"]["mean"] == -0.5 82 | 83 | 84 | @pytest.fixture(scope="module") 85 | def column_properties(df): 86 | cols = df.columns 87 | cps = {col: metrics.column_properties(df[col]) for col in cols} 88 | 89 | return cps 90 | 91 | 92 | @pytest.fixture(scope="module") 93 | def column_summary(df, column_properties): 94 | cols = df.columns 95 | cs = { 96 | col: metrics.column_summary(df[col], column_properties[col]) 97 | for col in cols 98 | } 99 | return cs 100 | 101 | 102 | def test_dask_column_properties(column_properties): 103 | # Only worth checking that we determine categorical columns 104 | # correctly if there are enough rows in the dataframe. 105 | # There are 13 distinct categories. 
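# lens appears to flag a column as categorical when its distinct values are
# only a small fraction (CAT_FRAC_THRESHOLD) of its non-null rows, so the
# assertion below is only safe once ``notnulls`` comfortably exceeds
# 2 * 13 / CAT_FRAC_THRESHOLD.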
106 | categorical13_props = column_properties["categorical13"]["categorical13"] 107 | row_threshold = 2 * 13.0 / CAT_FRAC_THRESHOLD 108 | if categorical13_props["notnulls"] > row_threshold: 109 | assert categorical13_props["is_categorical"] 110 | 111 | # test serialization 112 | joined = _join_dask_results(column_properties.values()).compute() 113 | json.dumps({"column_summary": joined}, cls=NumpyEncoder) 114 | 115 | 116 | def test_dask_column_summary(df, column_summary): 117 | for col in df.columns: 118 | series = df[col] 119 | cs_report = column_summary[col] 120 | 121 | if cs_report is None or series.isnull().sum() == len(df.index): 122 | continue 123 | 124 | else: 125 | cs_report = cs_report[col] 126 | 127 | # Test that only lognormal is set to log transform 128 | # Only run this test if the column has enough valid 129 | # values 130 | if len(df.index) >= 50: 131 | if col == "lognormal": 132 | assert cs_report["logtrans"] 133 | else: 134 | assert not cs_report["logtrans"] 135 | 136 | _percs = list(cs_report["percentiles"].keys()) 137 | _percs.sort() 138 | cs_report_perc = [cs_report["percentiles"][p] for p in _percs] 139 | exact_perc = np.nanpercentile(series, _percs) 140 | np.testing.assert_allclose( 141 | cs_report_perc, exact_perc, rtol=1e-3, atol=1e-3 142 | ) 143 | 144 | exact_meanminmax = [ 145 | np.nanmean(series.get_values()), 146 | np.nanmin(series.get_values()), 147 | np.nanmax(series.get_values()), 148 | ] 149 | rep_meanminmax = [cs_report[x] for x in ["mean", "min", "max"]] 150 | np.testing.assert_allclose( 151 | exact_meanminmax, rep_meanminmax, rtol=1e-3, atol=0.01 152 | ) 153 | 154 | # test histogram 155 | histogram = cs_report["histogram"] 156 | 157 | assert np.sum(histogram["counts"]) == series.notnull().sum() 158 | if cs_report["n"] > 1 and not np.all(np.mod(series.dropna(), 1) == 0): 159 | # Bin edges for single-count histograms are not relevant, and 160 | # integer-only histograms not bounded by extremes in distribution 161 | assert np.allclose(histogram["bin_edges"][0], series.min()) 162 | assert np.allclose(histogram["bin_edges"][-1], series.max()) 163 | 164 | if col == "categoricalint": 165 | # Check that bins are set correctly for integers 166 | # we are removing the twos so there should be at least one empty 167 | # bin in the histogram 168 | n_unique = series.dropna().unique().size 169 | assert len(histogram["counts"]) >= n_unique 170 | assert len(histogram["bin_edges"]) == len(histogram["counts"]) + 1 171 | 172 | # Check that the bin that contains 2 is set to 0 173 | idx = np.where(np.array(histogram["bin_edges"]) < 2)[0][-1] 174 | assert histogram["counts"][idx] == 0 175 | 176 | assert np.allclose( 177 | histogram["bin_edges"][0], series.dropna().min() - 0.5 178 | ) 179 | assert np.allclose( 180 | histogram["bin_edges"][-1], series.dropna().max() + 0.5 181 | ) 182 | 183 | # test kde 184 | kde = cs_report["kde"] 185 | 186 | assert np.all(~np.isnan(kde["x"])) 187 | assert np.all(~np.isnan(kde["y"])) 188 | 189 | if "categorical" not in col and np.sum(kde["y"]) > 0: 190 | assert np.allclose(np.trapz(kde["y"], kde["x"]), 1) 191 | 192 | if col == "normal": 193 | mean = cs_report["mean"] 194 | kde_max = kde["x"][np.argmax(kde["y"])] 195 | assert np.allclose(kde_max, mean, atol=5, rtol=0.1) 196 | 197 | # test serialization 198 | joined = _join_dask_results(column_summary.values()).compute() 199 | json.dumps({"column_summary": joined}, cls=NumpyEncoder) 200 | 201 | 202 | def test_dask_outliers(df, column_summary): 203 | reps = [] 204 | for col in df.columns: 205 | 
reps.append(metrics.outliers(df[col], column_summary[col])) 206 | 207 | # test serialization 208 | joined = _join_dask_results(reps).compute() 209 | json.dumps({"outliers": joined}, cls=NumpyEncoder) 210 | 211 | 212 | @pytest.fixture(scope="module") 213 | def frequencies(df, column_properties): 214 | return { 215 | col: metrics.frequencies(df[col], column_properties[col]) 216 | for col in df.columns 217 | } 218 | 219 | 220 | def test_dask_frequencies(df, frequencies): 221 | for col in frequencies.keys(): 222 | freq_report = frequencies[col] 223 | if freq_report is None: 224 | continue 225 | else: 226 | freq_report = freq_report[col] 227 | 228 | freqs = df[col].value_counts().to_dict() 229 | 230 | for k in freqs.keys(): 231 | assert freqs[k] == freq_report[k] 232 | 233 | # test serialization 234 | joined = _join_dask_results(frequencies.values()).compute() 235 | json.dumps({"freqs": joined}, cls=NumpyEncoder) 236 | 237 | 238 | def test_dask_correlation(df, column_properties): 239 | cp = _join_dask_results(column_properties.values()).compute() 240 | rep = metrics.correlation(df, cp) 241 | cols = rep["_columns"] 242 | sp = np.array(rep["spearman"]) 243 | order = rep["order"] 244 | 245 | assert len(order) == len(cols) 246 | assert sp.shape[0] == len(cols) 247 | assert sp.shape[1] == len(cols) 248 | 249 | # test serialization 250 | json.dumps({"correlation": rep}, cls=NumpyEncoder) 251 | 252 | 253 | def test_dask_pairdensity(df, column_properties, column_summary, frequencies): 254 | pds = [] 255 | for col1, col2 in itertools.combinations(df.columns, 2): 256 | cp = {k: column_properties[k] for k in [col1, col2]} 257 | cs = {k: column_summary[k] for k in [col1, col2]} 258 | fr = {k: frequencies[k] for k in [col1, col2]} 259 | pd = metrics.pairdensity(df[[col1, col2]], cp, cs, fr) 260 | if pd is not None: 261 | if should_pair_density_norm_be_finite(df[[col1, col2]], cp): 262 | if ( 263 | not cp[col1][col1]["is_categorical"] 264 | and not cp[col2][col2]["is_categorical"] 265 | and "poisson" not in col1 266 | and "poisson" not in col2 267 | ): 268 | filename = "{}/{}_{}_{}_pd_diff.png".format( 269 | test_results_dir, len(df.index), col1, col2 270 | ) 271 | mean_dev = compute_deviation_with_kde( 272 | df[[col1, col2]], pd, filename 273 | ) 274 | assert mean_dev < 0.02 275 | assert ( 276 | np.sum(pd[col1][col2]["density"]) > 0 277 | ), "Failed on columns {} - {}".format(col1, col2) 278 | 279 | pds.append(pd) 280 | 281 | joined = _join_dask_results(pds).compute() 282 | 283 | # test serialization 284 | json.dumps({"pairdensity": joined}, cls=NumpyEncoder) 285 | 286 | 287 | def should_pair_density_norm_be_finite(df, column_properties): 288 | col1, col2 = df.columns 289 | valid_rows = df.dropna().index 290 | is_col1_categorical = column_properties[col1][col1]["is_categorical"] 291 | is_col2_categorical = column_properties[col2][col2]["is_categorical"] 292 | if is_col1_categorical and is_col2_categorical: 293 | return len(valid_rows) >= 1 294 | elif is_col1_categorical: 295 | n_distinct = column_properties[col1][col1]["unique"] 296 | return len(valid_rows) >= (n_distinct * 2) 297 | elif is_col2_categorical: 298 | n_distinct = column_properties[col2][col2]["unique"] 299 | return len(valid_rows) >= (n_distinct * 2) 300 | else: 301 | return len(valid_rows) >= 3 302 | 303 | 304 | def serialize_full_report(dreport, fname=None): 305 | # test that it can be serialized as json 306 | try: 307 | if fname is None: 308 | json.dumps(dreport, cls=NumpyEncoder) 309 | else: 310 | with open(fname, "w") as f: 311 | 
json.dump(dreport, f, indent=2) 312 | except TypeError: 313 | # Nail down which metric is failing 314 | for k in dreport.keys(): 315 | try: 316 | json.dumps({k: dreport[k]}, cls=NumpyEncoder) 317 | except TypeError as e: 318 | raise TypeError( 319 | "Metric {} is not JSON serializable: {}".format(k, e) 320 | ) 321 | 322 | 323 | def test_dask_compute_graph_default(report): 324 | fname = "{}/test_results/report_test_data.json".format(dirname) 325 | 326 | serialize_full_report(report, fname=fname) 327 | 328 | 329 | @pytest.mark.parametrize( 330 | "scheduler,num_workers,pairdensities", 331 | [ 332 | ("sync", None, True), 333 | ("multiprocessing", 2, True), 334 | ("threading", None, True), 335 | ("multiprocessing", 4, False), 336 | ], 337 | ) 338 | def test_dask_compute_graph(df, scheduler, num_workers, pairdensities): 339 | dreport = summarise( 340 | df, 341 | scheduler=scheduler, 342 | num_workers=num_workers, 343 | pairdensities=pairdensities, 344 | )._report 345 | fname = None 346 | if scheduler == "multiprocessing" and num_workers is None: 347 | fname = "{}/test_results/report_test_data_{}.json".format( 348 | dirname, "mp" 349 | ) 350 | assert dreport["_lens_version"] == __version__ 351 | if not pairdensities: 352 | assert dreport["pairdensity"] == {"_columns": [], "_run_time": 0.0} 353 | 354 | serialize_full_report(dreport, fname=fname) 355 | 356 | 357 | def test_empty_df(): 358 | empty_df = pd.DataFrame() 359 | with pytest.raises(EmptyDataFrameError): 360 | summarise(empty_df) 361 | 362 | 363 | @pytest.fixture 364 | def small_df(): 365 | N = 100 366 | df = pd.DataFrame.from_dict( 367 | {"foo": np.random.randn(N), "bar": np.random.randint(10, size=N)} 368 | ) 369 | return df 370 | 371 | 372 | def test_string_num_cpus_env(small_df, monkeypatch): 373 | monkeypatch.setenv("NUM_CPUS", "not-an-int") 374 | ls = summarise(small_df) 375 | assert set(ls._report["_columns"]) == set(small_df.columns) 376 | 377 | 378 | def test_int_num_cpus_env(small_df, monkeypatch): 379 | num_cpus_env = 2 380 | monkeypatch.setenv("NUM_CPUS", str(num_cpus_env)) 381 | ls = summarise(small_df) 382 | assert set(ls._report["_columns"]) == set(small_df.columns) 383 | -------------------------------------------------------------------------------- /tests/test_summary_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import inspect 3 | 4 | import pytest 5 | import numpy as np 6 | import numpy.testing 7 | import scipy.stats 8 | import pandas as pd 9 | import json 10 | 11 | from lens import Summary, summarise 12 | 13 | dirname = os.path.dirname( 14 | os.path.abspath(inspect.getfile(inspect.currentframe())) 15 | ) 16 | 17 | 18 | @pytest.fixture(scope="function") 19 | def ls(report): 20 | return Summary(report) 21 | 22 | 23 | # VZ: I have not managed to get the below test not to mutate the report 24 | # fixture, so subsequent tests fail if this is run. Disabling for now. 
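# A likely culprit: ``report.copy()`` in the disabled test is a shallow copy,
# so popping keys from the nested metric dicts still mutates the shared
# fixture. An untested sketch of a fix would be a deep copy instead:
#
#     import copy
#     r = copy.deepcopy(report)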
25 | # def test_report_validation(report): 26 | # # load it into the class 27 | # Summary(report) 28 | # r = report.copy() 29 | # 30 | # # Test that it fails on missing data 31 | # for metric in ['frequencies', 'column_summary', 'outliers']: 32 | # r[metric].pop(r[metric]['_columns'][0]) 33 | # with pytest.raises(LensSummaryError): 34 | # Summary(r) 35 | 36 | 37 | def test_columns_method(report, ls): 38 | assert set(ls.columns) == set(report["_columns"]) 39 | 40 | 41 | def test_row_count_method(report, ls): 42 | assert report["row_count"]["total"] == ls.rows 43 | assert report["row_count"]["unique"] == ls.rows_unique 44 | 45 | 46 | def test_summary_method(report, ls): 47 | for col in ls.columns: 48 | summary = ls.summary(col) 49 | assert summary["name"] == col 50 | for k in ["nulls", "notnulls", "unique", "dtype"]: 51 | assert summary[k] == report["column_properties"][col][k] 52 | 53 | assert (summary["desc"] == "categorical") == report[ 54 | "column_properties" 55 | ][col]["is_categorical"] 56 | 57 | assert (summary["desc"] == "numeric") == ( 58 | report["column_properties"][col]["numeric"] 59 | and not report["column_properties"][col]["is_categorical"] 60 | ) 61 | 62 | 63 | def test_numeric_details(report, ls): 64 | num_cols = [ 65 | col 66 | for col in ls.columns 67 | if report["column_properties"][col]["numeric"] 68 | ] 69 | 70 | metrics = ["min", "max", "mean", "median", "std", "sum"] 71 | 72 | for col in num_cols: 73 | details = ls.details(col) 74 | for m in metrics: 75 | if not np.isnan(details[m]): 76 | assert details[m] == report["column_summary"][col][m] 77 | 78 | 79 | def test_categorical_details(report, ls): 80 | cat_cols = [ 81 | col 82 | for col in ls.columns 83 | if report["column_properties"][col]["is_categorical"] 84 | ] 85 | 86 | for col in cat_cols: 87 | details = ls.details(col) 88 | for category in report["frequencies"][col].keys(): 89 | assert ( 90 | details["frequencies"][category] 91 | == report["frequencies"][col][category] 92 | ) 93 | 94 | 95 | def test_histogram(report, ls): 96 | num_cols = [ 97 | col 98 | for col in ls.columns 99 | if report["column_properties"][col]["numeric"] 100 | ] 101 | 102 | for col in num_cols: 103 | histogram = ls.histogram(col) 104 | for key, actual in zip(["counts", "bin_edges"], histogram): 105 | assert np.allclose( 106 | report["column_summary"][col]["histogram"][key], actual 107 | ) 108 | 109 | 110 | def test_kde(report, ls): 111 | num_cols = [ 112 | col 113 | for col in ls.columns 114 | if report["column_properties"][col]["numeric"] 115 | ] 116 | 117 | for col in num_cols: 118 | kde = ls.kde(col) 119 | for key, actual in zip(["x", "y"], kde): 120 | assert np.allclose( 121 | report["column_summary"][col]["kde"][key], actual 122 | ) 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "col1, col2", 127 | [ 128 | ("normal", "poisson"), 129 | ("normal", "lognormal"), 130 | ("normal", "categorical5"), 131 | ("categorical5", "categorical13"), 132 | ], 133 | ) 134 | def test_pair_details_pairdensity(report, ls, col1, col2): 135 | details = ls.pair_details(col1, col2) 136 | 137 | for col, key in zip([col1, col2], ["x", "y"]): 138 | if col in report["column_summary"].keys(): 139 | # Test that logtrans matches scale. 140 | assert report["column_summary"][col]["logtrans"] == ( 141 | details["pairdensity"][key + "_scale"] == "log" 142 | ) 143 | # Test that min/max match range of coordinates. 
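# The density grid's lower edge should coincide with the column minimum,
# while its upper edge may stop short of the maximum, hence the one-sided
# bound used for "max" below.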
144 | assert np.allclose( 145 | report["column_summary"][col]["min"], 146 | np.min(details["pairdensity"][key]), 147 | ) 148 | assert ( 149 | np.max(details["pairdensity"][key]) 150 | <= report["column_summary"][col]["max"] 151 | ) 152 | 153 | details_transposed = ls.pair_details(col2, col1) 154 | assert np.allclose( 155 | details["pairdensity"]["density"], 156 | details_transposed["pairdensity"]["density"].T, 157 | ) 158 | 159 | 160 | @pytest.mark.parametrize( 161 | "col1, col2", [("normal", "poisson"), ("normal", "lognormal")] 162 | ) 163 | def test_pair_details_correlation(report, ls, col1, col2): 164 | details = ls.pair_details(col1, col2) 165 | details_transposed = ls.pair_details(col1, col2) 166 | idx = [ 167 | report["correlation"]["_columns"].index(col) for col in [col1, col2] 168 | ] 169 | 170 | for coeff in ["spearman", "pearson"]: 171 | assert np.allclose( 172 | report["correlation"][coeff][idx[0]][idx[1]], 173 | details["correlation"][coeff], 174 | ) 175 | assert np.allclose( 176 | details["correlation"][coeff], 177 | details_transposed["correlation"][coeff], 178 | ) 179 | 180 | 181 | def test_pair_details_empty(ls): 182 | # Test that non-numeric pairs return an empty dict without raising 183 | # exceptions. 184 | details = ls.pair_details("normal", "datetimes") 185 | assert len(details.keys()) == 0 186 | 187 | 188 | def test_pair_details_same_column(ls): 189 | with pytest.raises(ValueError): 190 | ls.pair_details("normal", "normal") 191 | 192 | 193 | @pytest.mark.parametrize( 194 | "col1, col2", [("normal", "lognormal"), ("normal", "normal")] 195 | ) 196 | def test_correlation_matrix(report, ls, col1, col2): 197 | columns, correlation_matrix = ls.correlation_matrix() 198 | index_column1 = columns.index(col1) 199 | index_column2 = columns.index(col2) 200 | correlation_value = ( 201 | 1 202 | if col1 == col2 203 | else (ls.pair_details(col1, col2)["correlation"]["spearman"]) 204 | ) 205 | assert ( 206 | correlation_matrix[index_column1, index_column2] 207 | == correlation_matrix[index_column2, index_column1] 208 | == correlation_value 209 | ) 210 | 211 | 212 | def test_correlation_matrix_one_column(): 213 | column_values = np.random.ranf(size=200) 214 | df = pd.DataFrame.from_dict({"a": column_values}) 215 | summary = summarise(df) 216 | columns, correlation_matrix = summary.correlation_matrix() 217 | assert columns == ["a"] 218 | assert correlation_matrix.shape == (1, 1) 219 | numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0) 220 | 221 | 222 | def test_correlation_matrix_two_columns(): 223 | column1_values = np.random.ranf(size=200) 224 | column2_values = np.random.ranf(size=200) 225 | df = pd.DataFrame.from_dict({"a": column1_values, "b": column2_values}) 226 | summary = summarise(df) 227 | columns, correlation_matrix = summary.correlation_matrix() 228 | assert sorted(columns) == ["a", "b"] 229 | numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0) 230 | numpy.testing.assert_approx_equal(correlation_matrix[1, 1], 1.0) 231 | off_diagonal_term = scipy.stats.spearmanr(df["a"], df["b"]).correlation 232 | numpy.testing.assert_approx_equal( 233 | correlation_matrix[1, 0], off_diagonal_term 234 | ) 235 | numpy.testing.assert_approx_equal( 236 | correlation_matrix[0, 1], off_diagonal_term 237 | ) 238 | 239 | 240 | def test_correlation_matrix_three_columns(): 241 | column_values = [np.random.ranf(size=200) for i in range(3)] 242 | column_headers = ["a", "b", "c"] 243 | df = pd.DataFrame.from_dict(dict(zip(column_headers, column_values))) 244 | summary = 
summarise(df) 245 | columns, correlation_matrix = summary.correlation_matrix() 246 | assert sorted(columns) == column_headers 247 | 248 | for i, first_column in enumerate(columns): 249 | for j, second_column in enumerate(columns): 250 | expected = scipy.stats.spearmanr( 251 | df[first_column], df[second_column] 252 | ).correlation 253 | actual = correlation_matrix[i, j] 254 | numpy.testing.assert_approx_equal(expected, actual) 255 | 256 | 257 | def test_json_roundtrip(ls): 258 | # Run reference report through JSON roundtrip for comparison 259 | original_report = json.loads(json.dumps(ls._report)) 260 | string_report = json.loads(ls.to_json()) 261 | 262 | filename = "test-report.json" 263 | 264 | # Test filename roundtrip 265 | ls.to_json(filename) 266 | file_report = Summary.from_json(filename)._report 267 | 268 | # Test buffer roundtrip 269 | with open(filename, "w") as f: 270 | ls.to_json(f) 271 | 272 | with open(filename, "r") as f: 273 | buffer_report = Summary.from_json(f)._report 274 | 275 | os.remove(filename) 276 | 277 | for json_report in [string_report, file_report, buffer_report]: 278 | diffs = find_diff(original_report, json_report) 279 | for diff in diffs: 280 | print(diff) 281 | 282 | assert len(diffs) == 0 283 | 284 | 285 | def find_diff(d1, d2, exclude=[], path="", update_path=True): 286 | diffs = [] 287 | for k in d1.keys(): 288 | if k in exclude: 289 | continue 290 | 291 | if k not in d2: 292 | msg = "{} :\n {} as key not in d2".format(path, k) 293 | diffs.append(msg) 294 | else: 295 | new_path = path 296 | if update_path: 297 | if new_path == "": 298 | new_path = k 299 | else: 300 | new_path = new_path + "->" + k 301 | 302 | if isinstance(d1[k], dict): 303 | diffs = diffs + find_diff(d1[k], d2[k], exclude, new_path) 304 | elif isinstance(d1[k], list): 305 | # convert the list to a dict using the index as the key. 
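# (``update_path=False`` below keeps mismatches reported against the list's
# own path rather than against each numeric index)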
306 | diffs = diffs + find_diff( 307 | list_to_dict(d1[k]), 308 | list_to_dict(d2[k]), 309 | exclude, 310 | new_path, 311 | False, 312 | ) 313 | else: 314 | a = d1[k] 315 | b = d2[k] 316 | if not isinstance(a, float) or not ( 317 | np.isnan(a) and np.isnan(b) 318 | ): 319 | if isinstance(a, float): 320 | if not np.allclose(a, b): 321 | msg = "{} :\n - {} : {}\n + {} : {}".format( 322 | path, k, a, k, b 323 | ) 324 | diffs.append(msg) 325 | elif a != b: 326 | msg = "{} :\n - {} : {}\n + {} : {}".format( 327 | path, k, a, k, b 328 | ) 329 | diffs.append(msg) 330 | 331 | return diffs 332 | 333 | 334 | def list_to_dict(list_): 335 | dict_ = {} 336 | for index, item in enumerate(list_): 337 | dict_[index] = item 338 | 339 | return dict_ 340 | 341 | 342 | # Tolerances for N=10k, taken from the TDigest test suite 343 | tdigest_tol = {50: 0.02, 25: 0.015, 10: 0.01, 1: 0.005, 0.1: 0.001} 344 | 345 | for k in list(tdigest_tol.keys()): 346 | tdigest_tol[100 - k] = tdigest_tol[k] 347 | 348 | 349 | @pytest.mark.parametrize("column", ["normal", "lognormal", "poisson"]) 350 | def test_summary_cdf(ls, column): 351 | cdf = ls.cdf(column) 352 | 353 | # Set tolerance based on number of rows 354 | for p in ls._report["column_summary"][column]["percentiles"]: 355 | tol = tdigest_tol[p] * np.sqrt(10000 / ls.rows) 356 | x = ls._report["column_summary"][column]["percentiles"][p] 357 | assert np.allclose(p / 100.0, cdf(x), atol=tol, rtol=1) 358 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py36, py37, flake8, black 3 | toxworkdir = {env:TOX_WORK_DIR:.tox} 4 | 5 | [testenv] 6 | sitepackages = False 7 | setenv = 8 | MPLBACKEND = Agg 9 | deps = 10 | boto3 11 | cloudpickle 12 | flake8 13 | pytest 14 | s3fs 15 | statsmodels 16 | toolz 17 | pytest 18 | commands = py.test {posargs:-v --ignore=it} 19 | 20 | [testenv:flake8] 21 | skip_install = True 22 | deps = 23 | flake8 24 | commands = 25 | flake8 lens tests 26 | 27 | [testenv:black] 28 | skip_install = True 29 | deps = 30 | black==18.9b0 31 | commands = 32 | black {posargs:--check setup.py lens tests} 33 | --------------------------------------------------------------------------------