├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── environment.yml ├── requirements.txt └── source │ ├── api.rst │ ├── conf.py │ ├── explorer_api.rst │ ├── index.rst │ ├── room_occupancy_example.ipynb │ ├── sphinxext │ └── notebook_sphinxext.py │ ├── summarise_api.rst │ └── tutorial.rst ├── it ├── test_regression.py └── update_reference_reports.py ├── lens ├── __init__.py ├── bins.py ├── dask_graph.py ├── explorer.py ├── formatting.py ├── metrics.py ├── plotting.py ├── summarise.py ├── tdigest_utils.py ├── utils.py ├── version.py └── widget.py ├── pyproject.toml ├── readthedocs.yml ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── conftest.py ├── data │ └── test-artworks.csv ├── multivariate_kde.py ├── test_explorer.py ├── test_summarise.py └── test_summary_class.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled files 2 | *.py[co] 3 | *.a 4 | *.o 5 | *.so 6 | __pycache__ 7 | 8 | # Ignore .c files by default to avoid including generated code. If you want to 9 | # add a non-generated .c extension, use `git add -f filename.c`. 10 | *.c 11 | 12 | # Other generated files 13 | emr-bootstrap.sh 14 | MANIFEST 15 | */cython_version.py 16 | htmlcov 17 | .coverage 18 | .eggs 19 | .pytest_cache 20 | .mypy_cache/ 21 | 22 | # Sphinx 23 | _build 24 | 25 | # Packages/installer info 26 | *.egg 27 | .eggs 28 | *.egg-info 29 | dist 30 | build 31 | eggs 32 | parts 33 | bin 34 | var 35 | sdist 36 | develop-eggs 37 | .installed.cfg 38 | distribute-*.tar.gz 39 | .cache 40 | .tox 41 | .venv 42 | 43 | # Other 44 | .*.swp 45 | *~ 46 | 47 | # Mac OSX 48 | .DS_Store 49 | 50 | .project 51 | .pydevproject 52 | .settings 53 | docs/_generated/ 54 | docs/api/ 55 | 56 | .idea 57 | *.h5 58 | 59 | # Autogenerated file on build 60 | tests/test_results/ 61 | it/generated_reports/ 62 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: linux 3 | dist: xenial 4 | 5 | branches: 6 | only: 7 | - master 8 | 9 | env: 10 | global: 11 | # The following versions are the 'default' for tests, unless 12 | # overidden underneath. They are defined here in order to save having 13 | # to repeat them for all configurations. 
14 | - MPLBACKEND=Agg 15 | 16 | matrix: 17 | 18 | include: 19 | 20 | # Check for sphinx doc build warnings 21 | # - os: linux 22 | # env: CMD='sphinx-build' 23 | 24 | - python: 3.6 25 | env: TOXENV=py36 26 | - python: 3.7 27 | env: TOXENV=py37 28 | - python: 2.7 29 | env: TOXENV=py27 30 | 31 | - python: 2.7 32 | env: TOXENV=flake8 33 | - python: 3.7 34 | env: TOXENV=flake8 35 | - python: 3.7 36 | env: TOXENV=black 37 | 38 | install: 39 | - pip install tox 40 | 41 | script: 42 | - tox 43 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Contributors 2 | ============ 3 | 4 | The follow people have contributed directly to this project: 5 | 6 | - `Andrew Brookes `_ 7 | - `Andrew Crozier `_ 8 | - `Pascal Bugnion `_ 9 | - `Peter Foster `_ 10 | - `Scott Stevenson `_ 11 | - `Setrak Balian `_ 12 | - `Víctor Zabalza `_ 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include *.txt 3 | include *.yml 4 | include tox.ini 5 | recursive-include ci *.sh 6 | recursive-include docs *.ipynb 7 | recursive-include docs *.py 8 | recursive-include docs *.rst 9 | recursive-include docs *.txt 10 | recursive-include docs *.yml 11 | recursive-include docs Makefile 12 | recursive-include it *.py 13 | recursive-include tests *.csv 14 | recursive-include tests *.py 15 | recursive-include tests *.txt 16 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | lens 2 | ==== 3 | 4 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.2593336.svg 5 | :target: https://doi.org/10.5281/zenodo.2593336 6 | 7 | ``lens`` is a library for exploring data in Pandas DataFrames. It computes 8 | single column summary statistics and estimates the correlation between columns. 9 | We wrote ``lens`` when we realised that the initial steps of acquiring a new 10 | data set were almost formulaic: What data type is in this column? How many null 11 | values are there? Which columns are correlated? What's the distribution of this 12 | value? ``lens`` calculates all this for you. 13 | 14 | See the documentation_ for more details. 15 | 16 | .. _documentation: https://lens.readthedocs.io/en/latest 17 | 18 | Installation 19 | ------------ 20 | 21 | ``lens`` can be installed from PyPI with ``pip``: 22 | 23 | .. 
code-block:: bash 24 | 25 | pip install lens 26 | 27 | Testing 28 | ------- 29 | 30 | Tests can be run using `tox <https://tox.readthedocs.io>`_ (replace ``py37`` with 31 | the version of python you wish to use to run the tests): 32 | 33 | .. code-block:: bash 34 | 35 | pip install tox 36 | tox -e py37 37 | 38 | License 39 | ------- 40 | 41 | ``lens`` is licensed under the Apache License, see LICENSE.txt for details. 42 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | 27 | .PHONY: clean 28 | clean: 29 | rm -rf $(BUILDDIR)/* 30 | 31 | .PHONY: html 32 | html: 33 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 34 | @echo 35 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 36 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: lens-docs 2 | dependencies: 3 | - python=3.6 4 | - numpy 5 | - dask 6 | - ipywidgets 7 | - matplotlib 8 | - numpy>=1.11 9 | - numpydoc 10 | - pandas 11 | - plotly 12 | - scipy 13 | - pip: 14 | - tdigest 15 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx 3 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. toctree:: 5 | 6 | summarise_api 7 | 8 | explorer_api 9 | 10 | widget_api 11 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Lens documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Sep 27 10:49:51 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file.
11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.append(os.path.abspath('sphinxext')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.mathjax', 34 | 'sphinx.ext.viewcode', 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'notebook_sphinxext', 38 | 'numpydoc', 39 | ] 40 | 41 | numpydoc_show_class_members = False 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # source_suffix = ['.rst', '.md'] 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = u'Lens' 59 | copyright = u'2016-2019, Faculty Science Limited' 60 | author = u'Faculty' 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = u'0.0.1' 68 | # The full version, including alpha/beta/rc tags. 69 | release = u'0.0.1' 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 76 | language = None 77 | 78 | # There are two options for replacing |today|: either, you set today to some 79 | # non-false value, then it is used: 80 | #today = '' 81 | # Else, today_fmt is used as the format for a strftime call. 82 | #today_fmt = '%B %d, %Y' 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | # This patterns also effect to html_static_path and html_extra_path 87 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 88 | 89 | # The reST default role (used for this markup: `text`) to use for all 90 | # documents. 91 | #default_role = None 92 | 93 | # If true, '()' will be appended to :func: etc. cross-reference text. 94 | #add_function_parentheses = True 95 | 96 | # If true, the current module name will be prepended to all description 97 | # unit titles (such as .. function::). 98 | #add_module_names = True 99 | 100 | # If true, sectionauthor and moduleauthor directives will be shown in the 101 | # output. They are ignored by default. 102 | #show_authors = False 103 | 104 | # The name of the Pygments (syntax highlighting) style to use. 105 | pygments_style = 'sphinx' 106 | 107 | # A list of ignored prefixes for module index sorting. 
108 | #modindex_common_prefix = [] 109 | 110 | # If true, keep warnings as "system message" paragraphs in the built documents. 111 | #keep_warnings = False 112 | 113 | # If true, `todo` and `todoList` produce output, else they produce nothing. 114 | todo_include_todos = False 115 | 116 | 117 | # -- Options for HTML output ---------------------------------------------- 118 | 119 | # The theme to use for HTML and HTML Help pages. See the documentation for 120 | # a list of builtin themes. 121 | html_theme = 'alabaster' 122 | 123 | # Theme options are theme-specific and customize the look and feel of a theme 124 | # further. For a list of options available for each theme, see the 125 | # documentation. 126 | #html_theme_options = {} 127 | 128 | # Add any paths that contain custom themes here, relative to this directory. 129 | #html_theme_path = [] 130 | 131 | # The name for this set of Sphinx documents. 132 | # " v documentation" by default. 133 | #html_title = u'Lens v0.0.1' 134 | 135 | # A shorter title for the navigation bar. Default is the same as html_title. 136 | #html_short_title = None 137 | 138 | # The name of an image file (relative to this directory) to place at the top 139 | # of the sidebar. 140 | #html_logo = None 141 | 142 | # The name of an image file (relative to this directory) to use as a favicon of 143 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 144 | # pixels large. 145 | #html_favicon = None 146 | 147 | # Add any paths that contain custom static files (such as style sheets) here, 148 | # relative to this directory. They are copied after the builtin static files, 149 | # so a file named "default.css" will overwrite the builtin "default.css". 150 | html_static_path = ['_static'] 151 | 152 | # Add any extra paths that contain custom files (such as robots.txt or 153 | # .htaccess) here, relative to this directory. These files are copied 154 | # directly to the root of the documentation. 155 | #html_extra_path = [] 156 | 157 | # If not None, a 'Last updated on:' timestamp is inserted at every page 158 | # bottom, using the given strftime format. 159 | # The empty string is equivalent to '%b %d, %Y'. 160 | #html_last_updated_fmt = None 161 | 162 | # If true, SmartyPants will be used to convert quotes and dashes to 163 | # typographically correct entities. 164 | #html_use_smartypants = True 165 | 166 | # Custom sidebar templates, maps document names to template names. 167 | #html_sidebars = {} 168 | 169 | # Additional templates that should be rendered to pages, maps page names to 170 | # template names. 171 | #html_additional_pages = {} 172 | 173 | # If false, no module index is generated. 174 | #html_domain_indices = True 175 | 176 | # If false, no index is generated. 177 | #html_use_index = True 178 | 179 | # If true, the index is split into individual pages for each letter. 180 | #html_split_index = False 181 | 182 | # If true, links to the reST sources are added to the pages. 183 | #html_show_sourcelink = True 184 | 185 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 186 | #html_show_sphinx = True 187 | 188 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 189 | #html_show_copyright = True 190 | 191 | # If true, an OpenSearch description file will be output, and all pages will 192 | # contain a tag referring to it. The value of this option must be the 193 | # base URL from which the finished HTML is served. 
194 | #html_use_opensearch = '' 195 | 196 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 197 | #html_file_suffix = None 198 | 199 | # Language to be used for generating the HTML full-text search index. 200 | # Sphinx supports the following languages: 201 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 202 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 203 | #html_search_language = 'en' 204 | 205 | # A dictionary with options for the search language support, empty by default. 206 | # 'ja' uses this config value. 207 | # 'zh' user can custom change `jieba` dictionary path. 208 | #html_search_options = {'type': 'default'} 209 | 210 | # The name of a javascript file (relative to the configuration directory) that 211 | # implements a search results scorer. If empty, the default will be used. 212 | #html_search_scorer = 'scorer.js' 213 | 214 | # Output file base name for HTML help builder. 215 | htmlhelp_basename = 'Lensdoc' 216 | 217 | # -- Options for LaTeX output --------------------------------------------- 218 | 219 | latex_elements = { 220 | # The paper size ('letterpaper' or 'a4paper'). 221 | #'papersize': 'letterpaper', 222 | 223 | # The font size ('10pt', '11pt' or '12pt'). 224 | #'pointsize': '10pt', 225 | 226 | # Additional stuff for the LaTeX preamble. 227 | #'preamble': '', 228 | 229 | # Latex figure (float) alignment 230 | #'figure_align': 'htbp', 231 | } 232 | 233 | # Grouping the document tree into LaTeX files. List of tuples 234 | # (source start file, target name, title, 235 | # author, documentclass [howto, manual, or own class]). 236 | latex_documents = [ 237 | (master_doc, 'Lens.tex', u'Lens Documentation', 238 | u'Faculty', 'manual'), 239 | ] 240 | 241 | # The name of an image file (relative to this directory) to place at the top of 242 | # the title page. 243 | #latex_logo = None 244 | 245 | # For "manual" documents, if this is true, then toplevel headings are parts, 246 | # not chapters. 247 | #latex_use_parts = False 248 | 249 | # If true, show page references after internal links. 250 | #latex_show_pagerefs = False 251 | 252 | # If true, show URL addresses after external links. 253 | #latex_show_urls = False 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #latex_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #latex_domain_indices = True 260 | 261 | 262 | # -- Options for manual page output --------------------------------------- 263 | 264 | # One entry per manual page. List of tuples 265 | # (source start file, name, description, authors, manual section). 266 | man_pages = [ 267 | (master_doc, 'lens', u'Lens Documentation', 268 | [author], 1) 269 | ] 270 | 271 | # If true, show URL addresses after external links. 272 | #man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | (master_doc, 'Lens', u'Lens Documentation', 282 | author, 'Lens', 'One line description of project.', 283 | 'Miscellaneous'), 284 | ] 285 | 286 | # Documents to append as an appendix to all manuals. 287 | #texinfo_appendices = [] 288 | 289 | # If false, no module index is generated. 290 | #texinfo_domain_indices = True 291 | 292 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
293 | #texinfo_show_urls = 'footnote' 294 | 295 | # If true, do not generate a @detailmenu in the "Top" node's menu. 296 | #texinfo_no_detailmenu = False 297 | -------------------------------------------------------------------------------- /docs/source/explorer_api.rst: -------------------------------------------------------------------------------- 1 | lens.explorer API 2 | ================= 3 | 4 | .. currentmodule:: lens.explorer 5 | 6 | .. automodule:: lens.explorer 7 | 8 | .. autoclass:: lens.explorer.Explorer 9 | :members: 10 | 11 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Lens documentation master file, created by 2 | sphinx-quickstart on Tue Sep 27 10:49:51 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Lens 7 | ==== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | tutorial 13 | api 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/source/room_occupancy_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lens Tutorial\n", 8 | "\n", 9 | "\n", 10 | "*Lens* is a library for exploring data in Pandas DataFrames. It computes single\n", 11 | "column summary statistics and estimates the correlation between columns.\n", 12 | "\n", 13 | "We wrote *Lens* when we realised that the initial steps of acquiring a new\n", 14 | "dataset were almost formulaic: what data type is in this column? How many null\n", 15 | "values are there? Which columns are correlated? What's the distribution of this\n", 16 | "value? Lens calculates all this for you, and provides convenient visualisation\n", 17 | "of this information.\n", 18 | "\n", 19 | "You can use *Lens* to analyse new datasets as well as using it to compare how\n", 20 | "DataFrames change over time.\n", 21 | "\n", 22 | "## Using lens\n", 23 | "\n", 24 | "To start using *Lens* you need to import the library:" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import lens" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "*Lens* has two key functions; ``lens.summarise`` for generating a Lens Summary from a DataFrame and\n", 41 | "``lens.explore`` for visualising the results of a summary.\n", 42 | "\n", 43 | "For this tutorial we are going to use *Lens* to analyse the Room Occupancy\n", 44 | "dataset provided in the [Machine Learning Repository of UC Irvine](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+). It includes \n", 45 | "ambient information about a room such as Temperature, Humidity,\n", 46 | "Light, CO2 and whether it was occupied. 
The goal is to\n", 47 | "predict occupancy based on the room measurements.\n", 48 | "\n", 49 | "We read the training portion of the dataset into pandas directly from the UCI repository:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 24, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import pandas as pd\n", 59 | "from urllib.request import urlopen\n", 60 | "from io import BytesIO\n", 61 | "from zipfile import ZipFile\n", 62 | "\n", 63 | "remote_zip = urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip')\n", 64 | "df = pd.read_csv(BytesIO(ZipFile(BytesIO(remote_zip.read())).read('datatraining.txt')))\n", 65 | "\n", 66 | "# Split a numerical variable to have additional categorical variables\n", 67 | "df['Humidity_cat'] = pd.cut(df['Humidity'], 5,\n", 68 | " labels=['low', 'medium-low', 'medium',\n", 69 | " 'medium-high', 'high']).astype('str')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "scrolled": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "print('Number of rows in dataset: {}'.format(len(df.index)))\n", 81 | "df.head()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Creating the summary\n", 89 | "\n", 90 | "When you have a DataFrame that you'd like to analyse the first thing to do is\n", 91 | "to create a Lens ``Summary`` object." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "ls = lens.summarise(df)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "The `summarise` function takes a DataFrame and returns a Lens ``Summary`` object. The\n", 108 | "time this takes to run is dependent on both the number of rows and the number of\n", 109 | "columns in the DataFrame. It will use all cores available on the machine, so you \n", 110 | "might want to use a SherlockML instance with more cores to speed up the computation \n", 111 | "of the summary. There are additional optional parameters that can be\n", 112 | "passed in. Details of these can be found in the [summarise API docs](https://docs.sherlockml.com/lens/summarise_api.html#lens.summarise.summarise).\n", 113 | "\n", 114 | "Given that creating the summary is computationally intensive, *Lens* provides a way to save this summary to a JSON file on disk and recover a saved summary through the `to_json` and `from_json` methods of `lens.summary`. This allows to store it for future analysis or to share it with collaborators:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Saving to JSON\n", 124 | "ls.to_json('room_occupancy_lens_summary.json')\n", 125 | "\n", 126 | "# Reading from a file\n", 127 | "ls_from_json = lens.Summary.from_json('room_occupancy_lens_summary.json')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "The `LensSummary` object contains the information computed from the dataset and provides methods to access both column-wise and whole dataset information. It is designed to be used programatically, and information about the methods can be accessed in the [LensSummary API docs](https://docs.sherlockml.com/lens/summarise_api.html#lens.summarise.Summary)." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print(ls.columns)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Create explorer\n", 151 | "\n", 152 | "Lens provides a function that converts a Lens Summary into an `Explorer` object.\n", 153 | "This can be used to see the summary information in tabular form and to display\n", 154 | "plots." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "explorer = lens.explore(ls)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Coming back to our room occupancy dataset, the first thing that we'd like to\n", 171 | "know is a high-level overview of the data." 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Describe\n", 179 | "\n", 180 | "To show a general description of the DataFrame call the `describe` function.\n", 181 | "This is similar to Pandas' ``DataFrame.describe`` but also shows information for non-numeric columns." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "explorer.describe()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "We can see that our dataset has 8143 rows and all the rows are complete. This\n", 198 | "is a very clean dataset! It also tells us the columns and their types, including a `desc` field that explains how *Lens* will treat this column." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Column details\n", 206 | "\n", 207 | "To see type-specific column details, use the `column_details` method. Used on a numeric column such as `Temperature`, it provides summary statistics for the data in that column, including minimun, maximum, mean, median, and standard deviation." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "explorer.column_details('Temperature')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "We saw in the ouput of `explorer.describe()` that `Occupancy`, our target variable, is a categorical column with two unique values. With `explorer.column_details` we can obtain a frequency table for these two categories - empty (0) or occupied (1):" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "explorer.column_details('Occupancy')" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Correlation\n", 240 | "\n", 241 | "As a first step in exploring the relationships between the columns we can look at the correlation coefficients. `explorer.correlation()` returns a Spearman rank-order correlation coefficient matrix in tabular form." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "explorer.correlation()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "However, parsing a correlation table becomes difficult when there are many columns in the dataset. To get a better overview, we can plot the correlation matrix as a heatmap, which immediately highlights a group of columns correlated with `Occupancy`: `Temperature`, `Light`, and `CO2`." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "explorer.correlation_plot()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### Distribution and Cumulative Distribution" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "We can explore the distribution of numerical variables through the `distribution_plot` and `cdf_plot` functions:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "explorer.distribution_plot('Temperature')" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "explorer.cdf_plot('Temperature')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Pairwise plot\n", 306 | "\n", 307 | "Once we know that certain columns might be correlated, it is useful to visually explore that correlation. This would typically be done through a scatter plot, and *Lens* has computed a 2D Kernel Density Estimate of the scatter plot that can be accessed through `pairwise_density_plot`." 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": false 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "explorer.pairwise_density_plot('Temperature', 'Humidity')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "`pairwise_density_plot` can also show the relationship between a numeric column and a categorical column. In this case, a 1D KDE is computed for each of the categories in the categorical column." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "explorer.pairwise_density_plot('Temperature', 'Occupancy')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "source": [ 343 | "### Crosstab\n", 344 | "\n", 345 | "The pairwise relationship between two categorical variables can also be seen as a cross-tabulation: how many observations exist in the dataset of the combination of categories in the two variables. This can be seen as a table or as a plot, which can be useful when the number of categories is very large." 
346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "explorer.crosstab('Occupancy', 'Humidity_cat')" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "explorer.pairwise_density_plot('Occupancy', 'Humidity_cat')" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## Interactive widget\n", 371 | "\n", 372 | "An alternative way of quickly exploring the plots available in *Lens* is through a Jupyter widget provided by `lens.interactive_explore`. Creating it is as easy as running this function on a *Lens* `Summary`.\n", 373 | "\n", 374 | "Note that if you are reading this tutorial through the online docs the output of the following cell will not be interactive as it needs to run within a notebook. Download the notebook from the links below to try out the interactive explorer!" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "scrolled": false 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "lens.interactive_explore(ls)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.6.3" 413 | }, 414 | "widgets": { 415 | "state": { 416 | "496016afdc01477d9f515f2c073feb4f": { 417 | "views": [ 418 | { 419 | "cell_index": 36 420 | } 421 | ] 422 | } 423 | }, 424 | "version": "1.2.0" 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 1 429 | } 430 | -------------------------------------------------------------------------------- /docs/source/sphinxext/notebook_sphinxext.py: -------------------------------------------------------------------------------- 1 | """This is a modified version of ``notebook_sphinxext.py`` 2 | 3 | The original was from: https://github.com/ngoldbaum/RunNotebook as of commit 4 | 944f983 5 | 6 | This is the license for RunNotebook: 7 | 8 | Copyright (c) 2013 Nathan Goldbaum. All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are 12 | met: 13 | 14 | * Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | * Redistributions in binary form must reproduce the above 17 | copyright notice, this list of conditions and the following disclaimer 18 | in the documentation and/or other materials provided with the 19 | distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | """ 33 | 34 | import errno 35 | import nbformat 36 | import os 37 | import shutil 38 | import tempfile 39 | import uuid 40 | from sphinx.util.compat import Directive 41 | from docutils import nodes 42 | from docutils.parsers.rst import directives 43 | from traitlets.config import Config 44 | from nbconvert import html, python 45 | 46 | 47 | class NotebookDirective(Directive): 48 | """Insert an evaluated notebook into a document 49 | 50 | This uses runipy and nbconvert to transform a path to an unevaluated notebook 51 | into html suitable for embedding in a Sphinx document. 52 | """ 53 | required_arguments = 1 54 | optional_arguments = 1 55 | option_spec = {'skip_exceptions': directives.flag} 56 | final_argument_whitespace = True 57 | 58 | def run(self): # check if there are spaces in the notebook name 59 | nb_path = self.arguments[0] 60 | if ' ' in nb_path: 61 | raise ValueError( 62 | "Due to issues with docutils stripping spaces from links, white " 63 | "space is not allowed in notebook filenames '{0}'".format( 64 | nb_path)) 65 | # check if raw html is supported 66 | if not self.state.document.settings.raw_enabled: 67 | raise self.warning('"%s" directive disabled.' % self.name) 68 | 69 | cwd = os.getcwd() 70 | tmpdir = tempfile.mkdtemp() 71 | os.chdir(tmpdir) 72 | 73 | # get path to notebook 74 | nb_filename = self.arguments[0] 75 | nb_basename = os.path.basename(nb_filename) 76 | rst_file = self.state_machine.document.attributes['source'] 77 | rst_dir = os.path.abspath(os.path.dirname(rst_file)) 78 | nb_abs_path = os.path.abspath(os.path.join(rst_dir, nb_filename)) 79 | 80 | # Move files around. 
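# (The following block copies the unevaluated notebook into the Sphinx build output directory and derives destination paths for the evaluated copy and the exported Python script.)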
81 | rel_dir = os.path.relpath(rst_dir, setup.confdir) 82 | dest_dir = os.path.join(setup.app.builder.outdir, rel_dir) 83 | dest_path = os.path.join(dest_dir, nb_basename) 84 | 85 | image_dir, image_rel_dir = make_image_dir(setup, rst_dir) 86 | 87 | # Ensure desination build directory exists 88 | thread_safe_mkdir(os.path.dirname(dest_path)) 89 | 90 | # Copy unevaluated notebook 91 | shutil.copyfile(nb_abs_path, dest_path) 92 | 93 | # Construct paths to versions getting copied over 94 | dest_path_eval = dest_path.replace('.ipynb', '_evaluated.ipynb') 95 | dest_path_script = dest_path.replace('.ipynb', '.py') 96 | rel_path_eval = nb_basename.replace('.ipynb', '_evaluated.ipynb') 97 | rel_path_script = nb_basename.replace('.ipynb', '.py') 98 | 99 | # Create python script vesion 100 | script_text = nb_to_python(nb_abs_path) 101 | f = open(dest_path_script, 'wb') 102 | f.write(script_text.encode('utf8')) 103 | f.close() 104 | 105 | skip_exceptions = 'skip_exceptions' in self.options 106 | 107 | evaluated_text, resources = evaluate_notebook( 108 | nb_abs_path, dest_path_eval, skip_exceptions=skip_exceptions) 109 | 110 | evaluated_text = write_notebook_output( 111 | resources, image_dir, image_rel_dir, evaluated_text) 112 | 113 | # Create link to notebook and script files 114 | link_rst = "(" + \ 115 | formatted_link(nb_basename) + "; " + \ 116 | formatted_link(rel_path_eval) + "; " + \ 117 | formatted_link(rel_path_script) + \ 118 | ")" 119 | 120 | self.state_machine.insert_input([link_rst], rst_file) 121 | 122 | # create notebook node 123 | attributes = {'format': 'html', 'source': 'nb_path'} 124 | nb_node = notebook_node('', evaluated_text, **attributes) 125 | (nb_node.source, nb_node.line) = \ 126 | self.state_machine.get_source_and_line(self.lineno) 127 | 128 | # add dependency 129 | self.state.document.settings.record_dependencies.add(nb_abs_path) 130 | 131 | # clean up 132 | os.chdir(cwd) 133 | shutil.rmtree(tmpdir, True) 134 | 135 | return [nb_node] 136 | 137 | 138 | class notebook_node(nodes.raw): 139 | pass 140 | 141 | 142 | def nb_to_python(nb_path): 143 | """convert notebook to python script""" 144 | exporter = python.PythonExporter() 145 | output, resources = exporter.from_filename(nb_path) 146 | return output 147 | 148 | 149 | def nb_to_html(nb_path, skip_exceptions): 150 | """convert notebook to html""" 151 | 152 | nbconvert_config = Config({ 153 | 'ExtractOutputPreprocessor': {'enabled': True}, 154 | 'ExecutePreprocessor': { 155 | 'enabled': True, 156 | # make this configurable? 157 | 'timeout': 3600, 158 | } 159 | }) 160 | 161 | if skip_exceptions is False: 162 | nbconvert_config['ExecutePreprocessor']['allow_errors'] = True 163 | 164 | exporter = html.HTMLExporter(template_file='full', config=nbconvert_config) 165 | notebook = nbformat.read(nb_path, nbformat.NO_CONVERT) 166 | output, resources = exporter.from_notebook_node(notebook) 167 | header = output.split('', 1)[1].split('', 1)[0] 168 | body = output.split('', 1)[1].split('', 1)[0] 169 | 170 | # http://imgur.com/eR9bMRH 171 | header = header.replace(''] 208 | lines.append(header) 209 | lines.append(body) 210 | lines.append('') 211 | return '\n'.join(lines), resources, notebook 212 | 213 | 214 | def evaluate_notebook(nb_path, dest_path, skip_exceptions=True): 215 | # Create evaluated version and save it to the dest path. 
216 | lines, resources, notebook = nb_to_html(nb_path, skip_exceptions) 217 | nbformat.write(notebook, dest_path) 218 | return lines, resources 219 | 220 | 221 | def formatted_link(path): 222 | return "`%s <%s>`__" % (os.path.basename(path), path) 223 | 224 | 225 | def visit_notebook_node(self, node): 226 | self.visit_raw(node) 227 | 228 | 229 | def depart_notebook_node(self, node): 230 | self.depart_raw(node) 231 | 232 | 233 | def setup(app): 234 | setup.app = app 235 | setup.config = app.config 236 | setup.confdir = app.confdir 237 | 238 | app.add_node(notebook_node, 239 | html=(visit_notebook_node, depart_notebook_node)) 240 | 241 | app.add_directive('notebook', NotebookDirective) 242 | 243 | retdict = dict( 244 | version='0.1', 245 | parallel_read_safe=True, 246 | parallel_write_safe=True 247 | ) 248 | 249 | return retdict 250 | 251 | 252 | def make_image_dir(setup, rst_dir): 253 | image_dir = setup.app.builder.outdir + os.path.sep + '_images' 254 | rel_dir = os.path.relpath(setup.confdir, rst_dir) 255 | image_rel_dir = rel_dir + os.path.sep + '_images' 256 | thread_safe_mkdir(image_dir) 257 | return image_dir, image_rel_dir 258 | 259 | 260 | def write_notebook_output(resources, image_dir, image_rel_dir, evaluated_text): 261 | my_uuid = uuid.uuid4().hex 262 | 263 | for output in resources['outputs']: 264 | new_name = image_dir + os.path.sep + my_uuid + output 265 | new_relative_name = image_rel_dir + os.path.sep + my_uuid + output 266 | evaluated_text = evaluated_text.replace(output, new_relative_name) 267 | with open(new_name, 'wb') as f: 268 | f.write(resources['outputs'][output]) 269 | return evaluated_text 270 | 271 | 272 | def thread_safe_mkdir(dirname): 273 | try: 274 | os.makedirs(dirname) 275 | except OSError as e: 276 | if e.errno != errno.EEXIST: 277 | raise 278 | pass 279 | -------------------------------------------------------------------------------- /docs/source/summarise_api.rst: -------------------------------------------------------------------------------- 1 | lens.summarise API 2 | ================== 3 | 4 | .. currentmodule:: lens.summarise 5 | 6 | .. automodule:: lens.summarise 7 | 8 | .. autofunction:: lens.summarise.summarise 9 | 10 | .. autoclass:: lens.summarise.Summary 11 | :members: 12 | 13 | -------------------------------------------------------------------------------- /docs/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | Lens Tutorial 2 | ============= 3 | 4 | We have prepared a Lens tutorial in the form of a Jupyter notebook. A static 5 | version is reproduced below, but you can also execute it yourself by downloading 6 | :download:`the notebook file <./room_occupancy_example.ipynb>`. 7 | 8 | .. 
notebook:: room_occupancy_example.ipynb 9 | -------------------------------------------------------------------------------- /it/test_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | 5 | import boto3 6 | import lens 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | import os 11 | import inspect 12 | 13 | S3 = boto3.client("s3") 14 | BUCKET = "asi-lens-test-data" 15 | 16 | datasets = [ 17 | "room_occupancy.csv", 18 | "artworks-5k.csv", 19 | "air-quality-london-time-of-day.csv", 20 | "momaExhibitions-5k.csv", 21 | "noheader.csv", 22 | "monthly-milk-production.csv", 23 | "customer-data.csv", 24 | ] 25 | 26 | dirname = os.path.dirname( 27 | os.path.abspath(inspect.getfile(inspect.currentframe())) 28 | ) 29 | result_dir = os.path.join(dirname, "generated_reports") 30 | 31 | if not os.path.exists(result_dir): 32 | os.mkdir(result_dir) 33 | 34 | 35 | @pytest.mark.parametrize("input_", datasets) 36 | def test_summary_regression(input_): 37 | # load the input into a pandas dataframe 38 | df = pd.read_csv("s3://{}/input/{}".format(BUCKET, input_)) 39 | 40 | # run the lens summarise method 41 | summary = lens.summarise(df) 42 | 43 | # Save generated report 44 | summary.to_json(os.path.join(result_dir, input_.replace(".csv", ".json"))) 45 | 46 | # load the expected output file into a summary object 47 | output = input_.replace(".csv", ".json") 48 | s3_summary = read_s3_file(BUCKET, "output/{}".format(output))[ 49 | "Body" 50 | ].read() 51 | 52 | if isinstance(s3_summary, bytes): 53 | s3_summary = s3_summary.decode("utf-8") 54 | 55 | expected_summary = json.loads(s3_summary) 56 | 57 | # list of keys to ignore from the response because they are 58 | # probablistically generated 59 | exclude = [ 60 | "_run_time", 61 | "tdigest", 62 | "density", 63 | "bw", 64 | "logtrans_IQR", 65 | "kde", 66 | "_lens_version", 67 | ] 68 | 69 | diffs = find_diff( 70 | json.loads(json.dumps(summary._report)), expected_summary, exclude 71 | ) 72 | 73 | for diff in diffs: 74 | print(diff) 75 | 76 | if len(diffs): 77 | # Save expected report to check the differences manually if needed 78 | exp_name = os.path.join( 79 | result_dir, output.replace(".json", "-expected.json") 80 | ) 81 | with open(exp_name, "w") as f: 82 | f.write(s3_summary) 83 | 84 | # compare the input and output summary objects 85 | assert len(diffs) == 0 86 | 87 | 88 | def read_s3_file(bucket, key): 89 | return S3.get_object(Bucket=BUCKET, Key=key) 90 | 91 | 92 | def find_diff(d1, d2, exclude=[], path="", update_path=True): 93 | diffs = [] 94 | for k in d1.keys(): 95 | if k in exclude: 96 | continue 97 | 98 | if k not in d2: 99 | msg = "{} :\n {} as key not in d2".format(path, k) 100 | diffs.append(msg) 101 | else: 102 | new_path = path 103 | if update_path: 104 | if new_path == "": 105 | new_path = k 106 | else: 107 | new_path = new_path + "->" + k 108 | 109 | if isinstance(d1[k], dict): 110 | diffs = diffs + find_diff(d1[k], d2[k], exclude, new_path) 111 | elif isinstance(d1[k], list): 112 | # convert the list to a dict using the index as the key. 
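# This reuses the dict-based recursion for nested lists; update_path is passed as False below so that list indices are not appended to the reported path.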
113 | diffs = diffs + find_diff( 114 | list_to_dict(d1[k]), 115 | list_to_dict(d2[k]), 116 | exclude, 117 | new_path, 118 | False, 119 | ) 120 | else: 121 | a = d1[k] 122 | b = d2[k] 123 | if not isinstance(a, float) or not ( 124 | np.isnan(a) and np.isnan(b) 125 | ): 126 | if isinstance(a, float): 127 | if not np.allclose(a, b): 128 | msg = "{} :\n - {} : {}\n + {} : {}".format( 129 | path, k, a, k, b 130 | ) 131 | diffs.append(msg) 132 | elif a != b: 133 | msg = "{} :\n - {} : {}\n + {} : {}".format( 134 | path, k, a, k, b 135 | ) 136 | diffs.append(msg) 137 | 138 | return diffs 139 | 140 | 141 | def list_to_dict(list_): 142 | dict_ = {} 143 | for index, item in enumerate(list_): 144 | dict_[index] = item 145 | 146 | return dict_ 147 | -------------------------------------------------------------------------------- /it/update_reference_reports.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import boto3 4 | import os 5 | 6 | S3 = boto3.client("s3") 7 | BUCKET = "asi-lens-test-data" 8 | 9 | 10 | def check_report_bom(name): 11 | """Check whether the report has Byte Order Marks 12 | 13 | Parameters 14 | ---------- 15 | name : str 16 | Filename of the report. 17 | """ 18 | with open(name, "r") as f: 19 | report_json = f.read() 20 | if report_json.count("\\ufeff"): 21 | print( 22 | " WARNING: {} Byte-order-Marks found in report! This might" 23 | " provoke failures on codeship. Have you checked that the csv" 24 | " is encoded in UTF8 without BOM?".format( 25 | report_json.count("\\ufeff") 26 | ) 27 | ) 28 | 29 | 30 | if __name__ == "__main__": 31 | from test_regression import datasets 32 | 33 | for dataset in datasets: 34 | report_name = dataset.replace(".csv", ".json") 35 | report_path = os.path.join("generated_reports", report_name) 36 | check_report_bom(report_path) 37 | S3.upload_file( 38 | report_path, Bucket=BUCKET, Key="output/{}".format(report_name) 39 | ) 40 | -------------------------------------------------------------------------------- /lens/__init__.py: -------------------------------------------------------------------------------- 1 | """Summarise and explore Pandas DataFrames""" 2 | 3 | from lens.explorer import Explorer, explore 4 | from lens.summarise import Summary, summarise 5 | from lens.version import __version__ 6 | from lens.widget import interactive_explore 7 | 8 | __all__ = [ 9 | "Summary", 10 | "summarise", 11 | "Explorer", 12 | "explore", 13 | "interactive_explore", 14 | "__version__", 15 | ] 16 | -------------------------------------------------------------------------------- /lens/bins.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def freedman_diaconis_bins(data_range, iqr, number_samples): 5 | """ 6 | Calculate number of hist bins using Freedman-Diaconis rule. 
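    The rule sets the bin width to h = 2 * IQR / n**(1/3) and takes the number
    of bins as the data range divided by h (rounded up), falling back to
    sqrt(n) bins when the IQR, and therefore h, is zero.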
7 | 8 | Parameters 9 | ---------- 10 | 11 | data_range: float 12 | total range of the data 13 | 14 | iqr: float 15 | interquartile range of the data 16 | 17 | number_samples: int 18 | number of values in the data 19 | """ 20 | # From http://stats.stackexchange.com/questions/798/ 21 | # adapted from seaborn 22 | h = 2 * iqr / (float(number_samples) ** (1.0 / 3.0)) 23 | # fall back to sqrt(a) bins if iqr is 0 24 | if h == 0: 25 | return int(np.sqrt(number_samples)) 26 | else: 27 | return int(np.ceil(data_range) / h) 28 | -------------------------------------------------------------------------------- /lens/dask_graph.py: -------------------------------------------------------------------------------- 1 | """Build a Dask graph of Summary computation""" 2 | 3 | import itertools 4 | import pandas as pd 5 | from dask.delayed import delayed, Delayed 6 | from . import metrics 7 | 8 | 9 | def _nested_merge(first, second, path=None): 10 | """Merge two nested dictionaries into a single dictionary. 11 | 12 | Parameters 13 | ---------- 14 | first : dict 15 | The first dictionary. 16 | second : dict 17 | The second dictionary. 18 | path : TODO 19 | TODO 20 | 21 | Returns 22 | ------- 23 | dict 24 | The merged dictionary. 25 | """ 26 | if path is None: 27 | path = [] 28 | for key in second: 29 | if key in first: 30 | if isinstance(first[key], dict) and isinstance(second[key], dict): 31 | _nested_merge(first[key], second[key], path + [str(key)]) 32 | elif first[key] == second[key]: 33 | pass # Same leaf value. 34 | else: 35 | raise Exception( 36 | "Conflict at {}".format(".".join(path + [str(key)])) 37 | ) 38 | else: 39 | first[key] = second[key] 40 | return first 41 | 42 | 43 | @delayed(pure=True) 44 | def _join_dask_results(results): 45 | """Join a list of column-wise results into a single dictionary. 46 | 47 | The `_run_time` and `_columns` keys are appended to, whilst other 48 | keys are merged. 49 | 50 | Parameters 51 | ---------- 52 | results : list 53 | List of Dask results dictionaries to join. 54 | """ 55 | report = {"_run_time": 0.0, "_columns": []} 56 | 57 | for result in results: 58 | if isinstance(result, Delayed): 59 | result = result.compute() 60 | if result is not None: 61 | report["_run_time"] += result["_run_time"] 62 | report["_columns"] += result["_columns"] 63 | columns = result.keys() 64 | report = _nested_merge( 65 | report, 66 | { 67 | column: result[column] 68 | for column in columns 69 | if column not in ["_columns", "_run_time"] 70 | }, 71 | ) 72 | 73 | report["_columns"] = sorted(list(set(report["_columns"]))) 74 | 75 | return report 76 | 77 | 78 | def create_dask_graph(df, pairdensities=True): 79 | """Create a Dask graph for executing the summary generation. 80 | 81 | Parameters 82 | ---------- 83 | df : pd.DataFrame 84 | The DataFrame for which to generate the summary. 85 | 86 | pairdensities : bool, optional 87 | Whether to compute the pairdensity estimation between all pairs of 88 | numerical columns. For most datasets, this is the most expensive 89 | computation. Default is True. 90 | 91 | Returns 92 | ------- 93 | dict 94 | The generated data summary. 95 | """ 96 | # Create a series for each column in the DataFrame. 97 | columns = df.columns 98 | df = delayed(df) 99 | cols = {k: delayed(df.get)(k) for k in columns} 100 | 101 | # Create the delayed reports using Dask. 
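    # Illustrative usage sketch (with a made-up DataFrame): every call below is
    # wrapped in dask.delayed, so no metric runs until the returned delayed
    # dict is evaluated, e.g.
    #
    #     graph = create_dask_graph(pd.DataFrame({"a": [1, 2, 3]}))
    #     report = graph.compute()  # executes all metric tasks in one go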
102 | row_c = delayed(metrics.row_count)(df) 103 | 104 | cprops = {k: delayed(metrics.column_properties)(cols[k]) for k in columns} 105 | joined_cprops = _join_dask_results(list(cprops.values())) 106 | 107 | freqs = { 108 | k: delayed(metrics.frequencies)(cols[k], cprops[k]) for k in columns 109 | } 110 | joined_freqs = _join_dask_results(list(freqs.values())) 111 | 112 | csumms = { 113 | k: delayed(metrics.column_summary)(cols[k], cprops[k]) for k in columns 114 | } 115 | joined_csumms = _join_dask_results(list(csumms.values())) 116 | 117 | out = {k: delayed(metrics.outliers)(cols[k], csumms[k]) for k in columns} 118 | joined_outliers = _join_dask_results(list(out.values())) 119 | 120 | corr = delayed(metrics.correlation)(df, joined_cprops) 121 | 122 | pdens_results = [] 123 | if pairdensities: 124 | for col1, col2 in itertools.combinations(columns, 2): 125 | pdens_df = delayed(pd.concat)([cols[col1], cols[col2]], axis=1) 126 | pdens_cp = {k: cprops[k] for k in [col1, col2]} 127 | pdens_cs = {k: csumms[k] for k in [col1, col2]} 128 | pdens_fr = {k: freqs[k] for k in [col1, col2]} 129 | pdens = delayed(metrics.pairdensity)( 130 | pdens_df, pdens_cp, pdens_cs, pdens_fr 131 | ) 132 | pdens_results.append(pdens) 133 | 134 | joined_pairdensities = _join_dask_results(pdens_results) 135 | 136 | # Join the delayed reports per-metric into a dictionary. 137 | dask_dict = delayed(dict)( 138 | row_count=row_c, 139 | column_properties=joined_cprops, 140 | frequencies=joined_freqs, 141 | column_summary=joined_csumms, 142 | outliers=joined_outliers, 143 | correlation=corr, 144 | pairdensity=joined_pairdensities, 145 | _columns=list(columns), 146 | ) 147 | 148 | return dask_dict 149 | -------------------------------------------------------------------------------- /lens/explorer.py: -------------------------------------------------------------------------------- 1 | """Explore a Summary""" 2 | 3 | import sys 4 | import logging 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | import plotly.tools 10 | import plotly.offline as py 11 | 12 | from lens.summarise import Summary 13 | from lens.formatting import JupyterTable 14 | from lens.plotting import ( 15 | plot_distribution, 16 | plot_pairdensity, 17 | plot_correlation, 18 | plot_cdf, 19 | ) 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.StreamHandler()) 23 | 24 | # Check whether we are in a notebook environment 25 | # this is a false positive if we are in the Jupyter console 26 | IN_NOTEBOOK = "ipykernel" in sys.modules 27 | 28 | PLOTLY_TO_MPL_KWS = {"strip_style": True, "resize": True} 29 | 30 | PLOTLY_KWS = {"show_link": False} 31 | 32 | 33 | def _render(fig, showlegend=None): 34 | """Plot a matploltib or plotly figure""" 35 | if isinstance(fig, plt.Figure): 36 | fig = plotly.tools.mpl_to_plotly(fig, **PLOTLY_TO_MPL_KWS) 37 | 38 | if showlegend is not None: 39 | fig.layout["showlegend"] = showlegend 40 | 41 | if not IN_NOTEBOOK: 42 | message = "Lens explorer can only plot in a Jupyter notebook" 43 | logger.error(message) 44 | raise ValueError(message) 45 | else: 46 | if not py.offline.__PLOTLY_OFFLINE_INITIALIZED: 47 | py.init_notebook_mode() 48 | return py.iplot(fig, **PLOTLY_KWS) 49 | 50 | 51 | class Explorer(object): 52 | """An explorer to visualise a Lens Summary 53 | 54 | Once a Lens ``Summary`` has been generated with 55 | :func:`lens.summarise.summarise`, this class provides the methods necessary 56 | to explore the summary though tables and plots. 
It is best used from within 57 | a Jupyter notebook. 58 | """ 59 | 60 | # Number of points to show in the CDF plot 61 | _N_cdf = 1000 62 | 63 | def __init__(self, summary, plot_renderer=_render): 64 | if not isinstance(summary, Summary): 65 | raise TypeError("Can only explore a lens Summary") 66 | self.summary = summary 67 | self.plot_renderer = plot_renderer 68 | 69 | def describe(self): 70 | """General description of the dataset. 71 | 72 | Produces a table including the following information about each column: 73 | 74 | ``desc`` 75 | the type of data: currently ``categorical`` or ``numeric``. 76 | Lens will calculate different quantities for this column 77 | depending on the value of ``desc``. 78 | 79 | ``dtype`` 80 | the type of data in Pandas. 81 | 82 | ``name`` 83 | column name 84 | 85 | ``notnulls`` 86 | number of non-null values in the column 87 | 88 | ``nulls`` 89 | number of null-values in the column 90 | 91 | ``unique`` 92 | number of unique values in the column 93 | """ 94 | summary = self.summary 95 | columns = summary.columns 96 | 97 | header = [""] 98 | header.extend(columns) 99 | 100 | desc = ["desc"] 101 | desc.extend([summary._desc(column) for column in columns]) 102 | 103 | dtype = ["dtype"] 104 | dtype.extend([summary.summary(column)["dtype"] for column in columns]) 105 | 106 | notnulls = ["notnulls"] 107 | notnulls.extend( 108 | [summary.summary(column)["notnulls"] for column in columns] 109 | ) 110 | 111 | nulls = ["nulls"] 112 | nulls.extend([summary.summary(column)["nulls"] for column in columns]) 113 | 114 | unique = ["unique"] 115 | unique.extend( 116 | [summary.summary(column)["unique"] for column in columns] 117 | ) 118 | 119 | return JupyterTable([header, desc, dtype, notnulls, nulls, unique]) 120 | 121 | def column_details(self, column, sort=False): 122 | """Show type-specific column details. 123 | 124 | For numeric columns, this method produces a table with summary 125 | statistics, including minimum, maximum, mean, and median. For 126 | categorical columns, it produces a frequency table for each category 127 | sorted in descending order of frequency. 128 | 129 | Parameters 130 | ---------- 131 | column : str 132 | Name of the column. 133 | sort : boolean, optional 134 | Sort frequency tables in categorical variables by 135 | category name. 136 | """ 137 | details = self.summary.details(column) 138 | desc = details["desc"] 139 | 140 | if desc == "numeric": 141 | caption = "" 142 | data = [ 143 | ["", details["name"]], 144 | ["desc", details["desc"]], 145 | ["dtype", self.summary.summary(column)["dtype"]], 146 | ["min", details["min"]], 147 | ["max", details["max"]], 148 | ["mean", details["mean"]], 149 | ["median", details["median"]], 150 | ["std", details["std"]], 151 | ["sum", details["sum"]], 152 | ["IQR", details["iqr"]], 153 | ] 154 | return JupyterTable(data) 155 | elif desc == "categorical": 156 | caption = "
desc: {}, dtype: {}
".format( 157 | details["desc"], self.summary.summary(column)["dtype"] 158 | ) 159 | data = [["item", "frequency"]] 160 | frequencies = [] 161 | for item, frequency in details["frequencies"].items(): 162 | frequencies.append([item, frequency]) 163 | if sort: 164 | data.extend(sorted(frequencies, key=lambda x: x[0])) 165 | else: 166 | data.extend(sorted(frequencies, key=lambda x: -x[1])) 167 | else: 168 | caption = "" 169 | data = [ 170 | ["", details["name"]], 171 | ["desc", details["desc"]], 172 | ["dtype", self.summary.summary(column)["dtype"]], 173 | ] 174 | 175 | return JupyterTable(data, caption=caption) 176 | 177 | def distribution(self, column): 178 | """Show properties of the distribution of values in the column. 179 | 180 | Parameters 181 | ---------- 182 | column : str 183 | Name of the column. 184 | """ 185 | raise NotImplementedError 186 | 187 | def distribution_plot(self, column, bins=None): 188 | """Plot the distribution of a numeric column. 189 | 190 | Create a plotly plot with a histogram of the values in a column. The 191 | number of bin in the histogram is decided according to the 192 | Freedman-Diaconis rule unless given by the `bins` parameter. 193 | 194 | Parameters 195 | ---------- 196 | column : str 197 | Name of the column. 198 | bins : int, optional 199 | Number of bins to use for histogram. If not given, the 200 | Freedman-Diaconis rule will be used to estimate the best number of 201 | bins. This argument also accepts the formats taken by the `bins` 202 | parameter of matplotlib's :function:`~matplotlib.pyplot.hist`. 203 | """ 204 | ax = plot_distribution(self.summary, column, bins) 205 | self.plot_renderer(ax) 206 | 207 | def cdf_plot(self, column): 208 | """Plot the empirical cumulative distribution function of a column. 209 | 210 | Creates a plotly plot with the empirical CDF of a column. 211 | 212 | Parameters 213 | ---------- 214 | column : str 215 | Name of the column. 216 | """ 217 | ax = plot_cdf(self.summary, column, self._N_cdf) 218 | self.plot_renderer(ax) 219 | 220 | def crosstab(self, column1, column2): 221 | """Show a contingency table of two categorical columns. 222 | 223 | Print a contingency table for two categorical variables showing the 224 | multivariate frequancy distribution of the columns. 225 | 226 | Parameters 227 | ---------- 228 | column1 : str 229 | First column. 230 | column2 : str 231 | Second column. 
232 | """ 233 | pair_details = self.summary.pair_details(column1, column2) 234 | 235 | for column in [column1, column2]: 236 | column_details = self.summary.details(column) 237 | if column_details["desc"] != "categorical": 238 | raise ValueError( 239 | "Column `{}` is not categorical".format(column) 240 | ) 241 | 242 | pair_details = self.summary.pair_details(column1, column2) 243 | pairdensity = pair_details["pairdensity"] 244 | 245 | # Convert to numpy arrays for ease of reindexing 246 | x = np.array(pairdensity["x"]) 247 | y = np.array(pairdensity["y"]) 248 | crosstab = np.array(pairdensity["density"]) 249 | 250 | # Sort by first column category names 251 | idx = np.argsort(x) 252 | x = x[idx] 253 | crosstab = crosstab[:, idx] 254 | 255 | # Sort by second column category names 256 | idx = np.argsort(y) 257 | y = y[idx] 258 | crosstab = crosstab[idx] 259 | 260 | table = [[""] + x.tolist()] 261 | for y_category, crosstab_row in zip(y, crosstab): 262 | table.append([y_category] + crosstab_row.tolist()) 263 | 264 | return JupyterTable(table) 265 | 266 | def pairwise_density_plot(self, column1, column2): 267 | """Plot the pairwise density between two columns. 268 | 269 | This plot is an approximation of a scatterplot through a 2D Kernel 270 | Density Estimate for two numerical variables. When one of the variables 271 | is categorical, a 1D KDE for each of the categories is shown, 272 | normalised to the total number of non-null observations. For two 273 | categorical variables, the plot produced is a heatmap representation of 274 | the contingency table. 275 | 276 | Parameters 277 | ---------- 278 | column1 : str 279 | First column. 280 | column2 : str 281 | Second column. 282 | """ 283 | allowed_descriptions = ["numeric", "categorical"] 284 | for column in [column1, column2]: 285 | column_description = self.summary.summary(column)["desc"] 286 | if column_description not in allowed_descriptions: 287 | raise ValueError( 288 | "Column {} is not numeric or categorical".format(column) 289 | ) 290 | 291 | fig = plot_pairdensity(self.summary, column1, column2) 292 | self.plot_renderer(fig) 293 | 294 | def correlation_plot(self, include=None, exclude=None): 295 | """Plot the correlation matrix for numeric columns 296 | 297 | Plot a Spearman rank order correlation coefficient matrix showing the 298 | correlation between columns. The matrix is reordered to group together 299 | columns that have a higher correlation coefficient. The columns to be 300 | plotted in the correlation plot can be selected through either the 301 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 302 | given. 303 | 304 | Parameters 305 | ---------- 306 | 307 | include : list of str 308 | List of columns to include in the correlation plot. 309 | exclude : list of str 310 | List of columns to exclude from the correlation plot. 311 | """ 312 | fig = plot_correlation(self.summary, include, exclude) 313 | self.plot_renderer(fig) 314 | 315 | def correlation(self, include=None, exclude=None): 316 | """Show the correlation matrix for numeric columns. 317 | 318 | Print a Spearman rank order correlation coefficient matrix in tabular 319 | form, showing the correlation between columns. The matrix is reordered 320 | to group together columns that have a higher correlation coefficient. 321 | The columns to be shown in the table can be selected 322 | through either the ``include`` or ``exclude`` keyword arguments. Only 323 | one of them can be given. 
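        For example, ``correlation(include=["a", "b"])`` would restrict the
        table to the correlation between two (here hypothetical) columns named
        ``a`` and ``b``.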
324 | 325 | Parameters 326 | ---------- 327 | 328 | include : list of str 329 | List of columns to include in the correlation plot. 330 | exclude : list of str 331 | List of columns to exclude from the correlation plot. 332 | """ 333 | columns, correlation_matrix = self.summary.correlation_matrix( 334 | include, exclude 335 | ) 336 | headers = [""] + columns 337 | rows = [] 338 | for column, correlation_row in zip(columns, correlation_matrix): 339 | rows.append([column] + correlation_row.tolist()) 340 | return JupyterTable([headers] + rows) 341 | 342 | 343 | def explore(summary): 344 | """Create an Explorer instance from a Lens Summary""" 345 | return Explorer(summary) 346 | -------------------------------------------------------------------------------- /lens/formatting.py: -------------------------------------------------------------------------------- 1 | """Table formatting for Jupyter notebooks""" 2 | 3 | # Copyright (c) 2012-2013, Eric Moyer 4 | # Copyright (c) 2016-2019, Faculty Science Limited 5 | # 6 | # All rights reserved. 7 | # 8 | # Redistribution and use in source and binary forms, with or without 9 | # modification, are permitted provided that the following conditions are 10 | # met: 11 | # 12 | # Redistributions of source code must retain the above copyright notice, 13 | # this list of conditions and the following disclaimer. 14 | # 15 | # Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # 19 | # Neither the name of the ipy_table Development Team nor the names of 20 | # its contributors may be used to endorse or promote products derived 21 | # from this software without specific prior written permission. 22 | # 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | 35 | import sys 36 | 37 | 38 | class JupyterTable(object): 39 | 40 | _valid_borders = {"left", "right", "top", "bottom", "all"} 41 | 42 | def __init__(self, array, caption=""): 43 | self.array = array 44 | self.caption = caption 45 | 46 | self._num_rows = len(array) 47 | self._num_columns = len(array[0]) 48 | 49 | # Check that array is well formed 50 | for row in array: 51 | if len(row) != self._num_columns: 52 | raise ValueError("Array rows must all be of equal length.") 53 | 54 | self._cell_styles = [ 55 | [{"float_format": "%0.4f"} for dummy in range(self._num_columns)] 56 | for dummy2 in range(self._num_rows) 57 | ] 58 | 59 | def _repr_html_(self): 60 | """Jupyter display protocol: HTML representation. 61 | 62 | The Jupyter display protocol calls this method to get the HTML 63 | representation of this object. 
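        For example, when a ``JupyterTable`` instance is the result of a
        notebook cell, Jupyter invokes this method and renders the returned
        HTML string in place of the default ``repr``.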
64 | """ 65 | # Generate TABLE tag () 66 | html = ( 67 | self.caption 68 | + '' 70 | ) 71 | 72 | for row, row_data in enumerate(self.array): 73 | 74 | # Generate ROW tag () 75 | html += "" 76 | for (column, item) in enumerate(row_data): 77 | if not _key_is_valid( 78 | self._cell_styles[row][column], "suppress" 79 | ): 80 | 81 | # Generate CELL tag (" 101 | html += "" 102 | html += "
) 82 | # Apply floating point formatter to the cell contents 83 | # (if it is a float) 84 | item_html = self._formatter( 85 | item, self._cell_styles[row][column] 86 | ) 87 | 88 | # Add bold and italic tags if set 89 | if _key_is_valid(self._cell_styles[row][column], "bold"): 90 | item_html = "" + item_html + "" 91 | if _key_is_valid(self._cell_styles[row][column], "italic"): 92 | item_html = "" + item_html + "" 93 | 94 | # Get html style string 95 | style_html = self._get_style_html( 96 | self._cell_styles[row][column] 97 | ) 98 | 99 | # Append cell 100 | html += "" + item_html + "
" 103 | return html 104 | 105 | def _get_style_html(self, style_dict): 106 | """Parse the style dictionary and return equivalent html style text.""" 107 | style_html = "" 108 | if _key_is_valid(style_dict, "color"): 109 | style_html += "background-color:" + style_dict["color"] + ";" 110 | 111 | if _key_is_valid(style_dict, "thick_border"): 112 | for edge in self._split_by_comma(style_dict["thick_border"]): 113 | style_html += "border-%s: 3px solid black;" % edge 114 | 115 | if _key_is_valid(style_dict, "no_border"): 116 | for edge in self._split_by_comma(style_dict["no_border"]): 117 | style_html += "border-%s: 1px solid transparent;" % edge 118 | 119 | if _key_is_valid(style_dict, "align"): 120 | style_html += "text-align:" + str(style_dict["align"]) + ";" 121 | 122 | if _key_is_valid(style_dict, "width"): 123 | style_html += "width:" + str(style_dict["width"]) + "px;" 124 | 125 | if style_html: 126 | style_html = ' style="' + style_html + '"' 127 | 128 | if _key_is_valid(style_dict, "row_span"): 129 | style_html = ( 130 | 'rowspan="' + str(style_dict["row_span"]) + '";' + style_html 131 | ) 132 | 133 | if _key_is_valid(style_dict, "column_span"): 134 | style_html = ( 135 | 'colspan="' 136 | + str(style_dict["column_span"]) 137 | + '";' 138 | + style_html 139 | ) 140 | 141 | # Prepend a space if non-blank 142 | if style_html: 143 | return " " + style_html 144 | return "" 145 | 146 | def _formatter(self, item, cell_style): 147 | """Apply formatting to cell contents. 148 | 149 | Applies float format to item if item is a float or float64. 150 | Converts spaces to non-breaking if wrap is not enabled. 151 | Returns string. 152 | """ 153 | 154 | # The following check is performed as a string comparison 155 | # so that ipy_table does not need to require (import) numpy. 156 | if ( 157 | str(type(item)) in ["", ""] 158 | and "float_format" in cell_style 159 | ): 160 | text = cell_style["float_format"] % item 161 | else: 162 | if isinstance(item, str): 163 | text = item 164 | else: 165 | text = str(item) 166 | 167 | if sys.version_info.major < 3: 168 | # QA disabled as unicode is a NameError in Python 3. 
169 | text = unicode(text, encoding="utf-8") # noqa 170 | 171 | # If cell wrapping is not specified 172 | if not ("wrap" in cell_style and cell_style["wrap"]): 173 | # Convert all spaces to non-breaking and return 174 | text = text.replace(" ", " ") 175 | return text 176 | 177 | def _split_by_comma(self, comma_delimited_text): 178 | """Returns a list of the words in the comma delimited text.""" 179 | return comma_delimited_text.replace(" ", "").split(",") 180 | 181 | 182 | def _key_is_valid(dictionary, key): 183 | """Test that a dictionary key exists and that its value is not blank.""" 184 | if key in dictionary: 185 | if dictionary[key]: 186 | return True 187 | return False 188 | -------------------------------------------------------------------------------- /lens/metrics.py: -------------------------------------------------------------------------------- 1 | """Metrics for the computation of a Lens summary""" 2 | 3 | from __future__ import division 4 | 5 | import logging 6 | import time 7 | from functools import wraps 8 | 9 | from tdigest import TDigest 10 | import numpy as np 11 | from scipy import stats 12 | from scipy import signal 13 | import pandas as pd 14 | 15 | from .utils import hierarchical_ordering_indices 16 | 17 | DENSITY_N = 100 18 | LOGNORMALITY_P_THRESH = 0.05 19 | CAT_FRAC_THRESHOLD = 0.5 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.StreamHandler()) 23 | 24 | 25 | def timeit(func): 26 | """Decorator to time callable execution and add it to the report. 27 | 28 | Parameters 29 | ---------- 30 | func : callable 31 | The callable to execute. 32 | 33 | Returns 34 | ------- 35 | callable 36 | Decorated function. 37 | """ 38 | 39 | @wraps(func) 40 | def decorator(*args, **kwargs): 41 | tstart = time.time() 42 | report = func(*args, **kwargs) 43 | if report is not None: 44 | report["_run_time"] = time.time() - tstart 45 | return report 46 | 47 | return decorator 48 | 49 | 50 | @timeit 51 | def row_count(df): 52 | """Count number of total and unique rows. 53 | 54 | Parameters 55 | ---------- 56 | df : pd.DataFrame 57 | A DataFrame. 58 | 59 | Returns 60 | ------- 61 | dict 62 | Dictionary with `total` and `unique` keys. 63 | """ 64 | report = {} 65 | report["total"] = len(df.index) 66 | report["unique"] = len(df.drop_duplicates().index) 67 | return report 68 | 69 | 70 | @timeit 71 | def column_properties(series): 72 | """Infer properties of a Pandas Series. 73 | 74 | Parameters 75 | ---------- 76 | series : pd.Series 77 | Series to infer properties of. 78 | 79 | Returns 80 | ------- 81 | dict 82 | Dictionary of inferred properties. 83 | """ 84 | cat_N_threshold = {"object": 1000, "int64": 10, "float64": 10} 85 | 86 | name = series.name 87 | colresult = {} 88 | colresult["dtype"] = str(series.dtype) 89 | nulls = series.isnull().sum() 90 | colresult["nulls"] = int(nulls) if not np.isnan(nulls) else 0 91 | notnulls = series.dropna() 92 | 93 | colresult["notnulls"] = len(notnulls.index) 94 | colresult["numeric"] = ( 95 | series.dtype in [np.float64, np.int64] and colresult["notnulls"] > 0 96 | ) 97 | unique = notnulls.unique().size 98 | colresult["unique"] = unique 99 | colresult["is_categorical"] = False 100 | if ( 101 | colresult["dtype"] in {"object", "int64", "float64"} 102 | and colresult["notnulls"] > 0 103 | ): 104 | # In Pandas integers with nulls are cast as floats, so we have 105 | # to include floats as possible categoricals to detect 106 | # categorical integers. 
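        # (Illustrative example: a float64 column holding only the values 1.0
        # and 2.0 repeated over many rows has two unique values, so it can
        # still pass the categorical test below despite its numeric dtype.)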
107 | colresult["is_categorical"] = ( 108 | unique / colresult["notnulls"] <= CAT_FRAC_THRESHOLD 109 | ) and (unique <= cat_N_threshold[colresult["dtype"]]) 110 | logger.debug( 111 | "Column {:15}: {:6} unique, {:6} notnulls, {:6} total" 112 | " --> {}categorical".format( 113 | name, 114 | unique, 115 | colresult["notnulls"], 116 | colresult["notnulls"] + colresult["nulls"], 117 | "NOT " * (not colresult["is_categorical"]), 118 | ) 119 | ) 120 | 121 | # Don't use the is_ID field for now: 122 | # it's too prone to false positives. 123 | # If a columns is wrongly identified as ID-like, 124 | # it doesn't get analyzed 125 | colresult["is_ID"] = False 126 | 127 | return {name: colresult, "_columns": [name]} 128 | 129 | 130 | def _tdigest_mean(digest): 131 | """TODO 132 | 133 | Parameters 134 | ---------- 135 | digest : tdigest.TDigest 136 | t-digest data structure. 137 | 138 | Returns 139 | ------- 140 | TODO 141 | """ 142 | means = [c.mean for c in digest.C.values()] 143 | counts = [c.count for c in digest.C.values()] 144 | return np.average(means, weights=counts) 145 | 146 | 147 | def _tdigest_std(digest): 148 | """TODO 149 | 150 | Parameters 151 | ---------- 152 | digest : tdigest.TDigest 153 | t-digest data structure. 154 | 155 | Returns 156 | ------- 157 | TODO 158 | """ 159 | mean = _tdigest_mean(digest) 160 | sums = [(x.mean - mean) ** 2 * x.count for x in digest.C.values()] 161 | return np.sqrt(np.sum(sums) / digest.n) 162 | 163 | 164 | def _tdigest_normalise(digest): 165 | """TODO 166 | 167 | Parameters 168 | ---------- 169 | digest : tdigest.TDigest 170 | t-digest data structure. 171 | 172 | Returns 173 | ------- 174 | TODO 175 | """ 176 | m = _tdigest_mean(digest) 177 | s = _tdigest_std(digest) 178 | ndigest = TDigest() 179 | for x in digest.C.values(): 180 | ndigest.update((x.mean - m) / s, x.count) 181 | return ndigest 182 | 183 | 184 | def _tdigest_norm_kstest(digest): 185 | """TODO 186 | 187 | Parameters 188 | ---------- 189 | digest : tdigest.TDigest 190 | t-digest data structure. 191 | 192 | Returns 193 | ------- 194 | TODO 195 | """ 196 | normdigest = _tdigest_normalise(digest) 197 | 198 | x = np.linspace(-3, 3, 500) 199 | dig_q = np.array([normdigest.cdf(xx) for xx in x]) 200 | norm_q = stats.norm.cdf(x) 201 | 202 | D = np.max(np.abs(dig_q - norm_q)) 203 | 204 | if digest.n > 3000: 205 | return D, stats.distributions.kstwobign.sf(D * np.sqrt(digest.n)) 206 | else: 207 | return D, 2 * stats.distributions.ksone.sf(D, digest.n) 208 | 209 | 210 | def _test_logtrans(digest): 211 | """ 212 | Test if t-digest distribution is more normal when log-transformed. 213 | 214 | Test whether a log-transform improves normality of data with a 215 | simplified Kolmogorov-Smirnov two-sided test (the location and scale 216 | of the normal distribution are estimated from the median and 217 | standard deviation of the data). 218 | 219 | Parameters 220 | ---------- 221 | digest : tdigest.TDigest 222 | t-digest data structure. 
223 | 224 | Returns 225 | ------- 226 | TODO 227 | """ 228 | if digest.percentile(0) <= 0: 229 | return False 230 | 231 | logdigest = TDigest() 232 | for c in digest.C.values(): 233 | logdigest.update(np.log(c.mean), c.count) 234 | 235 | lKS, lp = _tdigest_norm_kstest(logdigest) 236 | KS, p = _tdigest_norm_kstest(digest) 237 | logger.debug( 238 | "KSnorm: log: {:.2g}, {:.2g}; linear: {:.2g}, {:.2g}".format( 239 | lKS, lp, KS, p 240 | ) 241 | ) 242 | 243 | return ( 244 | (lKS < KS) 245 | and (lp > p) 246 | and (lp > LOGNORMALITY_P_THRESH) 247 | and (p < LOGNORMALITY_P_THRESH) 248 | ) 249 | 250 | 251 | @timeit 252 | def column_summary(series, column_props, delta=0.01): 253 | """Summarise a numeric column. 254 | 255 | Parameters 256 | ---------- 257 | series : pd.Series 258 | Numeric column. 259 | column_props : TODO 260 | TODO 261 | delta : float 262 | TODO 263 | 264 | Returns 265 | ------- 266 | TODO 267 | """ 268 | col = series.name 269 | if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0: 270 | # Series is not numeric or is all NaNs. 271 | return None 272 | 273 | logger.debug("column_summary - " + col) 274 | 275 | # select non-nulls from column 276 | data = series.dropna() 277 | 278 | colresult = {} 279 | for m in ["mean", "min", "max", "std", "sum"]: 280 | val = getattr(data, m)() 281 | if type(val) is np.int64: 282 | colresult[m] = int(val) 283 | else: 284 | colresult[m] = val 285 | 286 | colresult["n"] = column_props[col]["notnulls"] 287 | 288 | percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9] 289 | colresult["percentiles"] = { 290 | perc: np.nanpercentile(series, perc) for perc in percentiles 291 | } 292 | colresult["median"] = colresult["percentiles"][50] 293 | colresult["iqr"] = ( 294 | colresult["percentiles"][75] - colresult["percentiles"][25] 295 | ) 296 | 297 | # Compute the t-digest. 298 | logger.debug("column_summary - {} - creating TDigest...".format(col)) 299 | digest = TDigest(delta) 300 | digest.batch_update(data) 301 | 302 | logger.debug("column_summary - {} - testing log trans...".format(col)) 303 | try: 304 | colresult["logtrans"] = bool(_test_logtrans(digest)) 305 | except Exception as e: 306 | # Hard to pinpoint problems with the logtrans TDigest. 307 | logger.warning( 308 | "test_logtrans has failed for column `{}`: {}".format(col, e) 309 | ) 310 | colresult["logtrans"] = False 311 | 312 | if colresult["logtrans"]: 313 | logdigest = TDigest() 314 | for c in digest.C.values(): 315 | logdigest.update(np.log(c.mean), c.count) 316 | colresult["logtrans_mean"] = _tdigest_mean(logdigest) 317 | colresult["logtrans_std"] = _tdigest_std(logdigest) 318 | colresult["logtrans_IQR"] = logdigest.percentile( 319 | 75 320 | ) - logdigest.percentile(25) 321 | 322 | logger.debug( 323 | "column_summary - {} - should {}be log-transformed".format( 324 | col, "NOT " if not colresult["logtrans"] else "" 325 | ) 326 | ) 327 | 328 | # Compress and store the t-digest. 
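    # (Compression merges neighbouring centroids, keeping the list of
    # (mean, count) pairs stored below small enough to embed in the serialised
    # report while still approximating the full distribution.)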
329 | digest.delta = delta 330 | digest.compress() 331 | colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()] 332 | 333 | # Compute histogram 334 | logger.debug("column_summary - {} - computing histogram...".format(col)) 335 | 336 | if column_props[col]["is_categorical"]: 337 | # Compute frequency table and store as histogram 338 | counts, edges = _compute_histogram_from_frequencies(data) 339 | else: 340 | if colresult["logtrans"]: 341 | counts, log_edges = np.histogram( 342 | np.log10(data), density=False, bins="fd" 343 | ) 344 | edges = 10 ** log_edges 345 | else: 346 | counts, edges = np.histogram(data, density=False, bins="fd") 347 | 348 | colresult["histogram"] = { 349 | "counts": counts.tolist(), 350 | "bin_edges": edges.tolist(), 351 | } 352 | 353 | # Compute KDE 354 | logger.debug("column_summary - {} - computing KDE...".format(col)) 355 | bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1) 356 | 357 | logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw)) 358 | 359 | if column_props[col]["is_categorical"]: 360 | kde_x, kde_y = np.zeros(1), np.zeros(1) 361 | else: 362 | coord_range = colresult["min"], colresult["max"] 363 | kde_x, kde_y = _compute_smoothed_histogram( 364 | data, bw, coord_range, logtrans=colresult["logtrans"] 365 | ) 366 | 367 | colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()} 368 | 369 | return {col: colresult, "_columns": [col]} 370 | 371 | 372 | def _compute_histogram_from_frequencies(series): 373 | """Compute histogram from frequencies 374 | 375 | This method uses the frequencies dict to produce a histogram data structure 376 | with emtpy bins where the difference between the category values is larger 377 | than 1 378 | 379 | Parameters 380 | ---------- 381 | series : pd.Series 382 | Categorical column.a 383 | 384 | Returns 385 | ------- 386 | counts, edges: 387 | Histogram bin edges and counts in each bin. 388 | """ 389 | freqs = _compute_frequencies(series) 390 | categories = sorted(freqs.keys()) 391 | diffs = list(np.diff(categories)) + [1] 392 | edges = [categories[0] - 0.5] 393 | counts = [] 394 | for cat, diff in zip(categories, diffs): 395 | if diff <= 1: 396 | edges.append(cat + diff / 2.0) 397 | counts.append(freqs[cat]) 398 | else: 399 | edges += [cat + 0.5, cat + diff - 0.5] 400 | counts += [freqs[cat], 0] 401 | 402 | return np.array(counts), np.array(edges) 403 | 404 | 405 | def _compute_frequencies(series): 406 | """Helper to compute frequencies of a categorical column 407 | 408 | Parameters 409 | ---------- 410 | series : pd.Series 411 | Categorical column.a 412 | 413 | Returns 414 | ------- 415 | dict: 416 | Dictionary from category name to count. 417 | """ 418 | freqs = series.value_counts() 419 | if freqs.index.dtype == np.int64: 420 | categories = [int(index) for index in freqs.index] 421 | elif freqs.index.dtype == np.float64: 422 | categories = [float(index) for index in freqs.index] 423 | else: 424 | categories = freqs.index 425 | return dict(zip(categories, freqs.values.tolist())) 426 | 427 | 428 | @timeit 429 | def frequencies(series, column_props): 430 | """Compute frequencies for categorical columns. 431 | 432 | Parameters 433 | ---------- 434 | series : pd.Series 435 | Categorical column. 
436 | column_props : dict 437 | Dictionary as returned by `column_properties` 438 | 439 | Returns 440 | ------- 441 | TODO 442 | """ 443 | name = series.name 444 | 445 | if column_props[name]["is_categorical"]: 446 | logger.debug("frequencies - " + series.name) 447 | freqs = _compute_frequencies(series) 448 | return {name: freqs, "_columns": [name]} 449 | else: 450 | return None 451 | 452 | 453 | @timeit 454 | def outliers(series, column_summ): 455 | """Count outliers for numeric columns. 456 | 457 | Parameters 458 | ---------- 459 | series : pd.Series 460 | Numeric column. 461 | column_summ : TODO 462 | TODO 463 | 464 | Returns 465 | ------- 466 | TODO 467 | """ 468 | name = series.name 469 | if column_summ is None: 470 | # Not a numeric column. 471 | return None 472 | else: 473 | column_summ = column_summ[name] 474 | 475 | Q1, Q3 = [column_summ["percentiles"][p] for p in [25, 75]] 476 | IQR = Q3 - Q1 477 | # Mild outlier limits. 478 | lom = Q1 - 1.5 * IQR 479 | him = Q3 + 1.5 * IQR 480 | # Extreme outlier limits. 481 | lox = Q1 - 3.0 * IQR 482 | hix = Q3 + 3.0 * IQR 483 | 484 | nn = series.dropna() 485 | 486 | Nmildlo = len(nn[(nn < lom) & (nn > lox)].index) 487 | Nmildhi = len(nn[(nn > him) & (nn < hix)].index) 488 | Nextrlo = len(nn[nn < lox].index) 489 | Nextrhi = len(nn[nn > hix].index) 490 | 491 | return { 492 | name: {"mild": [Nmildlo, Nmildhi], "extreme": [Nextrlo, Nextrhi]}, 493 | "_columns": [name], 494 | } 495 | 496 | 497 | @timeit 498 | def correlation(df, column_props): 499 | """Compute correlation table between non-ID numeric variables. 500 | 501 | Parameters 502 | ---------- 503 | df : pd.DataFrame 504 | DataFrame. 505 | column_props : TODO 506 | TODO 507 | 508 | Returns 509 | ------- 510 | dict 511 | Dictionary containing correlation coefficients. 512 | """ 513 | 514 | cols = [ 515 | col 516 | for col in df.columns 517 | if (column_props[col]["numeric"] and not column_props[col]["is_ID"]) 518 | ] 519 | 520 | numdf = df[cols] 521 | pcorr = numdf.corr(method="pearson", min_periods=5) 522 | scorr = numdf.corr(method="spearman", min_periods=5) 523 | 524 | report = {} 525 | report["_columns"] = list(numdf.columns) 526 | report["pearson"] = np.array(pcorr).tolist() 527 | report["spearman"] = np.array(scorr).tolist() 528 | 529 | report["order"] = hierarchical_ordering_indices( 530 | numdf.columns, scorr.values 531 | ) 532 | 533 | return report 534 | 535 | 536 | def _compute_smoothed_histogram( 537 | values, bandwidth, coord_range, logtrans=False 538 | ): 539 | """Approximate 1-D density estimation. 540 | 541 | Estimate 1-D probability densities at evenly-spaced grid points, 542 | for specified data. This method is based on creating a 1-D histogram of 543 | data points quantised with respect to evenly-spaced grid points. 544 | Probability densities are then estimated at the grid points by convolving 545 | the obtained histogram with a Gaussian kernel. 546 | 547 | Parameters 548 | ---------- 549 | values : np.array (N,) 550 | A vector containing the data for which to perform density estimation. 551 | Successive data points are indexed by the first axis in the array. 552 | bandwidth : float 553 | The desired KDE bandwidth. (When log-transformation 554 | of data is desired, bandwidth should be specified in log-space.) 555 | coord_range: (2,) 556 | Minimum and maximum values of coordinate on which to evaluate the 557 | smoothed histogram. 558 | logtrans : boolean 559 | Whether or not to log-transform the data before performing density 560 | estimation. 
561 | 562 | Returns 563 | ------- 564 | np.array (M-1,) 565 | An array of estimated probability densities at specified grid points. 566 | """ 567 | if logtrans: 568 | ber = [np.log10(extreme) for extreme in coord_range] 569 | bin_edges = np.logspace(*ber, num=DENSITY_N + 1) 570 | bin_edge_range = ber[1] - ber[0] 571 | else: 572 | bin_edges = np.linspace(*coord_range, num=DENSITY_N + 1) 573 | bin_edge_range = coord_range[1] - coord_range[0] 574 | 575 | if values.size < 2: 576 | # Return zeros if there are too few points to do anything useful. 577 | return bin_edges[:-1], np.zeros(bin_edges.shape[0] - 1) 578 | 579 | # Bin the values 580 | H = np.histogram(values, bin_edges)[0] 581 | 582 | relative_bw = bandwidth / bin_edge_range 583 | K = _compute_gaussian_kernel(H.shape, relative_bw) 584 | 585 | pdf = signal.fftconvolve(H, K, mode="same") 586 | 587 | # Return lower edges of bins and normalized pdf 588 | return bin_edges[:-1], pdf / np.trapz(pdf, bin_edges[:-1]) 589 | 590 | 591 | def _compute_smoothed_histogram2d( 592 | values, bandwidth, coord_ranges, logtrans=False 593 | ): 594 | """Approximate 2-D density estimation. 595 | 596 | Estimate 2-D probability densities at evenly-spaced grid points, 597 | for specified data. This method is based on creating a 2-D histogram of 598 | data points quantised with respect to evenly-spaced grid points. 599 | Probability densities are then estimated at the grid points by convolving 600 | the obtained histogram with a Gaussian kernel. 601 | 602 | Parameters 603 | ---------- 604 | values : np.array (N,2) 605 | A 2-D array containing the data for which to perform density 606 | estimation. Successive data points are indexed by the first axis in the 607 | array. The second axis indexes x and y coordinates of data points 608 | (values[:,0] and values[:,1] respectively). 609 | bandwidth : array-like (2,) 610 | The desired KDE bandwidths for x and y axes. (When log-transformation 611 | of data is desired, bandwidths should be specified in log-space.) 612 | coord_range: (2,2) 613 | Minimum and maximum values of coordinates on which to evaluate the 614 | smoothed histogram. 615 | logtrans : array-like (2,) 616 | A 2-element boolean array specifying whether or not to log-transform 617 | the x or y coordinates of the data before performing density 618 | estimation. 619 | 620 | Returns 621 | ------- 622 | np.array (M-1, M-1) 623 | An array of estimated probability densities at specified grid points. 
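
    Notes
    -----
    The lower bin edges of both axes are returned alongside the density array,
    mirroring the 1-D variant above.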
624 | """ 625 | bin_edges = [] 626 | bedge_range = [] 627 | for minmax, lt in zip(coord_ranges, logtrans): 628 | if lt: 629 | ber = [np.log10(extreme) for extreme in minmax] 630 | bin_edges.append(np.logspace(*ber, num=DENSITY_N + 1)) 631 | bedge_range.append(ber[1] - ber[0]) 632 | else: 633 | bin_edges.append(np.linspace(*minmax, num=DENSITY_N + 1)) 634 | bedge_range.append(minmax[1] - minmax[0]) 635 | 636 | # Bin the observations 637 | H = np.histogram2d(values[:, 0], values[:, 1], bins=bin_edges)[0] 638 | 639 | relative_bw = [bw / berange for bw, berange in zip(bandwidth, bedge_range)] 640 | K = _compute_gaussian_kernel(H.shape, relative_bw) 641 | 642 | pdf = signal.fftconvolve(H.T, K, mode="same") 643 | 644 | # Normalize pdf 645 | bin_centers = [edges[:-1] + np.diff(edges) / 2.0 for edges in bin_edges] 646 | pdf /= np.trapz(np.trapz(pdf, bin_centers[1]), bin_centers[0]) 647 | 648 | # Return lower bin edges and density 649 | return bin_edges[0][:-1], bin_edges[1][:-1], pdf 650 | 651 | 652 | def _compute_gaussian_kernel(histogram_shape, relative_bw): 653 | """Compute a gaussian kernel double the size of the histogram matrix""" 654 | if len(histogram_shape) == 2: 655 | kernel_shape = [2 * n for n in histogram_shape] 656 | # Create a scaled grid in which the kernel is symmetric to avoid matrix 657 | # inversion problems when the bandwiths are very different 658 | bw_ratio = relative_bw[0] / relative_bw[1] 659 | bw = relative_bw[0] 660 | X, Y = np.mgrid[ 661 | -bw_ratio : bw_ratio : kernel_shape[0] * 1j, 662 | -1 : 1 : kernel_shape[1] * 1j, 663 | ] 664 | grid_points = np.vstack([X.ravel(), Y.ravel()]).T 665 | Cov = np.array(((bw, 0), (0, bw))) ** 2 666 | K = stats.multivariate_normal.pdf(grid_points, mean=(0, 0), cov=Cov) 667 | 668 | return K.reshape(kernel_shape) 669 | else: 670 | grid = np.mgrid[-1 : 1 : histogram_shape[0] * 2j] 671 | return stats.norm.pdf(grid, loc=0, scale=relative_bw) 672 | 673 | 674 | def _bw_scott(column_summ, N, logtrans, d): 675 | """Scott's rule of thumb for KDE kernel bandwidth. 676 | 677 | Parameters 678 | ---------- 679 | column_summ : dict 680 | Dictionary as returned by `column_summary`. 681 | N : int 682 | Number of elements in the series for which the KDE is to be 683 | evaluated. 684 | logtrans : bool 685 | Whether the series is assumed to be 'exponential' (True) or 686 | 'linear' (False). An 'exponential' series (representing, e.g. 687 | income) is log-transformed before the KDE. The bandwidth 688 | therefore needs to be estimated for the log transformed series. 689 | d : int 690 | Dimension of the KDE. 691 | 692 | Returns 693 | ------- 694 | float 695 | Estimate of the kernel bandwidth for the KDE. 696 | """ 697 | if N == 0: 698 | return 0 699 | 700 | norm = 1.349 # norm.ppf(0.75) - norm.ppf(0.25) 701 | if logtrans: 702 | std, IQR = column_summ["logtrans_std"], column_summ["logtrans_IQR"] 703 | factor = 2 704 | else: 705 | std, IQR = column_summ["std"], column_summ["iqr"] 706 | factor = 1.4 707 | 708 | if IQR > 0: 709 | iqr_estimate = min(IQR / norm, std) 710 | elif std > 0: 711 | iqr_estimate = std 712 | else: 713 | iqr_estimate = 1.0 714 | 715 | bandwidth = 1.06 * iqr_estimate * N ** (-1.0 / (4.0 + d)) 716 | 717 | return bandwidth / factor 718 | 719 | 720 | @timeit 721 | def pairdensity(df, column_props, column_summ, freq, log_transform=True): 722 | """Compute a variable pair heatmap. 723 | 724 | Parameters 725 | ---------- 726 | df : pd.DataFrame 727 | DataFrame with the columns for which the pair density is 728 | computed. 
729 | column_props : dict 730 | Column properties dictionary with at least col1 and col2, as 731 | returned by `column_properties`. 732 | column_summ : dict 733 | Column summary dictionary with at least col1 and col2, as 734 | returned by `column_summary`. 735 | freq : dict 736 | Frequencies dictionary with at least col1 and col2. 737 | log_transform : bool 738 | Whether to compute the KDE in log-space when needed. 739 | 740 | Returns 741 | ------- 742 | TODO 743 | """ 744 | col1, col2 = df.columns 745 | 746 | # Test that both columns have valid entries and are either 747 | # categorical or numeric, returning None if not. 748 | column_props = {col: column_props[col][col] for col in [col1, col2]} 749 | for col in [col1, col2]: 750 | if ( 751 | not ( 752 | column_props[col]["is_categorical"] 753 | or column_props[col]["numeric"] 754 | ) 755 | or column_props[col]["notnulls"] == 0 756 | ): 757 | return None 758 | 759 | report = {"_columns": [col1, col2], col1: {}} 760 | 761 | log_string = "pairdensity - {} - {}".format(col1, col2) 762 | logger.debug("{}".format(log_string)) 763 | 764 | data = df.dropna() 765 | N = len(data.index) 766 | 767 | coord_ranges, scales, categories = [], [], [] 768 | bandwidths = [None, None] 769 | for col in [col1, col2]: 770 | if column_props[col]["is_categorical"]: 771 | scales.append("category") 772 | coord_ranges.append(None) 773 | categories.append(sorted(list(freq[col][col].keys()))) 774 | else: 775 | scales.append( 776 | "log" if column_summ[col][col]["logtrans"] else "linear" 777 | ) 778 | coord_ranges.append( 779 | [column_summ[col][col][extreme] for extreme in ["min", "max"]] 780 | ) 781 | categories.append(None) 782 | 783 | Ncat = np.sum([scale == "category" for scale in scales]) 784 | 785 | if N == 0: 786 | logger.warning("{}: No valid pairs found!".format(log_string)) 787 | 788 | if Ncat == 0: 789 | # 2D pair density is not useful with very few observations 790 | if N > 3: 791 | logtrans = [scale == "log" for scale in scales] 792 | 793 | bandwidths = [ 794 | _bw_scott(column_summ[col][col], N, lt, 2 - Ncat) 795 | for col, lt in zip([col1, col2], logtrans) 796 | ] 797 | 798 | x, y, density = _compute_smoothed_histogram2d( 799 | np.array(data), bandwidths, coord_ranges, logtrans=logtrans 800 | ) 801 | 802 | x, y = x.tolist(), y.tolist() 803 | else: 804 | x, y = coord_ranges 805 | density = np.zeros((2, 2)) 806 | 807 | elif Ncat == 1: 808 | # Split into categories and do a univariate KDE on each. 809 | if column_props[col1]["is_categorical"]: 810 | cats = categories[0] 811 | coord_range = coord_ranges[1] 812 | catcol, numcol, numcolsum = col1, col2, column_summ[col2][col2] 813 | logtrans = scales[1] == "log" 814 | else: 815 | cats = categories[1] 816 | coord_range = coord_ranges[0] 817 | catcol, numcol, numcolsum = col2, col1, column_summ[col1][col1] 818 | logtrans = scales[0] == "log" 819 | 820 | density = [] 821 | for cat in cats: 822 | # Filter data for this category. 823 | datacat = data[data[catcol] == cat][numcol] 824 | Nincat = datacat.count() 825 | 826 | # Recompute the bandwidth because the number of pairs in 827 | # this category might be lower than the total number of 828 | # pairs. 829 | num_bw = _bw_scott(numcolsum, Nincat, logtrans, 1) 830 | grid, catdensity = _compute_smoothed_histogram( 831 | datacat, num_bw, coord_range, logtrans=logtrans 832 | ) 833 | 834 | # Remove normalisation to normalise it later to the total 835 | # number of pairs. 
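            # (Each category's KDE is scaled by its own pair count here and
            # the whole array is divided by the grand total N a few lines
            # below, so frequent categories carry proportionally more weight.)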
836 | density.append(catdensity * Nincat) 837 | 838 | density = np.array(density) / N 839 | 840 | if column_props[col1]["is_categorical"]: 841 | density = density.T 842 | x, y = cats, grid.tolist() 843 | else: 844 | x, y = grid.tolist(), cats 845 | 846 | elif Ncat == 2: 847 | if N > 0: 848 | # Crosstab frequencies. 849 | dfcs = ( 850 | pd.crosstab(data[col2], data[col1]) 851 | .sort_index(axis=0) 852 | .sort_index(axis=1) 853 | ) 854 | 855 | x = [str(column) for column in dfcs.columns] 856 | if "" in x: 857 | x[x.index("")] = " Null" 858 | 859 | y = [str(index) for index in dfcs.index] 860 | if "" in y: 861 | y[y.index("")] = " Null" 862 | 863 | density = dfcs.get_values() 864 | else: 865 | x, y = categories 866 | density = np.zeros((len(x), len(y))) 867 | 868 | report[col1][col2] = { 869 | "density": density.tolist(), 870 | "axes": {col1: x, col2: y}, 871 | "bw": bandwidths, 872 | "scales": scales, 873 | } 874 | 875 | return report 876 | -------------------------------------------------------------------------------- /lens/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.ticker import FuncFormatter, MaxNLocator 3 | import numpy as np 4 | import plotly.graph_objs as go 5 | import seaborn as sns 6 | import plotly.figure_factory as pff 7 | 8 | DEFAULT_COLORSCALE = "Viridis" 9 | 10 | 11 | def plot_distribution(ls, column, bins=None): 12 | """Plot the distribution of numerical columns. 13 | 14 | Create a plotly plot with a histogram of the values in a column. The 15 | number of bin in the histogram is decided according to the 16 | Freedman-Diaconis rule unless given by the `bins` parameter. 17 | 18 | Parameters 19 | ---------- 20 | ls : :class:`~lens.Summary` 21 | Lens `Summary`. 22 | column : str 23 | Name of the column. 24 | bins : int, optional 25 | Number of bins to use for histogram. If not given, the 26 | Freedman-Diaconis rule will be used to estimate the best number of 27 | bins. This argument also accepts the formats taken by the `bins` 28 | parameter of matplotlib's :function:`~matplotlib.pyplot.hist`. 29 | 30 | Returns 31 | ------- 32 | :class:`~matplotlib.Axes` 33 | Matplotlib axes containing the distribution plot. 34 | """ 35 | column_summary = ls.summary(column) 36 | if column_summary["notnulls"] <= 2: 37 | # Plotly refuses to plot histograms if 38 | # the tdigest has too few values 39 | raise ValueError( 40 | "There are fewer than two non-null values in this column" 41 | ) 42 | 43 | if bins is None: 44 | counts, edges = ls.histogram(column) 45 | else: 46 | xs, counts = ls.tdigest_centroids(column) 47 | counts, edges = np.histogram(xs, weights=counts, bins=bins) 48 | 49 | fig, ax = plt.subplots() 50 | 51 | ax.bar( 52 | edges[:-1], counts, width=np.diff(edges), label=column, align="edge" 53 | ) 54 | 55 | ax.set_ylim(bottom=0) 56 | 57 | ax.set_xlabel(column) 58 | ax.set_title('Distribution of column "{}"'.format(column)) 59 | 60 | ax.figure.tight_layout() 61 | 62 | return fig 63 | 64 | 65 | def _set_integer_tick_labels(axis, labels): 66 | """Use labels dict to set labels on axis""" 67 | axis.set_major_formatter(FuncFormatter(lambda x, _: labels.get(x, ""))) 68 | axis.set_major_locator(MaxNLocator(integer=True)) 69 | 70 | 71 | def plot_pairdensity_mpl(ls, column1, column2): 72 | """Plot the pairwise density between two columns. 73 | 74 | This plot is an approximation of a scatterplot through a 2D Kernel 75 | Density Estimate for two numerical variables. 
When one of the variables 76 | is categorical, a 1D KDE for each of the categories is shown, 77 | normalised to the total number of non-null observations. For two 78 | categorical variables, the plot produced is a heatmap representation of 79 | the contingency table. 80 | 81 | Parameters 82 | ---------- 83 | ls : :class:`~lens.Summary` 84 | Lens `Summary`. 85 | column1 : str 86 | First column. 87 | column2 : str 88 | Second column. 89 | 90 | Returns 91 | ------- 92 | :class:`plt.Figure` 93 | Matplotlib figure containing the pairwise density plot. 94 | """ 95 | pair_details = ls.pair_details(column1, column2) 96 | pairdensity = pair_details["pairdensity"] 97 | 98 | x = np.array(pairdensity["x"]) 99 | y = np.array(pairdensity["y"]) 100 | Z = np.array(pairdensity["density"]) 101 | 102 | fig, ax = plt.subplots() 103 | 104 | if ls.summary(column1)["desc"] == "categorical": 105 | idx = np.argsort(x) 106 | x = x[idx] 107 | Z = Z[:, idx] 108 | # Create labels and positions for categorical axis 109 | x_labels = dict(enumerate(x)) 110 | _set_integer_tick_labels(ax.xaxis, x_labels) 111 | x = np.arange(-0.5, len(x), 1.0) 112 | 113 | if ls.summary(column2)["desc"] == "categorical": 114 | idx = np.argsort(y) 115 | y = y[idx] 116 | Z = Z[idx] 117 | y_labels = dict(enumerate(y)) 118 | _set_integer_tick_labels(ax.yaxis, y_labels) 119 | y = np.arange(-0.5, len(y), 1.0) 120 | 121 | X, Y = np.meshgrid(x, y) 122 | 123 | ax.pcolormesh(X, Y, Z, cmap=DEFAULT_COLORSCALE.lower()) 124 | 125 | ax.set_xlabel(column1) 126 | ax.set_ylabel(column2) 127 | 128 | ax.set_title(r"$\it{{ {} }}$ vs $\it{{ {} }}$".format(column1, column2)) 129 | 130 | return fig 131 | 132 | 133 | def plot_correlation_mpl(ls, include=None, exclude=None): 134 | """Plot the correlation matrix for numeric columns 135 | 136 | Plot a Spearman rank order correlation coefficient matrix showing the 137 | correlation between columns. The matrix is reordered to group together 138 | columns that have a higher correlation coefficient. The columns to be 139 | plotted in the correlation plot can be selected through either the 140 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 141 | given. 142 | 143 | Parameters 144 | ---------- 145 | ls : :class:`~lens.Summary` 146 | Lens `Summary`. 147 | include : list of str 148 | List of columns to include in the correlation plot. 149 | exclude : list of str 150 | List of columns to exclude from the correlation plot. 151 | 152 | Returns 153 | ------- 154 | :class:`plt.Figure` 155 | Matplotlib figure containing the pairwise density plot. 156 | """ 157 | 158 | columns, correlation_matrix = ls.correlation_matrix(include, exclude) 159 | num_cols = len(columns) 160 | 161 | if num_cols > 10: 162 | annotate = False 163 | else: 164 | annotate = True 165 | 166 | fig, ax = plt.subplots() 167 | sns.heatmap( 168 | correlation_matrix, 169 | annot=annotate, 170 | fmt=".2f", 171 | ax=ax, 172 | xticklabels=columns, 173 | yticklabels=columns, 174 | vmin=-1, 175 | vmax=1, 176 | cmap="RdBu_r", 177 | square=True, 178 | ) 179 | 180 | ax.xaxis.tick_top() 181 | 182 | # Enforces a width of 2.5 inches per cell in the plot, 183 | # unless this exceeds 10 inches. 184 | width_inches = len(columns) * 2.5 185 | while width_inches > 10: 186 | width_inches = 10 187 | 188 | fig.set_size_inches(width_inches, width_inches) 189 | 190 | return fig 191 | 192 | 193 | def plot_cdf(ls, column, N_cdf=100): 194 | """Plot the empirical cumulative distribution function of a column. 
195 | 196 | Creates a plotly plot with the empirical CDF of a column. 197 | 198 | Parameters 199 | ---------- 200 | ls : :class:`~lens.Summary` 201 | Lens `Summary`. 202 | column : str 203 | Name of the column. 204 | N_cdf : int 205 | Number of points in the CDF plot. 206 | 207 | Returns 208 | ------- 209 | :class:`~matplotlib.Axes` 210 | Matplotlib axes containing the distribution plot. 211 | """ 212 | tdigest = ls.tdigest(column) 213 | 214 | cdfs = np.linspace(0, 100, N_cdf) 215 | xs = [tdigest.percentile(p) for p in cdfs] 216 | 217 | fig, ax = plt.subplots() 218 | 219 | ax.set_ylabel("Percentile") 220 | ax.set_xlabel(column) 221 | ax.plot(xs, cdfs) 222 | 223 | if ls._report["column_summary"][column]["logtrans"]: 224 | ax.set_xscale("log") 225 | 226 | ax.set_title("Empirical Cumulative Distribution Function") 227 | 228 | return fig 229 | 230 | 231 | def plot_pairdensity(ls, column1, column2): 232 | """Plot the pairwise density between two columns. 233 | 234 | This plot is an approximation of a scatterplot through a 2D Kernel 235 | Density Estimate for two numerical variables. When one of the variables 236 | is categorical, a 1D KDE for each of the categories is shown, 237 | normalised to the total number of non-null observations. For two 238 | categorical variables, the plot produced is a heatmap representation of 239 | the contingency table. 240 | 241 | Parameters 242 | ---------- 243 | ls : :class:`~lens.Summary` 244 | Lens `Summary`. 245 | column1 : str 246 | First column. 247 | column2 : str 248 | Second column. 249 | 250 | Returns 251 | ------- 252 | :class:`plotly.Figure` 253 | Plotly figure containing the pairwise density plot. 254 | """ 255 | pair_details = ls.pair_details(column1, column2) 256 | pairdensity = pair_details["pairdensity"] 257 | 258 | x = np.array(pairdensity["x"]) 259 | y = np.array(pairdensity["y"]) 260 | Z = np.array(pairdensity["density"]) 261 | 262 | if ls.summary(column1)["desc"] == "categorical": 263 | idx = np.argsort(x) 264 | x = x[idx] 265 | Z = Z[:, idx] 266 | 267 | if ls.summary(column2)["desc"] == "categorical": 268 | idx = np.argsort(y) 269 | y = y[idx] 270 | Z = Z[idx] 271 | 272 | data = [go.Heatmap(z=Z, x=x, y=y, colorscale=DEFAULT_COLORSCALE)] 273 | layout = go.Layout(title="{} vs {}".format(column1, column2)) 274 | layout["xaxis"] = { 275 | "type": pairdensity["x_scale"], 276 | "autorange": True, 277 | "title": column1, 278 | } 279 | layout["yaxis"] = { 280 | "type": pairdensity["y_scale"], 281 | "autorange": True, 282 | "title": column2, 283 | } 284 | fig = go.Figure(data=data, layout=layout) 285 | fig.data[0]["showscale"] = False 286 | 287 | return fig 288 | 289 | 290 | def plot_correlation(ls, include=None, exclude=None): 291 | """Plot the correlation matrix for numeric columns 292 | 293 | Plot a Spearman rank order correlation coefficient matrix showing the 294 | correlation between columns. The matrix is reordered to group together 295 | columns that have a higher correlation coefficient. The columns to be 296 | plotted in the correlation plot can be selected through either the 297 | ``include`` or ``exclude`` keyword arguments. Only one of them can be 298 | given. 299 | 300 | Parameters 301 | ---------- 302 | ls : :class:`~lens.Summary` 303 | Lens `Summary`. 304 | include : list of str 305 | List of columns to include in the correlation plot. 306 | exclude : list of str 307 | List of columns to exclude from the correlation plot. 
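plot_cdf above simply samples the column's t-digest at evenly spaced percentiles; the same curve can be traced by hand from the Summary API. A sketch, assuming `summary` is a lens Summary of the wine data and "chlorides" is one of its numeric columns:

    import numpy as np
    import matplotlib.pyplot as plt

    tdigest = summary.tdigest("chlorides")
    percentiles = np.linspace(0, 100, 100)
    values = [tdigest.percentile(p) for p in percentiles]

    fig, ax = plt.subplots()
    ax.plot(values, percentiles)  # value on x, cumulative percentile on y
    ax.set_xlabel("chlorides")
    ax.set_ylabel("Percentile")
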
308 | 309 | Returns 310 | ------- 311 | :class:`plotly.Figure` 312 | Plotly figure containing the pairwise density plot. 313 | """ 314 | 315 | columns, correlation_matrix = ls.correlation_matrix(include, exclude) 316 | num_cols = len(columns) 317 | 318 | if num_cols > 10: 319 | annotate = False 320 | else: 321 | annotate = True 322 | 323 | hover_text = [] 324 | for i in range(num_cols): 325 | hover_text.append( 326 | [ 327 | "Corr({}, {}) = {:.2g}".format( 328 | columns[i], columns[j], correlation_matrix[i, j] 329 | ) 330 | for j in range(num_cols) 331 | ] 332 | ) 333 | 334 | if annotate: 335 | t = np.reshape( 336 | ["{:.2g}".format(x) for x in correlation_matrix.flatten()], 337 | correlation_matrix.shape, 338 | )[::-1].tolist() 339 | else: 340 | nrows, ncolumns = correlation_matrix.shape 341 | t = [["" for i in range(nrows)] for j in range(ncolumns)] 342 | 343 | fig = pff.create_annotated_heatmap( 344 | z=correlation_matrix.tolist()[::-1], 345 | colorscale="RdBu", 346 | x=columns, 347 | y=columns[::-1], 348 | zmin=-1.0, 349 | zmax=1.0, 350 | annotation_text=t, 351 | text=hover_text[::-1], 352 | hoverinfo="text", 353 | ) 354 | w = len(columns) * 2.5 * 72 355 | while w > 600: 356 | w /= np.sqrt(1.4) 357 | fig.layout["width"] = w 358 | fig.layout["height"] = w 359 | fig.data[0]["showscale"] = True 360 | 361 | return fig 362 | -------------------------------------------------------------------------------- /lens/summarise.py: -------------------------------------------------------------------------------- 1 | """Summarise a Pandas DataFrame""" 2 | 3 | import json 4 | import logging 5 | import os 6 | import time 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import scipy 11 | 12 | from .dask_graph import create_dask_graph 13 | from .tdigest_utils import tdigest_from_centroids 14 | from .utils import hierarchical_ordering 15 | from .version import __version__ 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.addHandler(logging.StreamHandler()) 19 | 20 | 21 | class LensSummaryError(Exception): 22 | pass 23 | 24 | 25 | class EmptyDataFrameError(Exception): 26 | pass 27 | 28 | 29 | def _validate_report(report, schema_version): 30 | """Validates a dict report""" 31 | report_schema_version = report.get("_schema_version") 32 | if ( 33 | report_schema_version is not None 34 | and report_schema_version != schema_version 35 | ): 36 | raise LensSummaryError( 37 | "The version of the report schema `{}` does " 38 | "not match the schema version `{}` supported " 39 | "by this version of lens {}.".format( 40 | report_schema_version, schema_version, __version__ 41 | ) 42 | ) 43 | 44 | columns = report["_columns"] 45 | column_props = report["column_properties"] 46 | num_cols = [col for col in columns if (column_props[col]["numeric"])] 47 | for num_col in num_cols: 48 | if ( 49 | num_col not in report["column_summary"].keys() 50 | or num_col not in report["correlation"]["_columns"] 51 | or num_col not in report["outliers"].keys() 52 | ): 53 | raise LensSummaryError( 54 | "Column `{}` is marked as numeric but " 55 | "the report lacks its numeric summary" 56 | " and correlation".format(num_col) 57 | ) 58 | 59 | cat_cols = [col for col in columns if column_props[col]["is_categorical"]] 60 | for cat_col in cat_cols: 61 | if cat_col not in report["frequencies"].keys(): 62 | raise LensSummaryError( 63 | "Column `{}` is marked as categorical but " 64 | "the report lacks its frequency analysis".format(cat_col) 65 | ) 66 | 67 | 68 | class NumpyEncoder(json.JSONEncoder): 69 | def default(self, obj): 70 
| if isinstance(obj, np.integer): 71 | return int(obj) 72 | elif isinstance(obj, np.floating): 73 | return float(obj) 74 | elif isinstance(obj, np.ndarray): 75 | return obj.tolist() 76 | else: 77 | return super(NumpyEncoder, self).default(obj) 78 | 79 | 80 | class Summary(object): 81 | """A summary of a pandas DataFrame. 82 | 83 | Create a summary instance by calling :func:`lens.summarise.summarise` on a 84 | DataFrame. This calculates several quantities of interest to data 85 | scientists. 86 | 87 | The Summary object is designed for programmatic use. For more direct 88 | visual inspection, use the :class:`lens.explorer.Explorer` class 89 | in a Jupyter notebook. 90 | 91 | """ 92 | 93 | schema_version = 1 94 | 95 | def __init__(self, report): 96 | if not isinstance(report, dict): 97 | raise TypeError("report argument must be a dict") 98 | 99 | if "_schema_version" not in report.keys(): 100 | report["_schema_version"] = self.schema_version 101 | 102 | _validate_report(report, schema_version=self.schema_version) 103 | self._report = report 104 | 105 | @staticmethod 106 | def from_json(file): 107 | """Create a Summary from a report saved in JSON format. 108 | 109 | Parameters 110 | ---------- 111 | file : str or buffer 112 | Path to file containing the JSON report or buffer from which the 113 | report can be read. 114 | 115 | Returns 116 | ------- 117 | :class:`~lens.summarise.Summary` 118 | ``Summary`` object containing the summary in the JSON file. 119 | """ 120 | if hasattr(file, "read"): 121 | report = json.load(file) 122 | else: 123 | with open(file, "r") as f: 124 | report = json.load(f) 125 | 126 | return Summary(report) 127 | 128 | def to_json(self, file=None): 129 | """Produce a JSON serialization of the report. 130 | 131 | Parameters 132 | ---------- 133 | file : str or buffer, optional 134 | File name or writeable buffer to save the JSON report. If omitted, 135 | a string containing the report will be returned. 136 | 137 | Returns 138 | ------- 139 | str 140 | JSON serialization of the summary report 141 | """ 142 | if file is None: 143 | return json.dumps( 144 | self._report, separators=(",", ":"), cls=NumpyEncoder 145 | ) 146 | else: 147 | if hasattr(file, "write"): 148 | json.dump( 149 | self._report, file, separators=(",", ":"), cls=NumpyEncoder 150 | ) 151 | else: 152 | with open(file, "w") as f: 153 | json.dump( 154 | self._report, 155 | f, 156 | separators=(",", ":"), 157 | cls=NumpyEncoder, 158 | ) 159 | 160 | @property 161 | def columns(self): 162 | """Get a list of column names of the dataset. 163 | 164 | Returns 165 | ------- 166 | list 167 | Column names 168 | 169 | Examples 170 | -------- 171 | 172 | >>> summary.columns 173 | ['fixed acidity', 174 | 'volatile acidity', 175 | 'citric acid', 176 | 'residual sugar', 177 | 'chlorides', 178 | 'free sulfur dioxide', 179 | 'total sulfur dioxide', 180 | 'density', 181 | 'pH', 182 | 'sulphates', 183 | 'alcohol', 184 | 'quality'] 185 | """ 186 | return self._report["_columns"] 187 | 188 | @property 189 | def rows(self): 190 | """Get the number of rows in the dataset. 191 | 192 | Returns 193 | ------- 194 | int 195 | Number of rows 196 | 197 | Examples 198 | -------- 199 | 200 | >>> summary.rows 201 | 4898 202 | """ 203 | return self._report["row_count"]["total"] 204 | 205 | @property 206 | def rows_unique(self): 207 | """Get the number of unique rows in the dataset. 208 | 209 | Returns 210 | ------- 211 | int 212 | Number of unique rows. 
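Reports survive a JSON round trip via the methods above; a minimal sketch, assuming `summary` is an existing Summary and that `Summary` is importable from the top-level package as the docstrings suggest:

    import io
    import lens

    buf = io.StringIO()
    summary.to_json(buf)                  # NumpyEncoder converts numpy scalars/arrays
    buf.seek(0)
    restored = lens.Summary.from_json(buf)
    assert restored.columns == summary.columns
    assert restored.rows == summary.rows
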
213 | """ 214 | return self._report["row_count"]["unique"] 215 | 216 | def _desc(self, column): 217 | """Return the inferred description of a column. 218 | 219 | Parameters 220 | ---------- 221 | column : str 222 | Column name. 223 | 224 | Returns 225 | ------- 226 | str 227 | Description of the column. 228 | """ 229 | 230 | column_props = self._report["column_properties"][column] 231 | 232 | if column_props["is_categorical"]: 233 | return "categorical" 234 | elif column_props["numeric"]: 235 | return "numeric" 236 | elif column_props["is_ID"]: 237 | return "ID_like" 238 | else: 239 | return None 240 | 241 | def summary(self, column): 242 | """Basic information about the column 243 | 244 | This returns information about the number of nulls and unique 245 | values in ``column`` as well as which type this column is. 246 | This is guaranteed to return a dictionary with the same keys 247 | for every column. 248 | 249 | The dictionary contains the following keys: 250 | 251 | ``desc`` 252 | the type of data: currently ``categorical`` or ``numeric``. 253 | Lens will calculate different quantities for this column 254 | depending on the value of ``desc``. 255 | 256 | ``dtype`` 257 | the type of data in Pandas. 258 | 259 | ``name`` 260 | column name 261 | 262 | ``notnulls`` 263 | number of non-null values in the column 264 | 265 | ``nulls`` 266 | number of null-values in the column 267 | 268 | ``unique`` 269 | number of unique values in the column 270 | 271 | 272 | Examples 273 | -------- 274 | 275 | >>> summary.summary('quality') 276 | {'desc': 'categorical', 277 | 'dtype': 'int64', 278 | 'name': 'quality', 279 | 'notnulls': 4898, 280 | 'nulls': 0, 281 | 'unique': 7} 282 | 283 | >>> summary.summary('chlorides') 284 | {'desc': 'numeric', 285 | 'dtype': 'float64', 286 | 'name': 'chlorides', 287 | 'notnulls': 4898, 288 | 'nulls': 0, 289 | 'unique': 160} 290 | 291 | Parameters 292 | ---------- 293 | column : str 294 | Column name 295 | 296 | Returns 297 | ------- 298 | dict 299 | Dictionary of summary information. 300 | """ 301 | if column not in self._report["_columns"]: 302 | raise LensSummaryError( 303 | "The data summary does not contain" 304 | " information about column `{}`.".format(column) 305 | ) 306 | 307 | column_props = self._report["column_properties"][column] 308 | 309 | summary = {"name": column, "desc": self._desc(column)} 310 | 311 | for key in ["nulls", "notnulls", "unique", "dtype"]: 312 | summary[key] = column_props[key] 313 | 314 | return summary 315 | 316 | def details(self, column): 317 | """Type-specific information for a column 318 | 319 | The `details` method returns additional information on ``column``, 320 | beyond that provided by the ``summary`` method. If ``column`` is 321 | numeric, this returns summary statistics. If it is categorical, 322 | it returns a dictionary of how often each category occurs. 
323 | 324 | Examples 325 | -------- 326 | 327 | >>> summary.details('alcohol') 328 | {'desc': 'numeric', 329 | 'iqr': 1.9000000000000004, 330 | 'max': 14.199999999999999, 331 | 'mean': 10.514267047774602, 332 | 'median': 10.4, 333 | 'min': 8.0, 334 | 'name': 'alcohol', 335 | 'std': 1.2306205677573181, 336 | 'sum': 51498.880000000005} 337 | 338 | >>> summary.details('quality') 339 | {'desc': 'categorical', 340 | 'frequencies': 341 | {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 342 | 'iqr': 1.0, 343 | 'max': 9, 344 | 'mean': 5.8779093507554103, 345 | 'median': 6.0, 346 | 'min': 3, 347 | 'name': 'quality', 348 | 'std': 0.88563857496783116, 349 | 'sum': 28790} 350 | 351 | Parameters 352 | ---------- 353 | column : str 354 | Column name 355 | 356 | Returns 357 | ------- 358 | dict 359 | Dictionary of detailed information. 360 | """ 361 | if column not in self._report["_columns"]: 362 | raise LensSummaryError( 363 | "The data summary does not contain" 364 | " information about column `{}`.".format(column) 365 | ) 366 | 367 | column_props = self._report["column_properties"][column] 368 | 369 | details = {"name": column, "desc": self._desc(column)} 370 | 371 | if column_props["is_categorical"]: 372 | details["frequencies"] = self._report["frequencies"][column] 373 | 374 | if column_props["numeric"]: 375 | column_summ = self._report["column_summary"][column] 376 | for k in ["min", "max", "mean", "median", "std", "sum", "iqr"]: 377 | details[k] = column_summ[k] 378 | return details 379 | 380 | def pair_details(self, first, second): 381 | """Get pairwise information for a column pair. 382 | 383 | The information returned depends on the types of the two columns. 384 | It may contain the following keys. 385 | 386 | correlation 387 | dictionary with the Spearman rank correlation 388 | coefficient and Pearson product-moment correlation coefficient 389 | between the columns. This is returned when both columns are 390 | numeric. 391 | 392 | pairdensity 393 | dictionary with an estimate of the pairwise 394 | density between the columns. The density is either 395 | a 2D KDE estimate if both columns are numerical, or 396 | several 1D KDE estimates if one of the columns is categorical 397 | and the other numerical (grouped by the categorical column) 398 | or a cross-tabuluation. 399 | 400 | Examples 401 | -------- 402 | 403 | >>> summary.pair_details('chlorides', 'quality') 404 | {'correlation': { 405 | 'pearson': -0.20993441094675602, 406 | 'spearman': -0.31448847828244203}, 407 | {'pairdensity': { 408 | 'density': <2d numpy array> 409 | 'x': <1d numpy array of x-values> 410 | 'y': <1d numpy array of y-values> 411 | 'x_scale': 'linear', 412 | 'y_scale': 'cat'} 413 | } 414 | 415 | >>> summary.pair_details('alcohol', 'chlorides') 416 | {'correlation': { 417 | 'pearson': -0.36018871210816106, 418 | 'spearman': -0.5708064071153713}, 419 | {'pairdensity': { 420 | 'density': <2d numpy array> 421 | 'x': <1d numpy array of x-values> 422 | 'y': <1d numpy array of y-values> 423 | 'x_scale': 'linear', 424 | 'y_scale': 'linear'} 425 | } 426 | 427 | Parameters 428 | ---------- 429 | first : str 430 | Name of the first column. 431 | second : str 432 | Name of the second column. 433 | 434 | Returns 435 | ------- 436 | dict 437 | Dictionary of pairwise information. 
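A sketch of consuming the pair_details output directly, again assuming the wine-quality summary; both columns here are numeric, so the density entry is a 2D KDE grid:

    import numpy as np
    import matplotlib.pyplot as plt

    details = summary.pair_details("alcohol", "chlorides")
    print(details["correlation"])         # {'pearson': ..., 'spearman': ...}

    density = np.asarray(details["pairdensity"]["density"])
    fig, ax = plt.subplots()
    ax.imshow(density, origin="lower", aspect="auto")  # rows follow y, columns follow x
    ax.set_xlabel("alcohol")
    ax.set_ylabel("chlorides")
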
438 | """ 439 | if first == second: 440 | raise ValueError( 441 | "Can only return the pair details of two different columns: " 442 | "received {} twice.".format(first) 443 | ) 444 | 445 | pair_details = {} 446 | 447 | # Correlation 448 | 449 | corr_report = self._report["correlation"] 450 | try: 451 | idx = [ 452 | corr_report["_columns"].index(col) for col in [first, second] 453 | ] 454 | except ValueError as e: 455 | logger.info( 456 | "No correlation information for column `{}`".format( 457 | e.args[0].split()[0] 458 | ) 459 | ) 460 | else: 461 | correlation = { 462 | k: corr_report[k][idx[0]][idx[1]] 463 | for k in ["spearman", "pearson"] 464 | } 465 | pair_details["correlation"] = correlation 466 | 467 | # Pair density / Crosstab 468 | 469 | pairdensity_report = self._report["pairdensity"] 470 | 471 | # We store pairdensity information for both first/second and 472 | # second/first in a single key in the report, so we check for both 473 | # report[first][second] and report[second][first] to find it and 474 | # transpose if necessary. 475 | try: 476 | pairdensity = pairdensity_report[first][second] 477 | scales = pairdensity["scales"] 478 | density = np.array(pairdensity["density"]) 479 | except KeyError: 480 | try: 481 | pairdensity = pairdensity_report[second][first] 482 | # Invert scale information and transpose matrix 483 | scales = pairdensity["scales"][::-1] 484 | density = np.array(pairdensity["density"]).T 485 | except KeyError: 486 | logger.info( 487 | "No pairdensity information for columns `{}`" 488 | " and `{}`".format(first, second) 489 | ) 490 | pairdensity = None 491 | 492 | if pairdensity is not None: 493 | pairdensity = { 494 | "density": density, 495 | "x": pairdensity["axes"][first], 496 | "y": pairdensity["axes"][second], 497 | "x_scale": scales[0], 498 | "y_scale": scales[1], 499 | } 500 | 501 | pair_details["pairdensity"] = pairdensity 502 | 503 | return pair_details 504 | 505 | def histogram(self, column): 506 | """ 507 | Return the histogram for `column`. 508 | 509 | This function returns a histogram for the column. The number of bins is 510 | estimated through the Freedman-Diaconis rule. 511 | 512 | Parameters 513 | ---------- 514 | 515 | column: str 516 | Name of the column 517 | 518 | Returns 519 | ------- 520 | 521 | counts: array 522 | Counts for each of the bins of the histogram. 523 | bin_edges : array 524 | Edges of the bins in the histogram. Length is ``length(counts)+1``. 525 | """ 526 | self._check_column_name(column) 527 | try: 528 | histogram = self._report["column_summary"][column]["histogram"] 529 | except KeyError: 530 | raise ValueError("{} is not a numeric column".format(column)) 531 | 532 | return [np.array(histogram[key]) for key in ["counts", "bin_edges"]] 533 | 534 | def kde(self, column): 535 | """ 536 | Return a Kernel Density Estimate for `column`. 537 | 538 | This function returns a KDE for the column. It is computed between the 539 | minimum and maximum values of the column and uses Scott's rule to 540 | compute the bandwith. 541 | 542 | Parameters 543 | ---------- 544 | 545 | column: str 546 | Name of the column 547 | 548 | Returns 549 | ------- 550 | 551 | x: array 552 | Values at which the KDE has been evaluated. 553 | y : array 554 | Values of the KDE. 
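The precomputed histogram and KDE can be pulled out and plotted without the helpers in lens.plotting; a short sketch for a numeric column, with the column name taken from the wine example:

    import matplotlib.pyplot as plt

    counts, edges = summary.histogram("chlorides")   # len(edges) == len(counts) + 1
    x, y = summary.kde("chlorides")                  # KDE evaluated on a fixed grid

    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_xlabel("chlorides")
    ax.set_ylabel("estimated density")
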
555 | """ 556 | self._check_column_name(column) 557 | try: 558 | kde = self._report["column_summary"][column]["kde"] 559 | except KeyError: 560 | raise ValueError("{} is not a numeric column".format(column)) 561 | 562 | return [np.array(kde[key]) for key in ["x", "y"]] 563 | 564 | def _tdigest_report(self, column): 565 | """ Return the list of tdigest centroids and means from report 566 | """ 567 | self._check_column_name(column) 568 | try: 569 | tdigest_list = self._report["column_summary"][column]["tdigest"] 570 | except KeyError: 571 | raise ValueError("{} is not a numeric column".format(column)) 572 | return tdigest_list 573 | 574 | def tdigest_centroids(self, column): 575 | """Get TDigest centroids and counts for column. 576 | 577 | Parameters 578 | ---------- 579 | column : str 580 | Name of the column. 581 | 582 | Returns 583 | ------- 584 | :class:`numpy.array` 585 | Means of the TDigest centroids. 586 | :class:`numpy.array` 587 | Counts for each of the TDigest centroids. 588 | """ 589 | 590 | tdigest_list = self._tdigest_report(column) 591 | xs, counts = zip(*tdigest_list) 592 | return np.array(xs), np.array(counts) 593 | 594 | def pdf(self, column): 595 | """ Approximate pdf for `column` 596 | 597 | This returns a function representing the pdf of a numeric column. 598 | 599 | Examples 600 | -------- 601 | 602 | >>> pdf = summary.pdf('chlorides') 603 | >>> min_value = summary.details('chlorides')['min'] 604 | >>> max_value = summary.details('chlorides')['max'] 605 | >>> xs = np.linspace(min_value, max_value, 200) 606 | >>> plt.plot(xs, pdf(xs)) 607 | 608 | Parameters 609 | ---------- 610 | 611 | column : str 612 | Name of the column. 613 | 614 | Returns 615 | ------- 616 | pdf: function 617 | Function representing the pdf. 618 | """ 619 | xs, counts = self.tdigest_centroids(column) 620 | return scipy.interpolate.interp1d(xs, counts) 621 | 622 | def tdigest(self, column): 623 | """Return a TDigest object approximating the distribution of a column 624 | 625 | Documentation for the TDigest class can be found at 626 | https://github.com/CamDavidsonPilon/tdigest. 627 | 628 | Parameters 629 | ---------- 630 | column : str 631 | Name of the column. 632 | 633 | Returns 634 | ------- 635 | :class:`tdigest.TDigest` 636 | TDigest instance computed from the values of the column. 637 | """ 638 | return tdigest_from_centroids(self._tdigest_report(column)) 639 | 640 | def cdf(self, column): 641 | """ Approximate cdf for `column` 642 | 643 | This returns a function representing the cdf of a numeric column. 644 | 645 | Examples 646 | -------- 647 | 648 | >>> cdf = summary.cdf('chlorides') 649 | >>> min_value = summary.details('chlorides')['min'] 650 | >>> max_value = summary.details('chlorides')['max'] 651 | >>> xs = np.linspace(min_value, max_value, 200) 652 | >>> plt.plot(xs, cdf(xs)) 653 | 654 | Parameters 655 | ---------- 656 | 657 | column : str 658 | Name of the column. 659 | 660 | Returns 661 | ------- 662 | cdf: function 663 | Function representing the cdf. 664 | """ 665 | tdigest = self.tdigest(column) 666 | return tdigest.cdf 667 | 668 | def correlation_matrix(self, include=None, exclude=None): 669 | """ Correlation matrix for numeric columns 670 | 671 | Parameters 672 | ---------- 673 | 674 | include: list of strings, optional 675 | List of numeric columns to include. Includes all columns 676 | by default. 677 | 678 | exclude: list of strings, optional 679 | List of numeric columns to exclude. Includes all columns 680 | by default. 
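The centroid representation returned by tdigest_centroids is enough to re-bin the distribution at any resolution, which is what plot_distribution does when an explicit `bins` argument is passed; a short sketch for a numeric column:

    import numpy as np

    xs, weights = summary.tdigest_centroids("chlorides")
    counts, edges = np.histogram(xs, weights=weights, bins=25)
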
681 | 682 | Returns 683 | ------- 684 | 685 | columns: list of strings 686 | List of column names 687 | 688 | correlation_matrix: 2D array of floats 689 | The correlation matrix, ordered such that 690 | ``correlation_matrix[i, j]`` is the correlation between 691 | ``columns[i]`` and ``columns[j]`` 692 | 693 | Notes 694 | ----- 695 | 696 | The columns are ordered through hierarchical clustering. Thus, 697 | neighbouring columns in the output will be more correlated. 698 | """ 699 | if include is not None and exclude is not None: 700 | raise ValueError( 701 | "Either 'include' or 'exclude' should be defined, " 702 | "but not both" 703 | ) 704 | 705 | available_columns = self._report["correlation"]["_columns"] 706 | if include is not None: 707 | non_numeric_includes = set(include) - set(available_columns) 708 | if non_numeric_includes: 709 | raise ValueError( 710 | "Only numeric columns can be included in the " 711 | "correlation plot. Columns {} are not " 712 | "numeric".format(non_numeric_includes) 713 | ) 714 | columns = include 715 | elif exclude is not None: 716 | columns = set(available_columns) - set(exclude) 717 | else: 718 | columns = available_columns 719 | columns = list(columns) 720 | 721 | # Filter the correlation matrix to select only the above columns 722 | correlation_report = self._report["correlation"] 723 | idx = [correlation_report["_columns"].index(col) for col in columns] 724 | correlation_matrix = np.array(correlation_report["spearman"])[idx][ 725 | :, idx 726 | ] 727 | 728 | return hierarchical_ordering(columns, correlation_matrix) 729 | 730 | def _check_column_name(self, column): 731 | if column not in self.columns: 732 | raise KeyError(column) 733 | 734 | 735 | def summarise( 736 | df, 737 | scheduler="multiprocessing", 738 | num_workers=None, 739 | size=None, 740 | pairdensities=True, 741 | ): 742 | """Create a Lens Summary for a Pandas DataFrame. 743 | 744 | This creates a :class:`~lens.Summary` instance containing 745 | many quantities of interest to a data scientist. 746 | 747 | Examples 748 | -------- 749 | 750 | Let's explore the wine quality dataset. 751 | 752 | >>> import pandas as pd 753 | >>> import lens 754 | >>> url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" # noqa 755 | >>> wines_df = pd.read_csv(url, sep=';') 756 | >>> summary = lens.summarise(wines_df) 757 | 758 | Now that we have a :class:`~lens.Summary` instance we can inspect 759 | the shape of the dataset 760 | 761 | >>> summary.columns 762 | ['fixed acidity', 763 | 'volatile acidity', 764 | 'citric acid', 765 | 'residual sugar', 766 | 'chlorides', 767 | 'free sulfur dioxide', 768 | 'total sulfur dioxide', 769 | 'density', 770 | 'pH', 771 | 'sulphates', 772 | 'alcohol', 773 | 'quality'] 774 | >>> summary.rows 775 | 4898 776 | 777 | So far, nothing groundbreaking. Let's look at the ``quality`` column: 778 | 779 | >>> summary.summary('quality') 780 | {'desc': 'categorical', 781 | 'dtype': 'int64', 782 | 'name': 'quality', 783 | 'notnulls': 4898, 784 | 'nulls': 0, 785 | 'unique': 7} 786 | 787 | This tells us that there are seven unique values in the quality columns, 788 | and zero null values. It also tells us that lens will treat this 789 | column as categorical. 
Let's look at this in more details: 790 | 791 | >>> summary.details('quality') 792 | {'desc': 'categorical', 793 | 'frequencies': {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 794 | 'iqr': 1.0, 795 | 'max': 9, 796 | 'mean': 5.8779093507554103, 797 | 'median': 6.0, 798 | 'min': 3, 799 | 'name': 'quality', 800 | 'std': 0.88563857496783116, 801 | 'sum': 28790} 802 | 803 | This tells us that the median wine quality is 6 and the standard deviation 804 | is less than one. Let's now get the correlation between the ``quality`` 805 | column and the ``alcohol`` column: 806 | 807 | >>> summary.pair_detail('quality', 'alcohol')['correlation'] 808 | {'pearson': 0.4355747154613688, 'spearman': 0.4403691816246831} 809 | 810 | Thus, the Spearman Rank Correlation coefficient between these two columns 811 | is 0.44. 812 | 813 | Parameters 814 | ---------- 815 | df : pd.DataFrame 816 | DataFrame to be analysed. 817 | scheduler : str, optional 818 | Dask scheduler to use. Must be one of [distributed, multiprocessing, 819 | processes, single-threaded, sync, synchronous, threading, threads]. 820 | num_workers : int or None, optional 821 | Number of workers in the pool. If the environment variable `NUM_CPUS` 822 | is set that number will be used, otherwise it will use as many workers 823 | as CPUs available in the machine. 824 | size : int, optional 825 | DataFrame size on disk, which will be added to the report. 826 | pairdensities : bool, optional 827 | Whether to compute the pairdensity estimation between all pairs of 828 | numerical columns. For most datasets, this is the most expensive 829 | computation. Default is True. 830 | 831 | Returns 832 | ------- 833 | summary : :class:`~lens.Summary` 834 | The computed data summary. 835 | """ 836 | if not isinstance(df, pd.DataFrame): 837 | raise TypeError("Can only summarise a Pandas DataFrame") 838 | 839 | if len(df.columns) == 0: 840 | raise EmptyDataFrameError("The DataFrame has no columns") 841 | 842 | if num_workers is None: 843 | try: 844 | num_workers = int(os.environ["NUM_CPUS"]) 845 | logger.debug( 846 | "Number of workers read from environment: {}".format( 847 | num_workers 848 | ) 849 | ) 850 | except ValueError: 851 | # Set to None if NUM_CPUS cannot be cast to an integer 852 | logger.warning( 853 | "Environment variable NUM_CPUS={} cannot be" 854 | " interpreted as an integer, defaulting to" 855 | " number of cores in system".format(os.environ.get("NUM_CPUS")) 856 | ) 857 | num_workers = None 858 | except KeyError: 859 | # NUM_CPUS not in environment 860 | num_workers = None 861 | 862 | kwargs = {"scheduler": scheduler} 863 | if num_workers is not None: 864 | kwargs["num_workers"] = num_workers 865 | 866 | tstart = time.time() 867 | report = create_dask_graph(df, pairdensities=pairdensities).compute( 868 | **kwargs 869 | ) 870 | report["_run_time"] = time.time() - tstart 871 | 872 | report["_lens_version"] = __version__ 873 | 874 | if size is not None: 875 | report["size"] = size 876 | 877 | return Summary(report) 878 | -------------------------------------------------------------------------------- /lens/tdigest_utils.py: -------------------------------------------------------------------------------- 1 | from tdigest.tdigest import TDigest, Centroid 2 | 3 | 4 | def tdigest_from_centroids(seq): 5 | """Create a TDigest from a list of centroid means and weights tuples 6 | 7 | Parameters 8 | ---------- 9 | 10 | seq : iterable 11 | List of tuples of length 2 that contain the centroid mean and weight 12 | from a TDigest. 
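A round-trip sketch for this helper together with centroids_from_tdigest defined just below; the sample values are arbitrary and the batch_update call assumes the tdigest package's usual API:

    from tdigest.tdigest import TDigest

    digest = TDigest()
    digest.batch_update([1.0, 2.0, 2.0, 3.0, 5.0])

    means, weights = centroids_from_tdigest(digest)
    clone = tdigest_from_centroids(zip(means, weights))
    assert abs(clone.percentile(50) - digest.percentile(50)) < 1e-9
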
13 | """ 14 | 15 | tdigest = TDigest() 16 | 17 | for mean, weight in seq: 18 | tdigest.C.insert(mean, Centroid(mean, weight)) 19 | tdigest.n += weight 20 | 21 | return tdigest 22 | 23 | 24 | def centroids_from_tdigest(tdigest): 25 | """Return centroid means and weights from a TDigest instance""" 26 | 27 | if not isinstance(tdigest, TDigest): 28 | raise ValueError("Argument must be a TDigest instance") 29 | 30 | means = [c.mean for c in tdigest.C.values()] 31 | counts = [c.count for c in tdigest.C.values()] 32 | 33 | return means, counts 34 | -------------------------------------------------------------------------------- /lens/utils.py: -------------------------------------------------------------------------------- 1 | """Plotting utils, mostly adapted from seaborn for use with TDigests.""" 2 | import numpy as np 3 | from scipy import stats 4 | from six import string_types 5 | 6 | import scipy.spatial.distance as distance 7 | import scipy.cluster.hierarchy as hierarchy 8 | 9 | 10 | def _kde_support(data, bw, gridsize, cut, clip): 11 | """Establish support for a kernel density estimate.""" 12 | support_min = max(data.min() - bw * cut, clip[0]) 13 | support_max = min(data.max() + bw * cut, clip[1]) 14 | return np.linspace(support_min, support_max, gridsize) 15 | 16 | 17 | def _scipy_univariate_kde(data, bw, gridsize, cut, clip): 18 | """Compute a univariate kernel density estimate using scipy.""" 19 | kde = stats.gaussian_kde(data, bw_method=bw) 20 | if isinstance(bw, string_types): 21 | bw = "scotts" if bw == "scott" else bw 22 | bw = getattr(kde, "%s_factor" % bw)() * np.std(data) 23 | grid = _kde_support(data, bw, gridsize, cut, clip) 24 | y = kde(grid) 25 | return grid, y 26 | 27 | 28 | def _scipy_bivariate_kde(x, y, bw, gridsize, cut, clip): 29 | """Compute a bivariate kde using scipy.""" 30 | data = np.c_[x, y] 31 | kde = stats.gaussian_kde(data.T) 32 | data_std = data.std(axis=0, ddof=1) 33 | if isinstance(bw, string_types): 34 | bw = "scotts" if bw == "scott" else bw 35 | bw_x = getattr(kde, "%s_factor" % bw)() * data_std[0] 36 | bw_y = getattr(kde, "%s_factor" % bw)() * data_std[1] 37 | elif np.isscalar(bw): 38 | bw_x, bw_y = bw, bw 39 | else: 40 | msg = ( 41 | "Cannot specify a different bandwidth for each dimension " 42 | "with the scipy backend. You should install statsmodels." 43 | ) 44 | raise ValueError(msg) 45 | x_support = _kde_support(data[:, 0], bw_x, gridsize, cut, clip[0]) 46 | y_support = _kde_support(data[:, 1], bw_y, gridsize, cut, clip[1]) 47 | xx, yy = np.meshgrid(x_support, y_support) 48 | z = kde([xx.ravel(), yy.ravel()]).reshape(xx.shape) 49 | return xx, yy, z 50 | 51 | 52 | def axis_ticklabels_overlap(labels): 53 | """Return a boolean for whether the list of ticklabels have overlaps. 54 | 55 | Parameters 56 | ---------- 57 | labels : list of ticklabels 58 | 59 | Returns 60 | ------- 61 | overlap : boolean 62 | True if any of the labels overlap. 63 | """ 64 | if not labels: 65 | return False 66 | try: 67 | bboxes = [l.get_window_extent() for l in labels] 68 | overlaps = [b.count_overlaps(bboxes) for b in bboxes] 69 | return max(overlaps) > 1 70 | except RuntimeError: 71 | # Issue on macosx backend rasies an error in the above code 72 | return False 73 | 74 | 75 | def hierarchical_ordering_indices(columns, correlation_matrix): 76 | """Return array with hierarchical cluster ordering of columns 77 | 78 | Parameters 79 | ---------- 80 | columns: iterable of str 81 | Names of columns. 
82 | correlation_matrix: np.ndarray 83 | Matrix of correlation coefficients between columns. 84 | 85 | Returns 86 | ------- 87 | indices: iterable of int 88 | Indices with order of columns 89 | """ 90 | if len(columns) > 2: 91 | pairwise_dists = distance.pdist( 92 | np.where(np.isnan(correlation_matrix), 0, correlation_matrix), 93 | metric="euclidean", 94 | ) 95 | linkage = hierarchy.linkage(pairwise_dists, method="average") 96 | dendogram = hierarchy.dendrogram( 97 | linkage, no_plot=True, color_threshold=-np.inf 98 | ) 99 | idx = dendogram["leaves"] 100 | else: 101 | idx = list(range(len(columns))) 102 | 103 | return idx 104 | 105 | 106 | def hierarchical_ordering(columns, correlation_matrix): 107 | """Reorder matrix by hierarchical clustering of columns 108 | 109 | Parameters 110 | ---------- 111 | columns: iterable of str 112 | Names of columns. 113 | correlation_matrix: np.ndarray 114 | Matrix of correlation coefficients between columns. 115 | 116 | Returns 117 | ------ 118 | columns: iterable of str 119 | Reorderd names of columns. 120 | correlation_matrix: np.ndarray 121 | Reordered matrix of correlation coefficients between columns. 122 | """ 123 | if len(columns) > 2: 124 | idx = hierarchical_ordering_indices(columns, correlation_matrix) 125 | correlation_matrix = correlation_matrix[idx, :][:, idx] 126 | columns = [columns[i] for i in idx] 127 | 128 | return columns, correlation_matrix 129 | -------------------------------------------------------------------------------- /lens/version.py: -------------------------------------------------------------------------------- 1 | """Lens version""" 2 | 3 | __version__ = "0.4.5.dev0" 4 | -------------------------------------------------------------------------------- /lens/widget.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import logging 4 | from ipywidgets import widgets 5 | from IPython.display import display 6 | from lens.plotting import ( 7 | plot_distribution, 8 | plot_cdf, 9 | plot_pairdensity_mpl, 10 | plot_correlation_mpl, 11 | ) 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.addHandler(logging.StreamHandler()) 15 | 16 | # Check whether we are in a notebook environment 17 | # this is a false positive if we are in the Jupyter console 18 | IN_NOTEBOOK = "ipykernel" in sys.modules 19 | 20 | PADDING = "10px" 21 | PLOT_HEIGHT = 400 22 | PLOT_WIDTH = 600 23 | DPI = 72 24 | 25 | 26 | def update_plot(f, args, plot_area, **kwargs): 27 | """Updates the content of an output widget with rendered function""" 28 | 29 | fig = f(*args) 30 | plot_area.clear_output() 31 | 32 | height = kwargs.get("height", PLOT_HEIGHT) 33 | width = kwargs.get("width", PLOT_WIDTH) 34 | dpi = kwargs.get("dpi", DPI) 35 | 36 | fig.set_size_inches(width / dpi, height / dpi) 37 | 38 | plot_area.layout.height = "{:.0f}px".format(height) 39 | plot_area.layout.width = "{:.0f}px".format(width) 40 | 41 | with plot_area: 42 | display(fig) 43 | 44 | 45 | def create_correlation_plot_widget(ls): 46 | """Return a widget with correlation plot. 47 | 48 | Parameters 49 | ---------- 50 | ls : :class:`~lens.Summary` 51 | Lens `Summary`. 52 | 53 | Returns 54 | ------- 55 | :class:`ipywidgets.Widget` 56 | Jupyter widget to explore correlation matrix plot. 
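The column reordering used by both correlation plots comes from hierarchical_ordering above; a small worked example with a made-up 3x3 correlation matrix:

    import numpy as np
    from lens.utils import hierarchical_ordering

    columns = ["a", "b", "c"]
    corr = np.array([[1.0, 0.1, 0.9],
                     [0.1, 1.0, 0.2],
                     [0.9, 0.2, 1.0]])

    ordered_columns, ordered_corr = hierarchical_ordering(columns, corr)
    # "a" and "c" end up adjacent because their correlation profiles are similar
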
57 | """ 58 | 59 | plot_area = widgets.Output() 60 | 61 | update_plot( 62 | plot_correlation_mpl, 63 | [ls], 64 | plot_area, 65 | height=PLOT_WIDTH, 66 | width=PLOT_WIDTH * 1.3, 67 | ) 68 | 69 | return plot_area 70 | 71 | 72 | def _update_pairdensity_plot(ls, dd1, dd2, plot_area): 73 | if dd1.value != dd2.value: 74 | update_plot( 75 | plot_pairdensity_mpl, 76 | [ls, dd1.value, dd2.value], 77 | plot_area, 78 | height=600, 79 | width=600, 80 | ) 81 | 82 | 83 | def create_pairdensity_plot_widget(ls): 84 | """Create a pairwise density widget. 85 | 86 | Parameters 87 | ---------- 88 | ls : :class:`~lens.Summary` 89 | Lens `Summary`. 90 | 91 | Returns 92 | ------- 93 | :class:`ipywidgets.Widget` 94 | Jupyter widget to explore pairdensity plots. 95 | """ 96 | numeric_columns = ls._report["column_summary"]["_columns"] 97 | dropdown1 = widgets.Dropdown(options=numeric_columns, description="First:") 98 | dropdown2 = widgets.Dropdown( 99 | options=numeric_columns, description="Second:" 100 | ) 101 | if len(numeric_columns) > 1: 102 | dropdown1.value, dropdown2.value = numeric_columns[:2] 103 | 104 | plot_area = widgets.Output() 105 | 106 | for dropdown in [dropdown1, dropdown2]: 107 | dropdown.observe( 108 | lambda x: _update_pairdensity_plot( 109 | ls, dropdown1, dropdown2, plot_area 110 | ), 111 | names="value", 112 | type="change", 113 | ) 114 | 115 | _update_pairdensity_plot(ls, dropdown1, dropdown2, plot_area) 116 | 117 | return widgets.VBox([dropdown1, dropdown2, plot_area], padding=PADDING) 118 | 119 | 120 | def _simple_columnwise_widget(ls, plot_function, columns): 121 | """Basic column-wise plot widget""" 122 | 123 | dropdown = widgets.Dropdown(options=columns, description="Column:") 124 | plot_area = widgets.Output() 125 | update_plot(plot_function, [ls, columns[0]], plot_area, height=PLOT_HEIGHT) 126 | 127 | dropdown.observe( 128 | lambda x: update_plot( 129 | plot_function, [ls, x["new"]], plot_area, height=PLOT_HEIGHT 130 | ), 131 | names="value", 132 | type="change", 133 | ) 134 | 135 | return widgets.VBox([dropdown, plot_area], padding=PADDING) 136 | 137 | 138 | def create_distribution_plot_widget(ls): 139 | """Create a distribution plot widget. 140 | 141 | Parameters 142 | ---------- 143 | ls : :class:`~lens.Summary` 144 | Lens `Summary`. 145 | 146 | Returns 147 | ------- 148 | :class:`ipywidgets.Widget` 149 | Jupyter widget to explore distribution plots. 150 | """ 151 | numeric_columns = ls._report["column_summary"]["_columns"] 152 | return _simple_columnwise_widget(ls, plot_distribution, numeric_columns) 153 | 154 | 155 | def create_cdf_plot_widget(ls): 156 | """Create a CDF plot widget. 157 | 158 | Parameters 159 | ---------- 160 | ls : :class:`~lens.Summary` 161 | Lens `Summary`. 162 | 163 | Returns 164 | ------- 165 | :class:`ipywidgets.Widget` 166 | Jupyter widget to explore CDF plots. 167 | """ 168 | numeric_columns = ls._report["column_summary"]["_columns"] 169 | return _simple_columnwise_widget(ls, plot_cdf, numeric_columns) 170 | 171 | 172 | def interactive_explore(ls): 173 | """Create a widget to visually explore a dataset summary. 174 | 175 | Note that the widget will only work when created within a Jupyter notebook. 176 | 177 | Parameters 178 | ---------- 179 | ls : :class:`~lens.Summary` 180 | Lens `Summary`. 181 | 182 | Returns 183 | ------- 184 | :class:`ipywidgets.Widget` 185 | Jupyter widget with summary plots. 
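Typical Jupyter-notebook usage for the explorer widget described above; this assumes interactive_explore is re-exported at the package level (otherwise import it from lens.widget) and that `df` is any pandas DataFrame:

    import lens

    summary = lens.summarise(df)
    lens.interactive_explore(summary)   # last expression in a cell renders the Tab widget
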
186 | """ 187 | if not IN_NOTEBOOK: 188 | message = ( 189 | "Lens interactive_explore can only be used in a" 190 | " Jupyter notebook" 191 | ) 192 | logger.error(message) 193 | raise ValueError(message) 194 | 195 | tabs = widgets.Tab() 196 | tabs.children = [ 197 | create_distribution_plot_widget(ls), 198 | create_cdf_plot_widget(ls), 199 | create_pairdensity_plot_widget(ls), 200 | create_correlation_plot_widget(ls), 201 | ] 202 | 203 | tabs.set_title(0, "Distribution") 204 | tabs.set_title(1, "CDF") 205 | tabs.set_title(2, "Pairwise density") 206 | tabs.set_title(3, "Correlation matrix") 207 | 208 | return tabs 209 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | exclude = ''' 4 | /( 5 | \.git 6 | | \.mypy_cache 7 | | \.tox 8 | | \.venv 9 | | _build 10 | | build 11 | | dist 12 | | \.eggs 13 | )/ 14 | ''' 15 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/environment.yml 3 | python: 4 | version: 3 5 | setup_py_install: true 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Placeholder file so that `pip install -r requirements.txt` uses dependencies 2 | # from setup.py 3 | . 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = LICENSE.txt 6 | 7 | [flake8] 8 | ignore = E203,W503 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from setuptools import setup 5 | 6 | 7 | def source_root_dir(): 8 | """Return the path to the root of the source distribution""" 9 | return os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def read_version(): 13 | """Read the version from the ``lens.version`` module""" 14 | filename = os.path.join(source_root_dir(), "lens/version.py") 15 | with open(filename) as fin: 16 | namespace = {} 17 | exec(fin.read(), namespace) # pylint: disable=exec-used 18 | return namespace["__version__"] 19 | 20 | 21 | with open("README.rst") as file: 22 | LONG_DESCRIPTION = file.read() 23 | 24 | setup( 25 | name="lens", 26 | version=read_version(), 27 | description="Summarise and explore Pandas DataFrames", 28 | copyright="Copyright 2017-2019, Faculty", 29 | license="Apache 2.0", 30 | url="https://github.com/facultyai/lens", 31 | author="Faculty", 32 | author_email="opensource@faculty.ai", 33 | packages=["lens"], 34 | zip_safe=False, 35 | long_description=LONG_DESCRIPTION, 36 | install_requires=[ 37 | "dask[dataframe,delayed]>=0.18.0", 38 | "ipywidgets>=6.0.0", 39 | "matplotlib", 40 | "numpy>=1.11", 41 | "pandas", 42 | "plotly>=3.0.0", 43 | "scipy", 44 | "tdigest>=0.5.0", 45 | "seaborn", 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import os 3 | import inspect 4 | 
import random 5 | import string 6 | 7 | from lens.dask_graph import create_dask_graph 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import stats 12 | import pytest 13 | 14 | np.random.seed(4712) 15 | 16 | dirname = os.path.dirname( 17 | os.path.abspath(inspect.getfile(inspect.currentframe())) 18 | ) 19 | 20 | 21 | @pytest.fixture(scope="module", params=[10, 60, 500, 2000]) 22 | def df(request): 23 | nrows = request.param 24 | n1 = np.random.randn(nrows) * 3 + 20.0 25 | n2 = np.random.randn(nrows) * 5 + 30.0 26 | poisson = np.random.poisson(10, nrows) 27 | items = [ 28 | ("normal", n1 + n2), 29 | ("normal2", n1 - n2), 30 | ("uniform", np.random.random(nrows)), 31 | ("lognormal", stats.lognorm.rvs(5, scale=10, size=nrows)), 32 | ("poisson", poisson), 33 | ("categorical13", gen_poisson_distributed_categorical_data(13, nrows)), 34 | ("categorical5", gen_uniformly_distributed_categorical_data(5, nrows)), 35 | ("categorical2", np.random.randint(0, 2, nrows)), 36 | ("categoricalint", gen_categoricalint_with_no_twos(nrows)), 37 | ("ID", ["ID{}".format(x) for x in range(int(1e3), int(1e3 + nrows))]), 38 | ("datetimes", gen_datetime_strings(nrows)), 39 | ("dates", gen_date_strings(nrows)), 40 | ("times", gen_time_strings(nrows)), 41 | ("nulls", [np.nan] * nrows), 42 | ] 43 | 44 | df = pd.DataFrame.from_dict(dict(items)) 45 | 46 | # sprinkle nrows/50 nulls 47 | ncols = len(df.columns) 48 | ii = np.random.randint(0, ncols, int((nrows * ncols) / 50)) 49 | jj = np.random.randint(0, nrows, int((nrows * ncols) / 50)) 50 | for i, j in zip(ii, jj): 51 | df.loc[j, list(df.columns)[i]] = None 52 | 53 | # No nulls in poissonint to avoid casting as floats 54 | df["poissonint"] = poisson 55 | # Add column that is strictly correlated with a float column 56 | df["normalcorr"] = n1 + n2 57 | # Add a column that has values where normal has nulls 58 | df["antinormal"] = np.where(df.normal.isnull(), n1 + n2, np.nan) 59 | 60 | df.to_csv(dirname + "/test_results/test_data.csv", index=False) 61 | 62 | return df 63 | 64 | 65 | def gen_categoricalint_with_no_twos(nrows): 66 | values = np.random.randint(0, 6, nrows) 67 | values[values == 2] = 5 68 | return values 69 | 70 | 71 | def gen_poisson_distributed_categorical_data(ncategories, size): 72 | categories = [ 73 | str(i) + "".join(random.sample(string.ascii_letters, 4)) 74 | for i in range(ncategories) 75 | ] 76 | random_samples = [ 77 | np.random.poisson(ncategories / 2.0) for i in range(size) 78 | ] 79 | truncated_random_samples = [ 80 | max(min(0, sample), ncategories - 1) for sample in random_samples 81 | ] 82 | sampled_categories = [ 83 | categories[sample] for sample in truncated_random_samples 84 | ] 85 | return sampled_categories 86 | 87 | 88 | def gen_uniformly_distributed_categorical_data(ncategories, size): 89 | categories = [ 90 | str(i) + "".join(random.sample(string.ascii_letters, 4)) 91 | for i in range(ncategories) 92 | ] 93 | random_samples = np.random.randint(0, len(categories), size=size) 94 | truncated_random_samples = [ 95 | max(min(0, sample), ncategories - 1) for sample in random_samples 96 | ] 97 | sampled_categories = [ 98 | categories[sample] for sample in truncated_random_samples 99 | ] 100 | return sampled_categories 101 | 102 | 103 | def gen_date_strings(size): 104 | datetimes = gen_datetimes(size) 105 | date_strings = [datetime.date().isoformat() for datetime in datetimes] 106 | return date_strings 107 | 108 | 109 | def gen_time_strings(size): 110 | datetimes = gen_datetimes(size) 111 | date_strings = 
[datetime.time().isoformat() for datetime in datetimes] 112 | return date_strings 113 | 114 | 115 | def gen_datetime_strings(size): 116 | datetimes = gen_datetimes(size) 117 | datetime_strings = [datetime.isoformat() for datetime in datetimes] 118 | return datetime_strings 119 | 120 | 121 | def gen_datetimes(size): 122 | timestamps = np.linspace(0, 86400 * 365 * 40, size) 123 | datetimes = [dt.datetime.fromtimestamp(ts) for ts in timestamps] 124 | return datetimes 125 | 126 | 127 | @pytest.fixture(scope="module") 128 | def report(df): 129 | # Get a dict report by not calling summarise 130 | report = create_dask_graph(df).compute(scheduler="multiprocessing") 131 | 132 | return report 133 | -------------------------------------------------------------------------------- /tests/data/test-artworks.csv: -------------------------------------------------------------------------------- 1 | Artist,Nationality,Gender,Date,Classification,Width (cm),Height (cm),Diameter (cm),Depth (cm) 2 | Otto Wagner,(Austrian),(Male),1896,Architecture,168.9,48.6,, 3 | Christian de Portzamparc,(French),(Male),1987,Architecture,29.8451,40.6401,, 4 | Emil Hoppe,(Austrian),(Male),1903,Architecture,31.8,34.3,, 5 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 6 | Emil Hoppe,(Austrian),(Male),1903,Architecture,19.1,38.4,, 7 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 8 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 9 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 10 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 11 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 12 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 13 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 14 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 15 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 16 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 17 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 18 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 19 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 20 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 21 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 22 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 23 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 24 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 25 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 26 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 27 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 28 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 29 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 30 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 31 | Bernard Tschumi,(),(Male),1976-77,Architecture,45.7,35.6,, 32 | Louis I. 
Kahn,(American),(Male),1968,Architecture,167.6,113.0,, 33 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 34 | Marcel Kammerer,(Austrian),(Male),1900,Architecture,31.4,47.9,, 35 | Bernard Tschumi,(),(Male),1978,Architecture,817.8816,60.9601,, 36 | Otto Schönthal,(Austrian),(Male),1905,Architecture,21.6,30.5,, 37 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 38 | Otto Schönthal,(Austrian),(Male),1906,Architecture,35.8,29.6,, 39 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 40 | Bernard Tschumi,(),(Male),1979,Architecture,60.9601,121.9202,, 41 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 42 | Bernard Tschumi,(),(Male),1980,Architecture,61.0,121.9,, 43 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 44 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 45 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 46 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 47 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 48 | Bernard Tschumi,(),(Male),1979,Architecture,61.0,121.9,, 49 | Bernard Tschumi,(),(Male),1980,Architecture,50.8,50.8,, 50 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 51 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.2,, 52 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 53 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 54 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 55 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 56 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 57 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 58 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 59 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 60 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 61 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 62 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 63 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 64 | Bernard Tschumi,(),(Male),1980-81,Architecture,78.7,48.3,, 65 | Hans Poelzig,(German),(Male),1918,Architecture,41.0,37.1,, 66 | Raimund Abraham,(American),(Male),1970,Architecture,135.8903,87.3127,, 67 | "Peter Eisenman, Robert Cole",(American) (),(Male) (Male),1975,Architecture,113.3477,34.9251,, 68 | "Rem Koolhaas, Madelon Vriesendorp",(Dutch) (Dutch),(Male) (Female),1987,Architecture,99.0602,63.5001,, 69 | Roger C. Ferri,(American),(Male),1979,Architecture,110.8,141.6,, 70 | Bernard Tschumi,(),(Male),1984,Architecture,94.7739,94.4564,, 71 | Roger C. Ferri,(American),(Male),1979,Architecture,110.8077,141.6053,, 72 | Bernard Tschumi,(),(Male),1986,Architecture,99.6952,69.2151,, 73 | Roger C. Ferri,(American),(Male),1979,Architecture,113.3477,112.7127,, 74 | Michael Graves,(American),(Male),1978,Architecture,30.2,30.2,, 75 | Michael Graves,(American),(Male),1978,Architecture,60.0,59.7,, 76 | Michael Graves,(American),(Male),1978,Architecture,27.3,27.6,, 77 | Michael Graves,(American),(Male),1978,Architecture,20.3,20.3,, 78 | "Aldo Rossi, Gianni Braghieri, M. Bosshard",(Italian) (Italian) (Italian),(Male) (Male) (Male),1974,Architecture,91.4,72.4,, 79 | Ludwig Mies van der Rohe,(American),(Male),n.d.,Mies van der Rohe Archive,,,, 80 | Erik Gunnar Asplund,(Swedish),(Male),c. 
1917,Architecture,34.9,23.2,, 81 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,61.9,31.1,, 82 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,61.6,31.8,, 83 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,34.0,39.4,, 84 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,62.5,57.2,, 85 | Erik Gunnar Asplund,(Swedish),(Male),1917,Architecture,62.2,56.5,, 86 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,79.4,56.8,, 87 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,79.1,57.2,, 88 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,55.2,47.9,, 89 | Erik Gunnar Asplund,(Swedish),(Male),1923,Architecture,55.5,47.9,, 90 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,29.8,33.0,, 91 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,26.7,21.0,, 92 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.3,91.4,, 93 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.0,90.8,, 94 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,93.7,90.2,, 95 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,90.8,90.8,, 96 | Erik Gunnar Asplund,(Swedish),(Male),1930,Architecture,33.0,52.7,, 97 | Erik Gunnar Asplund,(Swedish),(Male),1936,Architecture,70.5,30.5,, 98 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,51.1,39.4,, 99 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.5,, 100 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.8,30.8,, 101 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.8,, 102 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.3,30.8,, 103 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.8,30.8,, 104 | Erik Gunnar Asplund,(Swedish),(Male),1935,Architecture,83.5,30.5,, 105 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,62.9,26.0,, 106 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,46.7,19.7,, 107 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,41.9,41.9,, 108 | Erik Gunnar Asplund,(Swedish),(Male),Unknown,Architecture,101.6,65.4,, 109 | "Erik Gunnar Asplund, Sigurd Lewerentz",(Swedish) (Swedish),(Male) (Male),1937,Architecture,96.2,41.3,, 110 | Erik Gunnar Asplund,(Swedish),(Male),1937,Architecture,83.8,29.5,, 111 | Erik Gunnar Asplund,(Swedish),(Male),1937,Architecture,158.8,49.2,, 112 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,95.3,37.5,,1.3 113 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,95.9,37.5,,1.3 114 | "Paul Nelson, Oscar Nitzchke, Frantz Jourdain",(American) (American) (French),(Male) (Male) (Male),1938,Architecture,71.1,71.1,,1.3 115 | "Paul Nelson, Frantz Jourdain, Oscar Nitzchke",(American) (French) (American),(Male) (Male) (Male),1938,Architecture,127.6,71.0,, 116 | Steven Holl,(American),(Male),1977,Architecture,30.2,33.3,, 117 | Steven Holl,(American),(Male),1977,Architecture,55.9,34.3,, 118 | Erich Mendelsohn,(American),(Male),1935,Architecture,29.2,35.6,, 119 | Steven Holl,(American),(Male),1977,Architecture,75.6,55.9,, 120 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,35.9,21.6,, 121 | Paolo Soleri,(American),(Male),1958,Architecture,150.2,55.2,, 122 | Steven Holl,(American),(Male),1980,Architecture,33.0,8.3,, 123 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,35.9,20.3,, 124 | Steven Holl,(American),(Male),1984,Architecture,103.5,52.1,, 125 | Erich 
Mendelsohn,(American),(Male),Unknown,Architecture,17.8,34.9,, 126 | Steven Holl,(American),(Male),1985,Architecture,107.9502,52.0701,, 127 | Erich Mendelsohn,(American),(Male),1936,Architecture,13.6525,21.59,, 128 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 129 | Erich Mendelsohn,(American),(Male),1936,Architecture,35.8776,27.3051,, 130 | Steven Holl,(American),(Male),1986,Architecture,56.8,75.6,, 131 | Erich Mendelsohn,(American),(Male),1936,Architecture,29.2,36.2,, 132 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 133 | Erich Mendelsohn,(American),(Male),1936,Architecture,29.2101,36.8301,, 134 | Steven Holl,(American),(Male),1986,Architecture,56.5,76.2,, 135 | Erich Mendelsohn,(American),(Male),1936,Architecture,40.6401,21.9075,, 136 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 137 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,28.7338,34.2901,, 138 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 139 | Erich Mendelsohn,(American),(Male),1936,Architecture,24.1,35.2,, 140 | Steven Holl,(American),(Male),1986,Architecture,56.5,75.6,, 141 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,36.8,30.2,, 142 | Paul Rudolph,(American),(Male),1989,Architecture,164.5,37.1,, 143 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,37.5,29.8,, 144 | Paul Rudolph,(American),(Male),1949,Architecture,47.9,64.8,, 145 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,29.2,41.9,, 146 | Paul Rudolph,(American),(Male),1958–1964,Architecture,87.0,69.2,, 147 | Erich Mendelsohn,(American),(Male),Unknown,Architecture,28.5,37.5,, 148 | Erich Mendelsohn,(American),(Male),c. 1935,Architecture,25.2,21.0,, 149 | Erich Mendelsohn,(American),(Male),1935,Architecture,35.6,12.7,, 150 | Erich Mendelsohn,(American),(Male),1935,Architecture,38.0,64.0,, 151 | Erich Mendelsohn,(American),(Male),1935,Architecture,60.9601,54.6101,, 152 | Peter Cook,(British),(Male),1979,Architecture,87.0,27.9,, 153 | "Diller + Scofidio, Elizabeth Diller, Ricardo Scofidio",(American) (American) (American),() (Female) (Male),1989,Architecture,92.7,121.0,,3.8 154 | Zaha Hadid,(British),(Female),1991,Architecture,182.8804,129.5403,, 155 | Fumihiko Maki,(Japanese),(Male),1980,Architecture,14.6,20.3,, 156 | "Rem Koolhaas, Zoe Zenghelis, Elia Zenghelis, Madelon Vriesendorp",(Dutch) (British) (British) (Dutch),(Male) (Female) (Female) (Female),1975,Architecture,68.6,113.0,, 157 | Frank Lloyd Wright,(American),(Male),1941,Architecture,51.1176,78.1052,, 158 | Frank Lloyd Wright,(American),(Male),1941,Architecture,78.5,53.0,, 159 | Frank Lloyd Wright,(American),(Male),1941,Architecture,48.8951,43.1,, 160 | Frank Lloyd Wright,(American),(Male),1941,Architecture,78.8,52.7051,, 161 | Mario Bellini,(Italian),(Male),1987,Architecture,77.5,49.5,, 162 | "Venturi and Rauch, Robert Venturi, John Rauch",(American) (American) (American),() (Male) (Male),1965,Architecture,91.4,52.1,, 163 | "Venturi and Rauch, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1978,Architecture,59.6901,30.4801,, 164 | "Venturi, Rauch and Scott Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1981,Architecture,38.4176,30.4801,, 165 | "Venturi, Rauch and Scott Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1981,Architecture,34.2901,30.4801,, 166 | "Venturi, Rauch and Scott 
Brown, Robert Venturi, John Rauch, Denise Scott Brown",(American) (American) (American) (American),() (Male) (Male) (Female),1983,Architecture,71.1201,30.4801,, 167 | Emilio Ambasz,(Argentine),(Male),1975,Architecture,95.6,95.3,, 168 | Raimund Abraham,(American),(Male),1979,Architecture,83.1852,69.2151,, 169 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,400.3683,105.0927,, 170 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,44.5,30.8,, 171 | Tadao Ando,(Japanese),(Male),1985–1988,Architecture,32.3851,30.4801,, 172 | Tadao Ando,(Japanese),(Male),c. 1989-91,Architecture,84.1,29.5,, 173 | Arata Isozaki,(Japanese),(Male),1992,Architecture,102.2,62.2,, 174 | Arata Isozaki,(Japanese),(Male),1992,Architecture,94.9327,54.9276,, 175 | Arata Isozaki,(Japanese),(Male),1992,Architecture,102.2,62.2,, 176 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.0,65.0,, 177 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.1,65.1,, 178 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.1,65.1,, 179 | Arata Isozaki,(Japanese),(Male),1992,Architecture,105.0,65.0,, 180 | "Theo van Doesburg (Christian Emil Marie Küpper), Cornelis van Eesteren",(Dutch) (Dutch),(Male) (Male),1923,Architecture,57.2,57.2,, 181 | Arata Isozaki,(Japanese),(Male),1992,Architecture,84.1,30.5,, 182 | Arata Isozaki,(Japanese),(Male),1992,Architecture,84.5,30.5,, 183 | Arata Isozaki,(Japanese),(Male),1992,Architecture,49.5,30.5,, 184 | Arata Isozaki,(Japanese),(Male),1992,Architecture,49.9,30.5,, 185 | Arata Isozaki,(Japanese),(Male),1992,Architecture,85.1,30.5,, 186 | Bernard Tschumi,(),(Male),1983,Architecture,41.9101,29.5276,, 187 | Frank Lloyd Wright,(American),(Male),1915-17,Architecture,,,, 188 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 189 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.9,27.9,, 190 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,28.3,21.6,, 191 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,27.9,21.9,, 192 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 193 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 194 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.9,28.3,, 195 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 196 | Frank Lloyd Wright,(American),(Male),c. 1915-17,Architecture,21.6,27.9,, 197 | "Roberto Burle Marx, Oscar Niemeyer",(Brazilian) (Brazilian),(Male) (Male),1953,Architecture,151.1,100.3,, 198 | "Roberto Burle Marx, Oscar Niemeyer",(Brazilian) (Brazilian),(Male) (Male),1953,Architecture,151.1,99.1,, 199 | Neil M. Denari,(American),(Male),1992,Architecture,84.0,62.2,, 200 | Ludwig Mies van der Rohe,(American),(Male),1910,Architecture,240.0,141.0,, 201 | David Jacob,(American),(Male),1970,Architecture,87.3127,71.1201,,87.3127 202 | -------------------------------------------------------------------------------- /tests/multivariate_kde.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import statsmodels.api as sm 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def _KDEMultivariate( 7 | values, bandwidth=None, grid_points=None, grid_shape=None, logtrans=None 8 | ): 9 | """Multivariate kernel density estimation.""" 10 | 11 | if values.shape[0] < 3: 12 | # Return zeroes if there are too few points to do anything 13 | # useful. 
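# (statsmodels' KDEMultivariate needs a few samples before it can estimate a
# sensible bandwidth, so an all-zero density is the safest stand-in here.)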
14 | return np.zeros(grid_shape) 15 | 16 | for i, lt in enumerate(logtrans): 17 | if lt: 18 | values[:, i] = np.log10(values[:, i]) 19 | grid_points[i] = np.log10(grid_points[i]) 20 | 21 | kernel = sm.nonparametric.KDEMultivariate( 22 | data=values, var_type="cc", bw=bandwidth 23 | ) 24 | 25 | pdf = np.reshape(kernel.pdf(grid_points), grid_shape) 26 | 27 | return pdf 28 | 29 | 30 | def plot_pd_difference(hist_pd, kde_pd, filename): 31 | max_dev = np.max(np.abs(kde_pd - hist_pd)) 32 | max_dev_s = "Max dev: {:.3g}".format(max_dev) 33 | 34 | norm = np.sum((hist_pd > 1e-3) + (kde_pd > 1e-3)) / 2.0 35 | mean_dev = np.sum(np.abs(kde_pd - hist_pd)) / norm 36 | mean_dev_s = "Mean dev: {:.3g}".format(mean_dev) 37 | 38 | corr = 1 - np.corrcoef(kde_pd, hist_pd)[0][1] 39 | corr_s = "1 - Corr: {:.3g}".format(corr) 40 | 41 | f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4)) 42 | 43 | ax1.pcolormesh(hist_pd, cmap="viridis") 44 | ax1.set_aspect("equal") 45 | ax1.set_title("Smoothed Hist") 46 | 47 | ax2.pcolormesh(kde_pd, cmap="viridis") 48 | ax2.set_aspect("equal") 49 | ax2.set_title("2D KDE") 50 | 51 | diff = hist_pd - kde_pd 52 | 53 | diff_img = ax3.pcolormesh(diff * 100.0, cmap="RdBu", vmin=-10, vmax=10) 54 | ax3.set_aspect("equal") 55 | ax3.set_title("Difference") 56 | 57 | f.tight_layout() 58 | 59 | f.colorbar(diff_img, ax=[ax1, ax2, ax3], label="% difference") 60 | f.text(0.9, 0.5, "\n".join([max_dev_s, mean_dev_s, corr_s]), va="center") 61 | 62 | f.savefig(filename) 63 | plt.close(f) 64 | 65 | 66 | def _normalise_range(X): 67 | return X / np.max(X) 68 | 69 | 70 | def compute_deviation_with_kde(df, pd, filename): 71 | """ Compute mean deviation of smoothed histogram with respect to KDE """ 72 | columns = pd["_columns"] 73 | pd = pd[columns[0]][columns[1]] 74 | bw = pd["bw"] 75 | logtrans = [scale == "log" for scale in pd["scales"]] 76 | x = pd["axes"][columns[0]] 77 | y = pd["axes"][columns[1]] 78 | X, Y = np.meshgrid(x, y) 79 | grid_shape = X.shape 80 | grid_points = np.vstack([X.ravel(), Y.ravel()]) 81 | kde_pd = _KDEMultivariate( 82 | np.array(df.dropna()), 83 | bandwidth=bw, 84 | grid_points=grid_points, 85 | grid_shape=grid_shape, 86 | logtrans=logtrans, 87 | ) 88 | hist_pd = np.array(pd["density"]) 89 | 90 | # hist_pd[50] = hist_pd[50] + np.mean(hist_pd) * 0.1 91 | 92 | kde_pd = _normalise_range(kde_pd) 93 | hist_pd = _normalise_range(hist_pd) 94 | 95 | norm = np.sum((hist_pd > 1e-3) + (kde_pd > 1e-3)) / 2.0 96 | mean_dev = np.sum(np.abs(kde_pd - hist_pd)) / norm 97 | 98 | if mean_dev > 0.01: 99 | plot_pd_difference(hist_pd, kde_pd, filename) 100 | 101 | return mean_dev 102 | -------------------------------------------------------------------------------- /tests/test_explorer.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | import pytest 8 | 9 | import lens 10 | from lens.explorer import Explorer 11 | 12 | dirname = os.path.dirname( 13 | os.path.abspath(inspect.getfile(inspect.currentframe())) 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def artworks_df(): 19 | df = pd.read_csv(os.path.join(dirname, "data/test-artworks.csv")) 20 | return df 21 | 22 | 23 | @pytest.fixture(scope="module") 24 | def artworks_summary(artworks_df): 25 | summary = lens.summarise(artworks_df) 26 | return summary 27 | 28 | 29 | def test_distribution_plot(artworks_df, artworks_summary): 30 | def mock_render(fig): 31 | # check that this draws a histogram 32 | 
assert len(fig.axes[0].patches) > 0 33 | 34 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 35 | explorer.distribution_plot("Height (cm)") 36 | 37 | 38 | def test_distribution_plot_bins(artworks_df, artworks_summary): 39 | Nbins = 13 40 | 41 | def mock_render(fig): 42 | # check that this draws a histogram with Nbins bars 43 | assert len(fig.axes[0].patches) == Nbins 44 | 45 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 46 | explorer.distribution_plot("Height (cm)", bins=Nbins) 47 | 48 | 49 | def test_cdf_plot(artworks_df, artworks_summary): 50 | column = "Height (cm)" 51 | plt.cla() 52 | 53 | def mock_render(fig): 54 | ax = fig.axes[0] 55 | assert len(ax.lines) == 1 56 | line = ax.lines[0] 57 | 58 | tdigest = artworks_summary.tdigest(column) 59 | xs = [tdigest.percentile(p) for p in [0, 100]] 60 | 61 | assert line.get_xdata()[0] == xs[0] 62 | assert line.get_xdata()[-1] == xs[-1] 63 | assert line.get_ydata()[0] == 0 64 | assert line.get_ydata()[-1] == 100 65 | 66 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 67 | explorer.cdf_plot("Height (cm)") 68 | 69 | 70 | def test_cdf_plot_log_transformed(artworks_df, artworks_summary): 71 | plt.cla() 72 | 73 | def mock_render(fig): 74 | ax = fig.axes[0] 75 | assert len(ax.lines) == 1 76 | assert ax.get_xaxis().get_scale() == "log" 77 | 78 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 79 | explorer.cdf_plot("Width (cm)") 80 | 81 | 82 | def test_cdf_plot_non_numeric(artworks_summary): 83 | def mock_render(fig): 84 | assert False 85 | 86 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 87 | with pytest.raises(ValueError): 88 | explorer.cdf_plot("Nationality") 89 | 90 | 91 | def test_pairwise_density_plot(artworks_df, artworks_summary): 92 | plt.cla() 93 | 94 | def mock_render(fig): 95 | # currently pairwise_density_plot returns a plotly figure 96 | assert len(fig["data"]) == 1 97 | data = fig["data"][0] 98 | assert data["type"] == "heatmap" 99 | 100 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 101 | explorer.pairwise_density_plot("Width (cm)", "Height (cm)") 102 | 103 | 104 | def test_pairwise_density_plot_one_categorical(artworks_df, artworks_summary): 105 | plt.cla() 106 | 107 | def mock_render(fig): 108 | # currently pairwise_density_plot returns a plotly figure 109 | assert len(fig["data"]) == 1 110 | data = fig["data"][0] 111 | assert data["type"] == "heatmap" 112 | 113 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 114 | explorer.pairwise_density_plot("Nationality", "Height (cm)") 115 | explorer.pairwise_density_plot("Height (cm)", "Nationality") 116 | 117 | 118 | def test_pairwise_density_plot_both_categorical(artworks_df, artworks_summary): 119 | plt.cla() 120 | 121 | def mock_render(fig): 122 | # currently pairwise_density_plot returns a plotly figure 123 | assert len(fig["data"]) == 1 124 | data = fig["data"][0] 125 | assert data["type"] == "heatmap" 126 | 127 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 128 | explorer.pairwise_density_plot("Nationality", "Gender") 129 | 130 | 131 | def test_pairwise_density_plot_not_numeric(artworks_df, artworks_summary): 132 | plt.cla() 133 | 134 | def mock_render(fig): 135 | assert False 136 | 137 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 138 | with pytest.raises(ValueError): 139 | explorer.pairwise_density_plot("Diameter (cm)", "Nationality") 140 | 141 | 142 | def test_correlation_plot(artworks_df, artworks_summary): 143 
| plt.cla() 144 | 145 | def mock_render(fig): 146 | assert len(fig["data"]) == 1 147 | data = fig["data"][0] 148 | assert data["type"] == "heatmap" 149 | expected_columns = {"Height (cm)", "Width (cm)", "Depth (cm)"} 150 | assert set(data["y"]) == set(data["x"]) == expected_columns 151 | 152 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 153 | explorer.correlation_plot() 154 | 155 | 156 | def test_correlation_plot_annotations(artworks_df, artworks_summary): 157 | plt.cla() 158 | 159 | def mock_render(fig): 160 | assert len(fig["data"]) == 1 161 | corr = [item for row in fig["data"][0]["z"] for item in row] 162 | labels = [l["text"] for l in fig["layout"]["annotations"]] 163 | for c, l in zip(corr, labels): 164 | assert "{:.2g}".format(c) == l 165 | 166 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 167 | explorer.correlation_plot( 168 | include=["Height (cm)", "Width (cm)", "Depth (cm)"] 169 | ) 170 | explorer.correlation_plot(include=["Height (cm)", "Width (cm)"]) 171 | 172 | 173 | def test_correlation_plot_include(artworks_df, artworks_summary): 174 | plt.cla() 175 | 176 | def mock_render(fig): 177 | assert len(fig["data"]) == 1 178 | data = fig["data"][0] 179 | assert data["type"] == "heatmap" 180 | assert set(data["y"]) == set(data["x"]) == {"Height (cm)"} 181 | 182 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 183 | explorer.correlation_plot(include=["Height (cm)"]) 184 | 185 | 186 | def test_correlation_plot_exclude(artworks_df, artworks_summary): 187 | plt.cla() 188 | 189 | def mock_render(fig): 190 | assert len(fig["data"]) == 1 191 | data = fig["data"][0] 192 | assert data["type"] == "heatmap" 193 | expected_columns = {"Height (cm)", "Depth (cm)"} 194 | assert set(data["y"]) == set(data["x"]) == expected_columns 195 | 196 | explorer = Explorer(artworks_summary, plot_renderer=mock_render) 197 | explorer.correlation_plot(exclude=["Width (cm)"]) 198 | -------------------------------------------------------------------------------- /tests/test_summarise.py: -------------------------------------------------------------------------------- 1 | import json 2 | import inspect 3 | import os 4 | import itertools 5 | import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import pytest 11 | 12 | from lens import summarise, metrics, __version__ 13 | from lens.summarise import EmptyDataFrameError, NumpyEncoder 14 | from lens.dask_graph import _join_dask_results 15 | from lens.metrics import CAT_FRAC_THRESHOLD 16 | 17 | from multivariate_kde import compute_deviation_with_kde 18 | 19 | dirname = os.path.dirname( 20 | os.path.abspath(inspect.getfile(inspect.currentframe())) 21 | ) 22 | 23 | test_results_dir = dirname + "/test_results" 24 | 25 | if not os.path.exists(test_results_dir): 26 | os.mkdir(test_results_dir) 27 | 28 | 29 | def test_dask_row_count(df): 30 | rc_report = metrics.row_count(df) 31 | assert rc_report["total"] == len(df) 32 | assert rc_report["unique"] == len(df.drop_duplicates().index) 33 | 34 | # test serialization 35 | json.dumps({"row_count": rc_report}, cls=NumpyEncoder) 36 | 37 | 38 | def test_zero_rows_dataframe(): 39 | columns = sorted(["a", "b", "c", "d"]) 40 | df = pd.DataFrame(columns=columns) 41 | report = summarise(df)._report 42 | assert sorted(report["_columns"]) == columns 43 | for column in columns: 44 | props = report["column_properties"][column] 45 | assert props["nulls"] == 0 46 | assert props["notnulls"] == 0 47 | assert props["unique"] == 0 48 | 49 | 50 | def 
test_one_row_dataframe(): 51 | items = [ 52 | ("a", [1]), 53 | ("b", [-0.5]), 54 | ("c", ["hello"]), 55 | ("d", [datetime.datetime.now()]), 56 | ] 57 | columns = sorted([item[0] for item in items]) 58 | df = pd.DataFrame.from_dict(dict(items)) 59 | report = summarise(df)._report 60 | assert sorted(report["_columns"]) == columns 61 | column_properties = report["column_properties"] 62 | for column in columns: 63 | props = column_properties[column] 64 | assert props["nulls"] == 0 65 | assert props["notnulls"] == 1 66 | assert props["unique"] == 1 67 | assert column_properties["a"]["dtype"] == "int64" 68 | assert column_properties["b"]["dtype"] == "float64" 69 | assert column_properties["c"]["dtype"] == "object" 70 | assert column_properties["d"]["dtype"] == "datetime64[ns]" 71 | column_summary = report["column_summary"] 72 | assert column_summary["a"]["max"] == 1 73 | assert column_summary["a"]["min"] == 1 74 | assert column_summary["a"]["mean"] == 1.0 75 | assert column_summary["a"]["median"] == 1.0 76 | assert column_summary["a"]["iqr"] == 0.0 77 | 78 | assert column_summary["b"]["max"] == -0.5 79 | assert column_summary["b"]["min"] == -0.5 80 | assert column_summary["b"]["median"] == -0.5 81 | assert column_summary["b"]["mean"] == -0.5 82 | 83 | 84 | @pytest.fixture(scope="module") 85 | def column_properties(df): 86 | cols = df.columns 87 | cps = {col: metrics.column_properties(df[col]) for col in cols} 88 | 89 | return cps 90 | 91 | 92 | @pytest.fixture(scope="module") 93 | def column_summary(df, column_properties): 94 | cols = df.columns 95 | cs = { 96 | col: metrics.column_summary(df[col], column_properties[col]) 97 | for col in cols 98 | } 99 | return cs 100 | 101 | 102 | def test_dask_column_properties(column_properties): 103 | # Only worth checking that we determine categorical columns 104 | # correctly if there are enough rows in the dataframe. 105 | # There are 13 distinct categories. 
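# lens appears to flag a column as categorical when its distinct values are
# only a small fraction (CAT_FRAC_THRESHOLD) of its non-null rows, so the
# assertion below is only safe once ``notnulls`` comfortably exceeds
# 2 * 13 / CAT_FRAC_THRESHOLD.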
106 | categorical13_props = column_properties["categorical13"]["categorical13"] 107 | row_threshold = 2 * 13.0 / CAT_FRAC_THRESHOLD 108 | if categorical13_props["notnulls"] > row_threshold: 109 | assert categorical13_props["is_categorical"] 110 | 111 | # test serialization 112 | joined = _join_dask_results(column_properties.values()).compute() 113 | json.dumps({"column_summary": joined}, cls=NumpyEncoder) 114 | 115 | 116 | def test_dask_column_summary(df, column_summary): 117 | for col in df.columns: 118 | series = df[col] 119 | cs_report = column_summary[col] 120 | 121 | if cs_report is None or series.isnull().sum() == len(df.index): 122 | continue 123 | 124 | else: 125 | cs_report = cs_report[col] 126 | 127 | # Test that only lognormal is set to log transform 128 | # Only run this test if the column has enough valid 129 | # values 130 | if len(df.index) >= 50: 131 | if col == "lognormal": 132 | assert cs_report["logtrans"] 133 | else: 134 | assert not cs_report["logtrans"] 135 | 136 | _percs = list(cs_report["percentiles"].keys()) 137 | _percs.sort() 138 | cs_report_perc = [cs_report["percentiles"][p] for p in _percs] 139 | exact_perc = np.nanpercentile(series, _percs) 140 | np.testing.assert_allclose( 141 | cs_report_perc, exact_perc, rtol=1e-3, atol=1e-3 142 | ) 143 | 144 | exact_meanminmax = [ 145 | np.nanmean(series.get_values()), 146 | np.nanmin(series.get_values()), 147 | np.nanmax(series.get_values()), 148 | ] 149 | rep_meanminmax = [cs_report[x] for x in ["mean", "min", "max"]] 150 | np.testing.assert_allclose( 151 | exact_meanminmax, rep_meanminmax, rtol=1e-3, atol=0.01 152 | ) 153 | 154 | # test histogram 155 | histogram = cs_report["histogram"] 156 | 157 | assert np.sum(histogram["counts"]) == series.notnull().sum() 158 | if cs_report["n"] > 1 and not np.all(np.mod(series.dropna(), 1) == 0): 159 | # Bin edges for single-count histograms are not relevant, and 160 | # integer-only histograms not bounded by extremes in distribution 161 | assert np.allclose(histogram["bin_edges"][0], series.min()) 162 | assert np.allclose(histogram["bin_edges"][-1], series.max()) 163 | 164 | if col == "categoricalint": 165 | # Check that bins are set correctly for integers 166 | # we are removing the twos so there should be at least one empty 167 | # bin in the histogram 168 | n_unique = series.dropna().unique().size 169 | assert len(histogram["counts"]) >= n_unique 170 | assert len(histogram["bin_edges"]) == len(histogram["counts"]) + 1 171 | 172 | # Check that the bin that contains 2 is set to 0 173 | idx = np.where(np.array(histogram["bin_edges"]) < 2)[0][-1] 174 | assert histogram["counts"][idx] == 0 175 | 176 | assert np.allclose( 177 | histogram["bin_edges"][0], series.dropna().min() - 0.5 178 | ) 179 | assert np.allclose( 180 | histogram["bin_edges"][-1], series.dropna().max() + 0.5 181 | ) 182 | 183 | # test kde 184 | kde = cs_report["kde"] 185 | 186 | assert np.all(~np.isnan(kde["x"])) 187 | assert np.all(~np.isnan(kde["y"])) 188 | 189 | if "categorical" not in col and np.sum(kde["y"]) > 0: 190 | assert np.allclose(np.trapz(kde["y"], kde["x"]), 1) 191 | 192 | if col == "normal": 193 | mean = cs_report["mean"] 194 | kde_max = kde["x"][np.argmax(kde["y"])] 195 | assert np.allclose(kde_max, mean, atol=5, rtol=0.1) 196 | 197 | # test serialization 198 | joined = _join_dask_results(column_summary.values()).compute() 199 | json.dumps({"column_summary": joined}, cls=NumpyEncoder) 200 | 201 | 202 | def test_dask_outliers(df, column_summary): 203 | reps = [] 204 | for col in df.columns: 205 | 
reps.append(metrics.outliers(df[col], column_summary[col])) 206 | 207 | # test serialization 208 | joined = _join_dask_results(reps).compute() 209 | json.dumps({"outliers": joined}, cls=NumpyEncoder) 210 | 211 | 212 | @pytest.fixture(scope="module") 213 | def frequencies(df, column_properties): 214 | return { 215 | col: metrics.frequencies(df[col], column_properties[col]) 216 | for col in df.columns 217 | } 218 | 219 | 220 | def test_dask_frequencies(df, frequencies): 221 | for col in frequencies.keys(): 222 | freq_report = frequencies[col] 223 | if freq_report is None: 224 | continue 225 | else: 226 | freq_report = freq_report[col] 227 | 228 | freqs = df[col].value_counts().to_dict() 229 | 230 | for k in freqs.keys(): 231 | assert freqs[k] == freq_report[k] 232 | 233 | # test serialization 234 | joined = _join_dask_results(frequencies.values()).compute() 235 | json.dumps({"freqs": joined}, cls=NumpyEncoder) 236 | 237 | 238 | def test_dask_correlation(df, column_properties): 239 | cp = _join_dask_results(column_properties.values()).compute() 240 | rep = metrics.correlation(df, cp) 241 | cols = rep["_columns"] 242 | sp = np.array(rep["spearman"]) 243 | order = rep["order"] 244 | 245 | assert len(order) == len(cols) 246 | assert sp.shape[0] == len(cols) 247 | assert sp.shape[1] == len(cols) 248 | 249 | # test serialization 250 | json.dumps({"correlation": rep}, cls=NumpyEncoder) 251 | 252 | 253 | def test_dask_pairdensity(df, column_properties, column_summary, frequencies): 254 | pds = [] 255 | for col1, col2 in itertools.combinations(df.columns, 2): 256 | cp = {k: column_properties[k] for k in [col1, col2]} 257 | cs = {k: column_summary[k] for k in [col1, col2]} 258 | fr = {k: frequencies[k] for k in [col1, col2]} 259 | pd = metrics.pairdensity(df[[col1, col2]], cp, cs, fr) 260 | if pd is not None: 261 | if should_pair_density_norm_be_finite(df[[col1, col2]], cp): 262 | if ( 263 | not cp[col1][col1]["is_categorical"] 264 | and not cp[col2][col2]["is_categorical"] 265 | and "poisson" not in col1 266 | and "poisson" not in col2 267 | ): 268 | filename = "{}/{}_{}_{}_pd_diff.png".format( 269 | test_results_dir, len(df.index), col1, col2 270 | ) 271 | mean_dev = compute_deviation_with_kde( 272 | df[[col1, col2]], pd, filename 273 | ) 274 | assert mean_dev < 0.02 275 | assert ( 276 | np.sum(pd[col1][col2]["density"]) > 0 277 | ), "Failed on columns {} - {}".format(col1, col2) 278 | 279 | pds.append(pd) 280 | 281 | joined = _join_dask_results(pds).compute() 282 | 283 | # test serialization 284 | json.dumps({"pairdensity": joined}, cls=NumpyEncoder) 285 | 286 | 287 | def should_pair_density_norm_be_finite(df, column_properties): 288 | col1, col2 = df.columns 289 | valid_rows = df.dropna().index 290 | is_col1_categorical = column_properties[col1][col1]["is_categorical"] 291 | is_col2_categorical = column_properties[col2][col2]["is_categorical"] 292 | if is_col1_categorical and is_col2_categorical: 293 | return len(valid_rows) >= 1 294 | elif is_col1_categorical: 295 | n_distinct = column_properties[col1][col1]["unique"] 296 | return len(valid_rows) >= (n_distinct * 2) 297 | elif is_col2_categorical: 298 | n_distinct = column_properties[col2][col2]["unique"] 299 | return len(valid_rows) >= (n_distinct * 2) 300 | else: 301 | return len(valid_rows) >= 3 302 | 303 | 304 | def serialize_full_report(dreport, fname=None): 305 | # test that it can be serialized as json 306 | try: 307 | if fname is None: 308 | json.dumps(dreport, cls=NumpyEncoder) 309 | else: 310 | with open(fname, "w") as f: 311 | 
json.dump(dreport, f, indent=2) 312 | except TypeError: 313 | # Nail down which metric is failing 314 | for k in dreport.keys(): 315 | try: 316 | json.dumps({k: dreport[k]}, cls=NumpyEncoder) 317 | except TypeError as e: 318 | raise TypeError( 319 | "Metric {} is not JSON serializable: {}".format(k, e) 320 | ) 321 | 322 | 323 | def test_dask_compute_graph_default(report): 324 | fname = "{}/test_results/report_test_data.json".format(dirname) 325 | 326 | serialize_full_report(report, fname=fname) 327 | 328 | 329 | @pytest.mark.parametrize( 330 | "scheduler,num_workers,pairdensities", 331 | [ 332 | ("sync", None, True), 333 | ("multiprocessing", 2, True), 334 | ("threading", None, True), 335 | ("multiprocessing", 4, False), 336 | ], 337 | ) 338 | def test_dask_compute_graph(df, scheduler, num_workers, pairdensities): 339 | dreport = summarise( 340 | df, 341 | scheduler=scheduler, 342 | num_workers=num_workers, 343 | pairdensities=pairdensities, 344 | )._report 345 | fname = None 346 | if scheduler == "multiprocessing" and num_workers is None: 347 | fname = "{}/test_results/report_test_data_{}.json".format( 348 | dirname, "mp" 349 | ) 350 | assert dreport["_lens_version"] == __version__ 351 | if not pairdensities: 352 | assert dreport["pairdensity"] == {"_columns": [], "_run_time": 0.0} 353 | 354 | serialize_full_report(dreport, fname=fname) 355 | 356 | 357 | def test_empty_df(): 358 | empty_df = pd.DataFrame() 359 | with pytest.raises(EmptyDataFrameError): 360 | summarise(empty_df) 361 | 362 | 363 | @pytest.fixture 364 | def small_df(): 365 | N = 100 366 | df = pd.DataFrame.from_dict( 367 | {"foo": np.random.randn(N), "bar": np.random.randint(10, size=N)} 368 | ) 369 | return df 370 | 371 | 372 | def test_string_num_cpus_env(small_df, monkeypatch): 373 | monkeypatch.setenv("NUM_CPUS", "not-an-int") 374 | ls = summarise(small_df) 375 | assert set(ls._report["_columns"]) == set(small_df.columns) 376 | 377 | 378 | def test_int_num_cpus_env(small_df, monkeypatch): 379 | num_cpus_env = 2 380 | monkeypatch.setenv("NUM_CPUS", str(num_cpus_env)) 381 | ls = summarise(small_df) 382 | assert set(ls._report["_columns"]) == set(small_df.columns) 383 | -------------------------------------------------------------------------------- /tests/test_summary_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import inspect 3 | 4 | import pytest 5 | import numpy as np 6 | import numpy.testing 7 | import scipy.stats 8 | import pandas as pd 9 | import json 10 | 11 | from lens import Summary, summarise 12 | 13 | dirname = os.path.dirname( 14 | os.path.abspath(inspect.getfile(inspect.currentframe())) 15 | ) 16 | 17 | 18 | @pytest.fixture(scope="function") 19 | def ls(report): 20 | return Summary(report) 21 | 22 | 23 | # VZ: I have not managed to get the below test not to mutate the report 24 | # fixture, so subsequent tests fail if this is run. Disabling for now. 
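# A likely culprit: ``report.copy()`` in the disabled test is a shallow copy,
# so popping keys from the nested metric dicts still mutates the shared
# fixture. An untested sketch of a fix would be a deep copy instead:
#
#     import copy
#     r = copy.deepcopy(report)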
25 | # def test_report_validation(report): 26 | # # load it into the class 27 | # Summary(report) 28 | # r = report.copy() 29 | # 30 | # # Test that it fails on missing data 31 | # for metric in ['frequencies', 'column_summary', 'outliers']: 32 | # r[metric].pop(r[metric]['_columns'][0]) 33 | # with pytest.raises(LensSummaryError): 34 | # Summary(r) 35 | 36 | 37 | def test_columns_method(report, ls): 38 | assert set(ls.columns) == set(report["_columns"]) 39 | 40 | 41 | def test_row_count_method(report, ls): 42 | assert report["row_count"]["total"] == ls.rows 43 | assert report["row_count"]["unique"] == ls.rows_unique 44 | 45 | 46 | def test_summary_method(report, ls): 47 | for col in ls.columns: 48 | summary = ls.summary(col) 49 | assert summary["name"] == col 50 | for k in ["nulls", "notnulls", "unique", "dtype"]: 51 | assert summary[k] == report["column_properties"][col][k] 52 | 53 | assert (summary["desc"] == "categorical") == report[ 54 | "column_properties" 55 | ][col]["is_categorical"] 56 | 57 | assert (summary["desc"] == "numeric") == ( 58 | report["column_properties"][col]["numeric"] 59 | and not report["column_properties"][col]["is_categorical"] 60 | ) 61 | 62 | 63 | def test_numeric_details(report, ls): 64 | num_cols = [ 65 | col 66 | for col in ls.columns 67 | if report["column_properties"][col]["numeric"] 68 | ] 69 | 70 | metrics = ["min", "max", "mean", "median", "std", "sum"] 71 | 72 | for col in num_cols: 73 | details = ls.details(col) 74 | for m in metrics: 75 | if not np.isnan(details[m]): 76 | assert details[m] == report["column_summary"][col][m] 77 | 78 | 79 | def test_categorical_details(report, ls): 80 | cat_cols = [ 81 | col 82 | for col in ls.columns 83 | if report["column_properties"][col]["is_categorical"] 84 | ] 85 | 86 | for col in cat_cols: 87 | details = ls.details(col) 88 | for category in report["frequencies"][col].keys(): 89 | assert ( 90 | details["frequencies"][category] 91 | == report["frequencies"][col][category] 92 | ) 93 | 94 | 95 | def test_histogram(report, ls): 96 | num_cols = [ 97 | col 98 | for col in ls.columns 99 | if report["column_properties"][col]["numeric"] 100 | ] 101 | 102 | for col in num_cols: 103 | histogram = ls.histogram(col) 104 | for key, actual in zip(["counts", "bin_edges"], histogram): 105 | assert np.allclose( 106 | report["column_summary"][col]["histogram"][key], actual 107 | ) 108 | 109 | 110 | def test_kde(report, ls): 111 | num_cols = [ 112 | col 113 | for col in ls.columns 114 | if report["column_properties"][col]["numeric"] 115 | ] 116 | 117 | for col in num_cols: 118 | kde = ls.kde(col) 119 | for key, actual in zip(["x", "y"], kde): 120 | assert np.allclose( 121 | report["column_summary"][col]["kde"][key], actual 122 | ) 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "col1, col2", 127 | [ 128 | ("normal", "poisson"), 129 | ("normal", "lognormal"), 130 | ("normal", "categorical5"), 131 | ("categorical5", "categorical13"), 132 | ], 133 | ) 134 | def test_pair_details_pairdensity(report, ls, col1, col2): 135 | details = ls.pair_details(col1, col2) 136 | 137 | for col, key in zip([col1, col2], ["x", "y"]): 138 | if col in report["column_summary"].keys(): 139 | # Test that logtrans matches scale. 140 | assert report["column_summary"][col]["logtrans"] == ( 141 | details["pairdensity"][key + "_scale"] == "log" 142 | ) 143 | # Test that min/max match range of coordinates. 
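# The density grid's lower edge should coincide with the column minimum,
# while its upper edge may stop short of the maximum, hence the one-sided
# bound used for "max" below.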
144 | assert np.allclose( 145 | report["column_summary"][col]["min"], 146 | np.min(details["pairdensity"][key]), 147 | ) 148 | assert ( 149 | np.max(details["pairdensity"][key]) 150 | <= report["column_summary"][col]["max"] 151 | ) 152 | 153 | details_transposed = ls.pair_details(col2, col1) 154 | assert np.allclose( 155 | details["pairdensity"]["density"], 156 | details_transposed["pairdensity"]["density"].T, 157 | ) 158 | 159 | 160 | @pytest.mark.parametrize( 161 | "col1, col2", [("normal", "poisson"), ("normal", "lognormal")] 162 | ) 163 | def test_pair_details_correlation(report, ls, col1, col2): 164 | details = ls.pair_details(col1, col2) 165 | details_transposed = ls.pair_details(col1, col2) 166 | idx = [ 167 | report["correlation"]["_columns"].index(col) for col in [col1, col2] 168 | ] 169 | 170 | for coeff in ["spearman", "pearson"]: 171 | assert np.allclose( 172 | report["correlation"][coeff][idx[0]][idx[1]], 173 | details["correlation"][coeff], 174 | ) 175 | assert np.allclose( 176 | details["correlation"][coeff], 177 | details_transposed["correlation"][coeff], 178 | ) 179 | 180 | 181 | def test_pair_details_empty(ls): 182 | # Test that non-numeric pairs return an empty dict without raising 183 | # exceptions. 184 | details = ls.pair_details("normal", "datetimes") 185 | assert len(details.keys()) == 0 186 | 187 | 188 | def test_pair_details_same_column(ls): 189 | with pytest.raises(ValueError): 190 | ls.pair_details("normal", "normal") 191 | 192 | 193 | @pytest.mark.parametrize( 194 | "col1, col2", [("normal", "lognormal"), ("normal", "normal")] 195 | ) 196 | def test_correlation_matrix(report, ls, col1, col2): 197 | columns, correlation_matrix = ls.correlation_matrix() 198 | index_column1 = columns.index(col1) 199 | index_column2 = columns.index(col2) 200 | correlation_value = ( 201 | 1 202 | if col1 == col2 203 | else (ls.pair_details(col1, col2)["correlation"]["spearman"]) 204 | ) 205 | assert ( 206 | correlation_matrix[index_column1, index_column2] 207 | == correlation_matrix[index_column2, index_column1] 208 | == correlation_value 209 | ) 210 | 211 | 212 | def test_correlation_matrix_one_column(): 213 | column_values = np.random.ranf(size=200) 214 | df = pd.DataFrame.from_dict({"a": column_values}) 215 | summary = summarise(df) 216 | columns, correlation_matrix = summary.correlation_matrix() 217 | assert columns == ["a"] 218 | assert correlation_matrix.shape == (1, 1) 219 | numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0) 220 | 221 | 222 | def test_correlation_matrix_two_columns(): 223 | column1_values = np.random.ranf(size=200) 224 | column2_values = np.random.ranf(size=200) 225 | df = pd.DataFrame.from_dict({"a": column1_values, "b": column2_values}) 226 | summary = summarise(df) 227 | columns, correlation_matrix = summary.correlation_matrix() 228 | assert sorted(columns) == ["a", "b"] 229 | numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0) 230 | numpy.testing.assert_approx_equal(correlation_matrix[1, 1], 1.0) 231 | off_diagonal_term = scipy.stats.spearmanr(df["a"], df["b"]).correlation 232 | numpy.testing.assert_approx_equal( 233 | correlation_matrix[1, 0], off_diagonal_term 234 | ) 235 | numpy.testing.assert_approx_equal( 236 | correlation_matrix[0, 1], off_diagonal_term 237 | ) 238 | 239 | 240 | def test_correlation_matrix_three_columns(): 241 | column_values = [np.random.ranf(size=200) for i in range(3)] 242 | column_headers = ["a", "b", "c"] 243 | df = pd.DataFrame.from_dict(dict(zip(column_headers, column_values))) 244 | summary = 
summarise(df) 245 | columns, correlation_matrix = summary.correlation_matrix() 246 | assert sorted(columns) == column_headers 247 | 248 | for i, first_column in enumerate(columns): 249 | for j, second_column in enumerate(columns): 250 | expected = scipy.stats.spearmanr( 251 | df[first_column], df[second_column] 252 | ).correlation 253 | actual = correlation_matrix[i, j] 254 | numpy.testing.assert_approx_equal(expected, actual) 255 | 256 | 257 | def test_json_roundtrip(ls): 258 | # Run reference report through JSON roundtrip for comparison 259 | original_report = json.loads(json.dumps(ls._report)) 260 | string_report = json.loads(ls.to_json()) 261 | 262 | filename = "test-report.json" 263 | 264 | # Test filename roundtrip 265 | ls.to_json(filename) 266 | file_report = Summary.from_json(filename)._report 267 | 268 | # Test buffer roundtrip 269 | with open(filename, "w") as f: 270 | ls.to_json(f) 271 | 272 | with open(filename, "r") as f: 273 | buffer_report = Summary.from_json(f)._report 274 | 275 | os.remove(filename) 276 | 277 | for json_report in [string_report, file_report, buffer_report]: 278 | diffs = find_diff(original_report, json_report) 279 | for diff in diffs: 280 | print(diff) 281 | 282 | assert len(diffs) == 0 283 | 284 | 285 | def find_diff(d1, d2, exclude=[], path="", update_path=True): 286 | diffs = [] 287 | for k in d1.keys(): 288 | if k in exclude: 289 | continue 290 | 291 | if k not in d2: 292 | msg = "{} :\n {} as key not in d2".format(path, k) 293 | diffs.append(msg) 294 | else: 295 | new_path = path 296 | if update_path: 297 | if new_path == "": 298 | new_path = k 299 | else: 300 | new_path = new_path + "->" + k 301 | 302 | if isinstance(d1[k], dict): 303 | diffs = diffs + find_diff(d1[k], d2[k], exclude, new_path) 304 | elif isinstance(d1[k], list): 305 | # convert the list to a dict using the index as the key. 
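# (``update_path=False`` below keeps mismatches reported against the list's
# own path rather than against each numeric index)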
306 | diffs = diffs + find_diff( 307 | list_to_dict(d1[k]), 308 | list_to_dict(d2[k]), 309 | exclude, 310 | new_path, 311 | False, 312 | ) 313 | else: 314 | a = d1[k] 315 | b = d2[k] 316 | if not isinstance(a, float) or not ( 317 | np.isnan(a) and np.isnan(b) 318 | ): 319 | if isinstance(a, float): 320 | if not np.allclose(a, b): 321 | msg = "{} :\n - {} : {}\n + {} : {}".format( 322 | path, k, a, k, b 323 | ) 324 | diffs.append(msg) 325 | elif a != b: 326 | msg = "{} :\n - {} : {}\n + {} : {}".format( 327 | path, k, a, k, b 328 | ) 329 | diffs.append(msg) 330 | 331 | return diffs 332 | 333 | 334 | def list_to_dict(list_): 335 | dict_ = {} 336 | for index, item in enumerate(list_): 337 | dict_[index] = item 338 | 339 | return dict_ 340 | 341 | 342 | # Tolerances for N=10k, taken from the TDigest test suite 343 | tdigest_tol = {50: 0.02, 25: 0.015, 10: 0.01, 1: 0.005, 0.1: 0.001} 344 | 345 | for k in list(tdigest_tol.keys()): 346 | tdigest_tol[100 - k] = tdigest_tol[k] 347 | 348 | 349 | @pytest.mark.parametrize("column", ["normal", "lognormal", "poisson"]) 350 | def test_summary_cdf(ls, column): 351 | cdf = ls.cdf(column) 352 | 353 | # Set tolerance based on number of rows 354 | for p in ls._report["column_summary"][column]["percentiles"]: 355 | tol = tdigest_tol[p] * np.sqrt(10000 / ls.rows) 356 | x = ls._report["column_summary"][column]["percentiles"][p] 357 | assert np.allclose(p / 100.0, cdf(x), atol=tol, rtol=1) 358 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py36, py37, flake8, black 3 | toxworkdir = {env:TOX_WORK_DIR:.tox} 4 | 5 | [testenv] 6 | sitepackages = False 7 | setenv = 8 | MPLBACKEND = Agg 9 | deps = 10 | boto3 11 | cloudpickle 12 | flake8 13 | pytest 14 | s3fs 15 | statsmodels 16 | toolz 17 | pytest 18 | commands = py.test {posargs:-v --ignore=it} 19 | 20 | [testenv:flake8] 21 | skip_install = True 22 | deps = 23 | flake8 24 | commands = 25 | flake8 lens tests 26 | 27 | [testenv:black] 28 | skip_install = True 29 | deps = 30 | black==18.9b0 31 | commands = 32 | black {posargs:--check setup.py lens tests} 33 | --------------------------------------------------------------------------------