├── dataprep ├── tests │ ├── __init__.py │ ├── data_connector │ │ ├── __init__.py │ │ └── test_integration.py │ └── eda │ │ ├── __init__.py │ │ ├── test.py │ │ ├── test_plot_missing.py │ │ ├── test_plot.py │ │ ├── test_report.py │ │ └── test_plot_correlation.py ├── eda │ ├── outlier │ │ ├── __init__.py │ │ └── computation.py │ ├── palette.py │ ├── __init__.py │ ├── missing │ │ └── __init__.py │ ├── intermediate.py │ ├── correlation │ │ ├── __init__.py │ │ └── render.py │ ├── report.py │ ├── utils.py │ ├── dtypes.py │ └── basic │ │ └── __init__.py ├── assets │ ├── ellipse.jpg │ └── english_stopwords.py ├── data_connector │ ├── __init__.py │ ├── schema.py │ ├── errors.py │ ├── config_manager.py │ ├── types.py │ ├── implicit_database.py │ └── schema.json ├── __init__.py └── errors.py ├── .coveragerc ├── poetry.toml ├── docs ├── source │ ├── case_study │ │ ├── titanic.ipynb │ │ └── house_price.ipynb │ ├── _static │ │ └── images │ │ │ ├── tutorial │ │ │ ├── .DS_Store │ │ │ ├── URI_.png │ │ │ ├── dc_git.png │ │ │ ├── App_find.png │ │ │ ├── Node_js.png │ │ │ ├── dc_query.png │ │ │ ├── dc_show.png │ │ │ ├── SFU_Spotify.png │ │ │ ├── dc_schema.png │ │ │ ├── App.js_config.png │ │ │ ├── ID_and_secret.png │ │ │ ├── Yelp_API_Key.png │ │ │ ├── dc_dblp_info.png │ │ │ ├── dc_dblp_query.png │ │ │ ├── dc_git_clone.png │ │ │ ├── dc_yelp_query.png │ │ │ ├── Spotify_git_page.png │ │ │ ├── Spotify_server.png │ │ │ ├── dc_dblp_author.png │ │ │ ├── dc_spotify_info.png │ │ │ ├── dc_spotify_query.png │ │ │ ├── Config_destination.png │ │ │ ├── Spotify_dashboard.png │ │ │ ├── dc_dblp_pagination.png │ │ │ ├── dc_yelp_query_pag.png │ │ │ ├── Yelp_authentication.png │ │ │ ├── dc_dblp_show_schema.png │ │ │ ├── dc_spotify_query_pag.png │ │ │ ├── Spotify_authentication.png │ │ │ └── dc_spotify_show_schema.png │ │ │ ├── data_connector │ │ │ ├── info.png │ │ │ ├── query.png │ │ │ └── show_schema.png │ │ │ └── plot_missing │ │ │ └── df_x_cat.html │ ├── dataprep.rst │ ├── dataprep.eda.rst │ ├── index.rst │ ├── dataprep.data_connector.rst │ ├── conf.py │ ├── data_connector.rst │ ├── eda │ │ ├── plot_missing.rst │ │ ├── introduction.rst │ │ ├── plot_correlation.rst │ │ └── plot.rst │ ├── DC_DBLP_tut.rst │ └── DC_Yelp_tut.rst ├── Makefile └── make.bat ├── assets ├── logo.png ├── plot(df).png ├── data_connector.png ├── plot_missing(df).png ├── plot_correlation(df).png └── plot_missing(df,x).png ├── codecov.yaml ├── mypy.ini ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── pull_request_template.md ├── pytype.cfg ├── LICENSE ├── .gitignore ├── pyproject.toml ├── Justfile ├── .circleci └── config.yml ├── README.md └── examples ├── DataConnector_DBLP.ipynb ├── DataConnector_Yelp.ipynb └── EDA.ipynb /dataprep/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataprep/eda/outlier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=dataprep -------------------------------------------------------------------------------- /dataprep/tests/data_connector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /dataprep/tests/eda/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | EDA Tests 3 | """ 4 | -------------------------------------------------------------------------------- /docs/source/case_study/titanic.ipynb: -------------------------------------------------------------------------------- 1 | ../../../examples/titanic.ipynb -------------------------------------------------------------------------------- /docs/source/case_study/house_price.ipynb: -------------------------------------------------------------------------------- 1 | ../../../examples/house_price.ipynb -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/logo.png -------------------------------------------------------------------------------- /assets/plot(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot(df).png -------------------------------------------------------------------------------- /assets/data_connector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/data_connector.png -------------------------------------------------------------------------------- /assets/plot_missing(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_missing(df).png -------------------------------------------------------------------------------- /dataprep/assets/ellipse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/dataprep/assets/ellipse.jpg -------------------------------------------------------------------------------- /assets/plot_correlation(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_correlation(df).png -------------------------------------------------------------------------------- /assets/plot_missing(df,x).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_missing(df,x).png -------------------------------------------------------------------------------- /dataprep/data_connector/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataConnector 3 | """ 4 | from .connector import Connector 5 | 6 | __all__ = ["Connector"] 7 | -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/.DS_Store -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/URI_.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/URI_.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_git.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_git.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/App_find.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/App_find.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Node_js.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Node_js.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_show.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/info.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/SFU_Spotify.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/SFU_Spotify.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/App.js_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/App.js_config.png -------------------------------------------------------------------------------- 
/docs/source/_static/images/tutorial/ID_and_secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/ID_and_secret.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Yelp_API_Key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Yelp_API_Key.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_info.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_git_clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_git_clone.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_yelp_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_yelp_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_git_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_git_page.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_server.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_author.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_author.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_info.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_query.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/show_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Config_destination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Config_destination.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_dashboard.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_pagination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_pagination.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_yelp_query_pag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_yelp_query_pag.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Yelp_authentication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Yelp_authentication.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_show_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_query_pag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_query_pag.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_authentication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_authentication.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_show_schema.png 
-------------------------------------------------------------------------------- /docs/source/dataprep.rst: -------------------------------------------------------------------------------- 1 | dataprep package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataprep.data_connector 10 | dataprep.eda -------------------------------------------------------------------------------- /dataprep/data_connector/schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains the loaded config schema. 3 | """ 4 | from json import load as jload 5 | from pathlib import Path 6 | 7 | with open(f"{Path(__file__).parent}/schema.json", "r") as f: 8 | CONFIG_SCHEMA = jload(f) 9 | -------------------------------------------------------------------------------- /dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataprep 3 | ======== 4 | 5 | Dataprep lets you prepare your data using a single library with a few lines of code. 6 | """ 7 | import logging 8 | 9 | DEFAULT_PARTITIONS = 1 10 | 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | __version__ = "0.2.8" 14 | -------------------------------------------------------------------------------- /dataprep/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library-wide errors 3 | """ 4 | 5 | 6 | class DataprepError(Exception): 7 | """ 8 | Base exception, used library-wide 9 | """ 10 | 11 | 12 | class UnreachableError(DataprepError): 13 | """ 14 | Error indicating some path of the code is unreachable. 15 | """ 16 | -------------------------------------------------------------------------------- /dataprep/eda/palette.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines palettes used for EDA.
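PALETTE is a 20-color categorical palette (Bokeh's Category20), BIPALETTE is a reversed red-blue diverging colormap, and BRG is a small blue/red/green cycle.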
3 | """ 4 | # pylint: disable=no-name-in-module 5 | from bokeh.palettes import Category20 # type: ignore 6 | from holoviews.plotting.util import process_cmap 7 | 8 | PALETTE = Category20[20] 9 | BIPALETTE = list(reversed(process_cmap("RdBu"))) 10 | BRG = ["#1f78b4", "#d62728", "#2ca02c"] 11 | -------------------------------------------------------------------------------- /codecov.yaml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | parsers: 10 | gcov: 11 | branch_detection: 12 | conditional: yes 13 | loop: yes 14 | method: no 15 | macro: no 16 | 17 | comment: 18 | layout: "reach,diff,flags,tree" 19 | behavior: default 20 | require_changes: no -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | 2 | 3 | [mypy] 4 | ignore_missing_imports = True 5 | ignore_errors = False 6 | warn_unused_configs = True 7 | disallow_subclassing_any = True 8 | disallow_any_generics = True 9 | disallow_untyped_calls = True 10 | disallow_untyped_defs = True 11 | disallow_incomplete_defs = True 12 | check_untyped_defs = True 13 | disallow_untyped_decorators = True 14 | no_implicit_optional = True 15 | warn_redundant_casts = True 16 | warn_unused_ignores = False 17 | warn_return_any = True 18 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime as DateTime 2 | from datetime import timedelta as TimeDelta 3 | 4 | import pandas as pd 5 | 6 | from ...eda.dtypes import is_nominal, is_continuous 7 | 8 | 9 | def test_dtypes() -> None: 10 | df = pd.DataFrame(data=[["a", "c", False]], columns=["S", "C", "B"]) 11 | df["C"] = df["C"].astype("category") 12 | 13 | for col in df.columns: 14 | assert is_nominal(df[col].dtype) 15 | 16 | df = pd.DataFrame( 17 | data=[[complex(3, 1), 1, 1.1, TimeDelta(1), DateTime.now(),]], 18 | columns=["IM", "I", "F", "TD", "DT"], 19 | ) 20 | 21 | for col in df.columns: 22 | assert is_continuous(df[col].dtype) 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'triage required, type: enhancement' 6 | assignees: dovahcrow 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /dataprep/tests/data_connector/test_integration.py: -------------------------------------------------------------------------------- 1 | from ...data_connector import Connector 2 | from os import environ 3 | 4 | 5 | def test_data_connector() -> None: 6 | token = environ["DATAPREP_DATA_CONNECTOR_YELP_TOKEN"] 7 | dc = Connector("yelp", _auth={"access_token": token}) 8 | df = dc.query("businesses", term="ramen", location="vancouver") 9 | 10 | assert len(df) > 0 11 | 12 | dc.info() 13 | 14 | schema = dc.show_schema("businesses") 15 | 16 | assert len(schema) > 0 17 | 18 | df = dc.query("businesses", _count=120, term="ramen", location="vancouver") 19 | 20 | assert len(df) == 120 21 | 22 | df = dc.query("businesses", _count=10000, term="ramen", location="vancouver") 23 | 24 | assert len(df) < 1000 25 | -------------------------------------------------------------------------------- /docs/source/dataprep.eda.rst: -------------------------------------------------------------------------------- 1 | dataprep.eda package 2 | ==================== 3 | 4 | .. .. automodule:: dataprep.eda 5 | .. :noindex: 6 | 7 | Plot* functions 8 | --------------- 9 | .. autofunction:: dataprep.eda.basic.plot 10 | .. autofunction:: dataprep.eda.correlation.plot_correlation 11 | .. autofunction:: dataprep.eda.missing.plot_missing 12 | 13 | Other functions 14 | --------------- 15 | 16 | .. autofunction:: dataprep.eda.basic.compute 17 | .. autofunction:: dataprep.eda.basic.render 18 | .. autofunction:: dataprep.eda.correlation.compute.compute_correlation 19 | .. autofunction:: dataprep.eda.correlation.render.render_correlation 20 | .. autofunction:: dataprep.eda.missing.compute.compute_missing 21 | .. autofunction:: dataprep.eda.missing.render.render_missing 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pytype.cfg: -------------------------------------------------------------------------------- 1 | # NOTE: All relative paths are relative to the location of this file. 2 | 3 | [pytype] 4 | 5 | # Space-separated list of files or directories to exclude. 6 | exclude = 7 | **/*_test.py 8 | **/test_*.py 9 | 10 | # Space-separated list of files or directories to process. 11 | inputs = 12 | . 13 | 14 | # Keep going past errors to analyze as many files as possible. 15 | keep_going = False 16 | 17 | # All pytype output goes here. 18 | output = .pytype 19 | 20 | # Paths to source code directories, separated by ':'. 21 | pythonpath = 22 | . 23 | 24 | # Python version (major.minor) of the target code. 25 | python_version = 3.7 26 | 27 | # Comma separated list of error names to ignore. 28 | disable = 29 | pyi-error 30 | 31 | # Don't report errors. 32 | report_errors = True 33 | 34 | # Experimental: solve unknown types to label with structural types. 35 | protocols = False 36 | 37 | # Experimental: Only load submodules that are explicitly imported. 38 | strict_import = False 39 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. dataprep documentation master file, created by 2 | sphinx-quickstart on Wed Nov 6 13:56:43 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to dataprep's documentation! 7 | ==================================== 8 | 9 | EDA 10 | --- 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | eda/introduction 15 | eda/plot 16 | eda/plot_correlation 17 | eda/plot_missing 18 | 19 | Data Connector 20 | -------------- 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | data_connector 25 | DC_DBLP_tut 26 | DC_Yelp_tut 27 | DC_Spotify_tut 28 | 29 | Case Study 30 | ---------- 31 | .. toctree:: 32 | :maxdepth: 2 33 | 34 | case_study/titanic.ipynb 35 | case_study/house_price.ipynb 36 | 37 | API Documentation 38 | ----------------- 39 | 40 | .. 
toctree:: 41 | :maxdepth: 2 42 | 43 | dataprep 44 | 45 | Indices and tables 46 | ================== 47 | 48 | * :ref:`genindex` 49 | * :ref:`modindex` 50 | * :ref:`search` 51 | -------------------------------------------------------------------------------- /dataprep/eda/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataprep.eda 3 | ============ 4 | """ 5 | import tempfile 6 | 7 | from bokeh.io import output_file, output_notebook 8 | from .basic import compute, plot, render 9 | from .correlation import compute_correlation, plot_correlation, render_correlation 10 | from .missing import compute_missing, plot_missing, render_missing 11 | from .utils import is_notebook 12 | from .dtypes import ( 13 | DType, 14 | Categorical, 15 | Nominal, 16 | Ordinal, 17 | Numerical, 18 | Continuous, 19 | Discrete, 20 | DateTime, 21 | Text, 22 | ) 23 | 24 | __all__ = [ 25 | "plot_correlation", 26 | "compute_correlation", 27 | "render_correlation", 28 | "compute_missing", 29 | "render_missing", 30 | "plot_missing", 31 | "plot", 32 | "compute", 33 | "render", 34 | "DType", 35 | "Categorical", 36 | "Nominal", 37 | "Ordinal", 38 | "Numerical", 39 | "Continuous", 40 | "Discrete", 41 | "DateTime", 42 | "Text", 43 | ] 44 | 45 | 46 | if is_notebook(): 47 | output_notebook(hide_banner=True) 48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'type: bug, triage required' 6 | assignees: dovahcrow 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | Or: 21 | 22 | ```python 23 | paste your code here 24 | ``` 25 | 26 | **Expected behavior** 27 | A clear and concise description of what you expected to happen. 28 | 29 | **Screenshots** 30 | If applicable, add screenshots to help explain your problem. 31 | 32 | **Desktop (please complete the following information):** 33 | - OS: [e.g. Windows] 34 | - Browser [e.g. chrome, safari] 35 | - Platform [Jupyter Notebook, Jupyter Lab, Google Colab, VSCode, Python script] 36 | - Platform Version [e.g. 1.0] 37 | - Python Version [e.g. 3.7.2] 38 | - Dataprep Version [e.g. 0.2.2] 39 | 40 | **Additional context** 41 | Add any other context about the problem here. 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 sfu-db 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/source/dataprep.data_connector.rst: -------------------------------------------------------------------------------- 1 | dataprep.data\_connector package 2 | ================================ 3 | 4 | .. .. automodule:: dataprep.data_connector 5 | .. :members: 6 | .. :undoc-members: 7 | .. :show-inheritance: 8 | 9 | Connector 10 | --------- 11 | 12 | .. autoclass:: dataprep.data_connector.Connector 13 | :members: 14 | :inherited-members: 15 | 16 | 17 | 18 | .. Submodules 19 | .. ---------- 20 | 21 | .. dataprep.data\_connector.connector module 22 | .. ----------------------------------------- 23 | 24 | .. .. automodule:: dataprep.data_connector.connector 25 | .. :members: 26 | .. :undoc-members: 27 | .. :show-inheritance: 28 | 29 | .. dataprep.data\_connector.schema module 30 | .. -------------------------------------- 31 | 32 | .. .. automodule:: dataprep.data_connector.schema 33 | .. :members: 34 | .. :undoc-members: 35 | .. :show-inheritance: 36 | 37 | .. dataprep.data\_connector.types module 38 | .. ------------------------------------- 39 | 40 | .. .. automodule:: dataprep.data_connector.types 41 | .. :members: 42 | .. :undoc-members: 43 | .. :show-inheritance: 44 | 45 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. 4 | 5 | # How Has This Been Tested? 6 | 7 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. 8 | 9 | # Snapshots: 10 | 11 | Include snapshots for easier review. 12 | 13 | # Checklist: 14 | 15 | - [ ] My code follows the style guidelines of this project 16 | - [ ] I have already squashed the commits and made the commit message conform to the project standard. 17 | - [ ] I have already marked the commit with "BREAKING CHANGE" or "Fixes #" if needed. 18 | - [ ] I have performed a self-review of my own code 19 | - [ ] I have commented my code, particularly in hard-to-understand areas 20 | - [ ] I have made corresponding changes to the documentation 21 | - [ ] My changes generate no new warnings 22 | - [ ] I have added tests that prove my fix is effective or that my feature works 23 | - [ ] New and existing unit tests pass locally with my changes 24 | - [ ] Any dependent changes have been merged and published in downstream modules 25 | -------------------------------------------------------------------------------- /dataprep/data_connector/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module defines errors used in this library.
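RequestError wraps an API response whose status code is not 200, while UniversalParameterOverridden signals that a user-supplied query parameter was overridden by a universal one.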
3 | """ 4 | from ..errors import DataprepError 5 | 6 | 7 | class RequestError(DataprepError): 8 | """ 9 | An error indicating the status code of the API response 10 | is not 200. 11 | """ 12 | 13 | status_code: int 14 | message: str 15 | 16 | def __init__(self, status_code: int, message: str) -> None: 17 | """ 18 | Constructor 19 | 20 | Parameters 21 | ---------- 22 | status_code : int 23 | The http status code 24 | message : str 25 | The message from the response 26 | """ 27 | 28 | super().__init__() 29 | 30 | self.status_code = status_code 31 | self.message = message 32 | 33 | def __str__(self) -> str: 34 | return f"RequestError: status={self.status_code}, message={self.message}" 35 | 36 | 37 | class UniversalParameterOverridden(Exception): 38 | """ 39 | The parameter is overridden by the universal parameter 40 | """ 41 | 42 | param: str 43 | uparam: str 44 | 45 | def __init__(self, param: str, uparam: str) -> None: 46 | super().__init__() 47 | self.param = param 48 | self.uparam = uparam 49 | 50 | def __str__(self) -> str: 51 | return f"the parameter {self.param} is overridden by {self.uparam}" 52 | -------------------------------------------------------------------------------- /dataprep/eda/outlier/computation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module containing functions for computing outliers. 3 | """ 4 | 5 | 6 | import dask.dataframe as dd 7 | 8 | from ..intermediate import Intermediate 9 | 10 | DEFAULT_PARTITIONS = 1 11 | 12 | 13 | def _calc_num_outlier(df: dd.DataFrame, col_x: str) -> Intermediate: 14 | """ 15 | calculate outliers based on the MAD method for numerical values. 16 | :param df: the input dataframe 17 | :param col_x: the column of df (univariate outlier detection) 18 | :return: dict(index: value) of outliers 19 | """ 20 | data_df = dd.from_dask_array(df[col_x].to_dask_array(), columns=["data"]) 21 | median = data_df["data"].quantile(0.5) 22 | MAD = abs(data_df["data"] - median).quantile(0.5) # pylint: disable=invalid-name 23 | data_df["z_score"] = (0.6745 * (data_df["data"] - median)) / MAD 24 | res_df = data_df[data_df["z_score"] > 3.5].drop("z_score", axis=1) 25 | result = {"outliers_index": list(res_df["data"].index.compute())} 26 | raw_data = {"df": df, "col_x": col_x} 27 | return Intermediate(result, raw_data) 28 | 29 | 30 | def _calc_cat_outlier(df: dd.DataFrame, col_x: str, threshold: int = 1) -> Intermediate: 31 | """ 32 | calculate outliers based on the threshold for categorical values.
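A category counts as an outlier when its frequency is at most the given threshold.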
33 | :param df: the input dataframe 34 | :param col_x: the column of df (univariate outlier detection) 35 | :return: dict(index: value) of outliers 36 | """ 37 | groups = df.groupby([col_x]).size() 38 | result = {"outlier_index": list(groups[groups <= threshold].index.compute())} 39 | raw_data = {"df": df, "col_x": col_x, "threshold": threshold} 40 | return Intermediate(result, raw_data) 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pytype 107 | .pytype/ 108 | 109 | # editors 110 | .vscode 111 | .idea 112 | notebooks/ 113 | bfg.jar 114 | profiling 115 | .coverage -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot_missing.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module tests the plot_missing(df, x, y) function.
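The fixture below deliberately nulls out half of column "a" so every test has missing values to visualize.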
3 | """ 4 | import dask.dataframe as dd 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | from ...eda.dtypes import Numerical 10 | from ...eda.missing import compute_missing, render_missing 11 | from ...eda.utils import to_dask 12 | 13 | 14 | @pytest.fixture(scope="module") # type: ignore 15 | def simpledf() -> dd.DataFrame: 16 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 17 | 18 | df = pd.concat( 19 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 20 | ) 21 | 22 | df.columns = ["a", "b", "c", "d"] 23 | idx = np.arange(1000) 24 | np.random.shuffle(idx) 25 | df.iloc[idx[:500], 0] = None 26 | 27 | ddf = to_dask(df) 28 | 29 | return ddf 30 | 31 | 32 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 33 | itmdt = compute_missing(simpledf) 34 | render_missing(itmdt) 35 | 36 | 37 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 38 | itmdt = compute_missing(simpledf, x="a") 39 | render_missing(itmdt) 40 | 41 | 42 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 43 | itmdt = compute_missing(simpledf, x="d") 44 | render_missing(itmdt) 45 | 46 | 47 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 48 | itmdt = compute_missing(simpledf, x="a", y="b") 49 | render_missing(itmdt) 50 | 51 | 52 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 53 | itmdt = compute_missing(simpledf, x="a", y="d") 54 | render_missing(itmdt) 55 | 56 | 57 | def test_specify_column_type(simpledf: dd.DataFrame) -> None: 58 | itmdt = compute_missing(simpledf, x="b", dtype={"a": Numerical()}) 59 | render_missing(itmdt) 60 | 61 | 62 | @pytest.mark.xfail # type: ignore 63 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 64 | compute_missing(simpledf, y="b") 65 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | module for testing plot(df, x, y) function. 
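The fixture mixes numerical, categorical, datetime, and constant columns so that the different plot types are all exercised.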
3 | """ 4 | import logging 5 | 6 | import dask.dataframe as dd 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | 11 | from ...eda import plot 12 | from ...eda.dtypes import Nominal 13 | from ...eda.utils import to_dask 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | 17 | 18 | @pytest.fixture(scope="module") # type: ignore 19 | def simpledf() -> dd.DataFrame: 20 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 21 | 22 | df = pd.concat( 23 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 24 | ) 25 | df = pd.concat( 26 | [ 27 | df, 28 | pd.Series( 29 | np.random.choice( 30 | ["2020/03/29", "2020/01/10", "2019/11/21"], 1000, replace=True 31 | ) 32 | ), 33 | ], 34 | axis=1, 35 | ) 36 | df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1) 37 | df.columns = ["a", "b", "c", "d", "e", "f"] 38 | df["e"] = pd.to_datetime(df["e"]) 39 | 40 | idx = np.arange(1000) 41 | np.random.shuffle(idx) 42 | df.iloc[idx[:500], 0] = None 43 | 44 | ddf = to_dask(df) 45 | 46 | return ddf 47 | 48 | 49 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 50 | plot(simpledf, "a") 51 | 52 | 53 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 54 | plot(simpledf, "e") 55 | 56 | 57 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 58 | plot(simpledf) 59 | 60 | 61 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 62 | plot(simpledf, "d", "e") 63 | 64 | 65 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 66 | plot(simpledf, "a", "e") 67 | 68 | 69 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 70 | plot(simpledf, "f") 71 | 72 | 73 | def test_specify_column_type(simpledf: dd.DataFrame) -> None: 74 | plot(simpledf, dtype={"a": Nominal()}) 75 | plot(simpledf, dtype=Nominal()) 76 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | module for testing the report saving and inline rendering of the plot functions.
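Each test saves the generated report into a temporary directory and also renders it inline via _repr_html_.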
3 | """ 4 | import logging 5 | from datetime import datetime as DateTime 6 | from tempfile import TemporaryDirectory 7 | 8 | import dask.dataframe as dd 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | 13 | from ...eda import plot, plot_correlation, plot_missing 14 | from ...eda.utils import to_dask 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | 19 | @pytest.fixture(scope="module") # type: ignore 20 | def simpledf() -> dd.DataFrame: 21 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 22 | 23 | df = pd.concat( 24 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 25 | ) 26 | df = pd.concat( 27 | [df, pd.Series(np.random.choice([list("a"), set("b"),], 1000, replace=True)),], 28 | axis=1, 29 | ) 30 | df = pd.concat( 31 | [ 32 | df, 33 | pd.Series( 34 | np.random.choice( 35 | [DateTime(6, 4, 1), pd.to_datetime("today")], 1000, replace=True 36 | ) 37 | ), 38 | ], 39 | axis=1, 40 | ) 41 | 42 | df.columns = ["a", "b", "c", "d", "e", "f"] 43 | 44 | idx = np.arange(1000) 45 | np.random.shuffle(idx) 46 | df.iloc[idx[:500], 0] = None 47 | 48 | ddf = to_dask(df) 49 | 50 | return ddf 51 | 52 | 53 | def test_plot_report(simpledf: dd.DataFrame) -> None: 54 | report = plot(simpledf) 55 | with TemporaryDirectory() as dname: 56 | report.save(filename=f"{dname}/plot_report.html") 57 | report._repr_html_() 58 | 59 | 60 | def test_plot_correlation_report(simpledf: dd.DataFrame) -> None: 61 | report = plot_correlation(simpledf) 62 | with TemporaryDirectory() as dname: 63 | report.save(filename=f"{dname}/plot_correlation_report.html") 64 | report._repr_html_() 65 | 66 | 67 | def test_plot_missing_report(simpledf: dd.DataFrame) -> None: 68 | report = plot_missing(simpledf) 69 | with TemporaryDirectory() as dname: 70 | report.save(filename=f"{dname}/plot_missing_report.html") 71 | report._repr_html_() 72 | -------------------------------------------------------------------------------- /dataprep/eda/missing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot_missing(df) function. 3 | """ 4 | 5 | from typing import Optional, Union 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute_missing 12 | from .render import render_missing 13 | from ..report import Report 14 | from ..dtypes import DTypeDef 15 | 16 | __all__ = ["render_missing", "compute_missing", "plot_missing"] 17 | 18 | 19 | def plot_missing( 20 | df: Union[pd.DataFrame, dd.DataFrame], 21 | x: Optional[str] = None, 22 | y: Optional[str] = None, 23 | *, 24 | bins: int = 30, 25 | ncols: int = 30, 26 | ndist_sample: int = 100, 27 | dtype: Optional[DTypeDef] = None, 28 | ) -> Report: 29 | """ 30 | This function is designed to deal with missing values. 31 | It supports three call forms: plot_missing(df), plot_missing(df, x), 32 | and plot_missing(df, x, y). 33 | 34 | Parameters 35 | ---------- 36 | df 37 | the pandas data_frame for which plots are calculated for each column 38 | x 39 | a valid column name of the data frame 40 | y 41 | a valid column name of the data frame 42 | ncols 43 | The number of columns in the figure 44 | bins 45 | The number of bins in the histograms 46 | ndist_sample 47 | The number of sample points 48 | dtype: str or DType or dict of str or dict of DType, default None 49 | Specify Data Types for designated column or all columns. 50 | E.g. dtype = {"a": Continuous, "b": "Nominal"} or
51 | dtype = {"a": Continuous(), "b": "nominal"} 52 | or dtype = Continuous() or dtype = "Continuous" 53 | 54 | Examples 55 | -------- 56 | >>> from dataprep.eda import plot_missing 57 | >>> import pandas as pd 58 | >>> df = pd.read_csv("suicide-rate.csv") 59 | >>> plot_missing(df, "HDI_for_year") 60 | >>> plot_missing(df, "HDI_for_year", "population") 61 | """ 62 | itmdt = compute_missing( 63 | df, x, y, dtype=dtype, bins=bins, ncols=ncols, ndist_sample=ndist_sample 64 | ) 65 | fig = render_missing(itmdt) 66 | return Report(fig) 67 | -------------------------------------------------------------------------------- /dataprep/eda/intermediate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Intermediate class 3 | """ 4 | from typing import Any, Dict, Tuple, Union 5 | 6 | import pandas as pd 7 | 8 | 9 | class Intermediate(Dict[str, Any]): 10 | """ 11 | This class contains intermediate results. 12 | """ 13 | 14 | visual_type: str 15 | 16 | def __init__(self, *args: Any, **kwargs: Any): 17 | if ( 18 | len(args) == 1 19 | and isinstance(args[0], dict) 20 | and len(kwargs) == 1 21 | and "visual_type" in kwargs 22 | ): 23 | super().__init__(args[0]) 24 | self.visual_type = kwargs["visual_type"] 25 | elif len(args) == 0: 26 | visual_type = kwargs.pop("visual_type") 27 | super().__init__(**kwargs) 28 | self.visual_type = visual_type 29 | else: 30 | assert False, "Unsupported initialization" 31 | 32 | 33 | class ColumnsMetadata: 34 | """ 35 | Container for storing each column's metadata 36 | """ 37 | 38 | metadata: pd.DataFrame 39 | 40 | def __init__(self) -> None: 41 | self.metadata = pd.DataFrame() 42 | self.metadata.index.name = "Column Name" 43 | 44 | def __setitem__(self, key: Tuple[str, str], val: Any) -> None: 45 | col, vtype = key 46 | if ( 47 | isinstance(val, (tuple, list, dict)) 48 | and vtype 49 | not in self.metadata.columns # pylint: disable=unsupported-membership-test 50 | ): 51 | self.metadata[vtype] = pd.Series(dtype="object") 52 | 53 | self.metadata.loc[col, vtype] = val 54 | 55 | def __getitem__(self, key: Union[str, Tuple[str, str]]) -> Any: 56 | if isinstance(key, tuple): 57 | col, vtype = key 58 | return self.metadata.loc[col, vtype] 59 | else: 60 | return ColumnMetadata(self.metadata.loc[key]) 61 | 62 | 63 | class ColumnMetadata: 64 | """ 65 | Container for storing a single column's metadata. 66 | This is immutable. 67 | """ 68 | 69 | metadata: pd.Series 70 | 71 | def __init__(self, meta: pd.Series) -> None: 72 | self.metadata = meta 73 | 74 | def __getitem__(self, key: str) -> Any: 75 | return self.metadata.loc[key] 76 | -------------------------------------------------------------------------------- /dataprep/eda/correlation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot_correlation(df) function.
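plot_correlation computes an intermediate result via compute_correlation, renders it with render_correlation, and returns the figure wrapped in a Report.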
3 | """ 4 | 5 | from typing import Any, List, Optional, Tuple, Union 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute_correlation 12 | from .render import render_correlation 13 | from ..report import Report 14 | 15 | __all__ = ["render_correlation", "compute_correlation", "plot_correlation"] 16 | 17 | 18 | def plot_correlation( 19 | df: Union[pd.DataFrame, dd.DataFrame], 20 | x: Optional[str] = None, 21 | y: Optional[str] = None, 22 | *, 23 | value_range: Optional[Tuple[float, float]] = None, 24 | k: Optional[int] = None, 25 | ) -> Report: 26 | """ 27 | This function calculates the correlation between columns. 28 | It supports three call forms: plot_correlation(df), plot_correlation(df, x), 29 | and plot_correlation(df, x, y). 30 | Parameters such as k and value_range let you restrict the output to your needs. 31 | 32 | Parameters 33 | ---------- 34 | df 35 | The pandas data_frame for which plots are calculated for each column 36 | x 37 | A valid column name of the data frame 38 | y 39 | A valid column name of the data frame 40 | value_range 41 | The range of correlation values to display 42 | k 43 | Choose the top-k elements 44 | 45 | Examples 46 | -------- 47 | >>> from dataprep.eda import plot_correlation 48 | >>> import pandas as pd 49 | >>> df = pd.read_csv("suicide-rate.csv") 50 | >>> plot_correlation(df) 51 | >>> plot_correlation(df, k=6) 52 | >>> plot_correlation(df, "suicides") 53 | >>> plot_correlation(df, "suicides", k=3) 54 | >>> plot_correlation(df, "suicides", value_range=[-1, 0.3]) 55 | >>> plot_correlation(df, "suicides", value_range=[-1, 0.3], k=2) 56 | >>> plot_correlation(df, x="population", y="suicides_no") 57 | >>> plot_correlation(df, x="population", y="suicides", k=5) 58 | 59 | Note 60 | ---- 61 | This function only supports numerical or categorical data, 62 | and it is better to drop None, NaN and Null values before using it 63 | """ 64 | 65 | intermediate = compute_correlation(df, x=x, y=y, value_range=value_range, k=k) 66 | figure = render_correlation(intermediate) 67 | 68 | return Report(figure) 69 | -------------------------------------------------------------------------------- /dataprep/eda/report.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the Report class.
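A Report wraps a Bokeh layout so the result of a plot function can be saved to an HTML file or rendered inline in a notebook (e.g. plot(df).save(filename="report.html"), as in the tests above).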
3 | """ 4 | 5 | from pathlib import Path 6 | from tempfile import NamedTemporaryFile 7 | 8 | from bokeh.io import save 9 | from bokeh.models import LayoutDOM 10 | from bokeh.resources import CDN 11 | from IPython.display import HTML, display 12 | from jinja2 import Template 13 | 14 | INLINE_TEMPLATE = Template( 15 | """ 16 | {% from macros import embed %} 17 | {% block inner_body %} 18 | {% block contents %} 19 | {% for doc in docs %} 20 | {{ embed(doc) if doc.elementid }} 21 | {% for root in doc.roots %} 22 | {% block root scoped %} 23 | {{ embed(root) | indent(10) }} 24 | {% endblock %} 25 | {% endfor %} 26 | {% endfor %} 27 | {% endblock %} 28 | {{ plot_script | indent(8) }} 29 | {% endblock %} 30 | """ 31 | ) 32 | 33 | 34 | class Report: 35 | """ 36 | This class creates a customized Report object for the plot* functions 37 | """ 38 | 39 | to_render: LayoutDOM 40 | 41 | def __init__(self, to_render: LayoutDOM) -> None: 42 | self.to_render = to_render 43 | 44 | def save(self, filename: str) -> None: 45 | """ 46 | Save the report to the named HTML file. 47 | """ 48 | save( 49 | self.to_render, 50 | filename=filename, 51 | resources=CDN, 52 | title="DataPrep.EDA Report", 53 | ) 54 | 55 | def _repr_html_(self) -> str: 56 | # Windows forbids opening the file twice; as a result bokeh cannot 57 | # write to the still-open temporary file. 58 | with NamedTemporaryFile(suffix=".html", delete=False) as tmpf: 59 | pass 60 | 61 | save( 62 | self.to_render, 63 | filename=tmpf.name, 64 | resources=CDN, 65 | template=INLINE_TEMPLATE, 66 | title="DataPrep.EDA Report", 67 | ) 68 | with open(tmpf.name, "r") as f: 69 | output_html = f.read() 70 | 71 | # Delete the temporary file 72 | Path(tmpf.name).unlink() 73 | 74 | # Fix for bokeh: bokeh wrongly calls the "waiting for bokeh to load" function 75 | # inside "Bokeh.safely", which causes a "Bokeh not found" error because 76 | # Bokeh is not even loaded yet! 77 | patched_html = output_html.replace( 78 | "Bokeh.safely", 79 | "var __dataprep_bokeh_fix = (f) => document.Bokeh === undefined ? setTimeout(f, 1000) : f(); __dataprep_bokeh_fix", # pylint: disable=line-too-long 80 | ) 81 | # embed into report template created by us here 82 | return patched_html 83 | 84 | def show(self) -> None: 85 | """ 86 | Render the report. This is useful when calling plot in a for loop.
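Example (illustrative, assuming a DataFrame df):
>>> from dataprep.eda import plot
>>> for column in ["a", "b"]:
...     plot(df, column).show()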
87 | """ 88 | display(HTML(self._repr_html_())) 89 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "dataprep" 3 | version = "0.2.8" 4 | description = "Dataprep: Data Preparation in Python" 5 | authors = ["SFU Database System Lab "] 6 | maintainers = [ 7 | "Weiyuan Wu ", 8 | "Jinglin Peng ", 9 | "Pei Wang ", 10 | "Brandon Lockhart ", 11 | "Song Bian " 12 | ] 13 | 14 | license = "MIT" 15 | 16 | readme = "README.md" # Markdown files are supported 17 | 18 | repository = "https://github.com/sfu-db/dataprep" 19 | homepage = "https://github.com/sfu-db/dataprep" 20 | 21 | keywords = ["dataprep", "eda", "data connector", "data science", "exploratory data analysis", "data exploration"] 22 | 23 | classifiers = [ 24 | "Development Status :: 4 - Beta", 25 | "Topic :: Software Development :: Build Tools", 26 | "Environment :: Console", 27 | "Operating System :: OS Independent", 28 | "Intended Audience :: Science/Research", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Financial and Insurance Industry", 31 | "Intended Audience :: Healthcare Industry", 32 | "Topic :: Scientific/Engineering", 33 | "Framework :: IPython", 34 | ] 35 | 36 | [tool.poetry.dependencies] 37 | python = "^3.6.1" 38 | 39 | # Dependencies for EDA 40 | dask = { version = "~2.13", extras = [ "complete" ]} 41 | pandas = "~1.0" 42 | numpy = "~1.18" 43 | scipy = "~1.4" 44 | holoviews = "~1.13" 45 | bokeh = "~2.1" 46 | 47 | # Dependencies for DataConnector 48 | jsonschema = "~3.2" 49 | requests = "~2.23" 50 | jinja2 = "~2.11" 51 | jsonpath2 = "~0.4" 52 | lxml = "~4.5" 53 | nltk = "^3.5" 54 | pillow = "^7.1.2" 55 | wordcloud = "^1.7.0" 56 | 57 | [tool.poetry.dev-dependencies] 58 | pylint = "~2.4" 59 | pytest = "~5.4" 60 | mypy = "~0.770" 61 | black = "19.10b0" 62 | nbsphinx = "~0.5" 63 | sphinx = "^3" 64 | toml = "^0.10.0" 65 | rstcheck = "^3.3.1" 66 | sphinx-autobuild = "^0.7.1" 67 | pytest-cov = "^2.8.1" 68 | codecov = "^2.0.22" 69 | sphinx-autodoc-typehints = "^1.10.3" 70 | ipython = "^7.13.0" 71 | rope = "^0.16.0" 72 | 73 | [tool.black] 74 | line-length = 88 75 | target-version = ['py36', 'py37'] 76 | exclude = ''' 77 | ( 78 | /( 79 | \.eggs 80 | | \.git 81 | | \.pytype 82 | | \.pytest_cache 83 | | build 84 | | dist 85 | )/ 86 | ) 87 | ''' 88 | 89 | [tool.semantic_release] 90 | version_variable = "dataprep/__init__.py:__version__" 91 | version_source = "tag" 92 | commit_subject = "v{version}" 93 | commit_message = "Bump to v{version}" 94 | commit_author = "Weiyuan Wu " 95 | branch = "master" 96 | commit_version_number = true 97 | 98 | [build-system] 99 | requires = ["poetry>=1"] 100 | build-backend = "poetry.masonry.api" 101 | -------------------------------------------------------------------------------- /dataprep/assets/english_stopwords.py: -------------------------------------------------------------------------------- 1 | english_stopwords = [ 2 | "i", 3 | "me", 4 | "my", 5 | "myself", 6 | "we", 7 | "our", 8 | "ours", 9 | "ourselves", 10 | "you", 11 | "you're", 12 | "you've", 13 | "you'll", 14 | "you'd", 15 | "your", 16 | "yours", 17 | "yourself", 18 | "yourselves", 19 | "he", 20 | "him", 21 | "his", 22 | "himself", 23 | "she", 24 | "she's", 25 | "her", 26 | "hers", 27 | "herself", 28 | "it", 29 | "it's", 30 | "its", 31 | "itself", 32 | "they", 33 | "them", 34 | "their", 35 | "theirs", 36 | "themselves", 37 | "what", 38 | "which", 39 | "who", 40 | 
"whom", 41 | "this", 42 | "that", 43 | "that'll", 44 | "these", 45 | "those", 46 | "am", 47 | "is", 48 | "are", 49 | "was", 50 | "were", 51 | "be", 52 | "been", 53 | "being", 54 | "have", 55 | "has", 56 | "had", 57 | "having", 58 | "do", 59 | "does", 60 | "did", 61 | "doing", 62 | "a", 63 | "an", 64 | "the", 65 | "and", 66 | "but", 67 | "if", 68 | "or", 69 | "because", 70 | "as", 71 | "until", 72 | "while", 73 | "of", 74 | "at", 75 | "by", 76 | "for", 77 | "with", 78 | "about", 79 | "against", 80 | "between", 81 | "into", 82 | "through", 83 | "during", 84 | "before", 85 | "after", 86 | "above", 87 | "below", 88 | "to", 89 | "from", 90 | "up", 91 | "down", 92 | "in", 93 | "out", 94 | "on", 95 | "off", 96 | "over", 97 | "under", 98 | "again", 99 | "further", 100 | "then", 101 | "once", 102 | "here", 103 | "there", 104 | "when", 105 | "where", 106 | "why", 107 | "how", 108 | "all", 109 | "any", 110 | "both", 111 | "each", 112 | "few", 113 | "more", 114 | "most", 115 | "other", 116 | "some", 117 | "such", 118 | "no", 119 | "nor", 120 | "not", 121 | "only", 122 | "own", 123 | "same", 124 | "so", 125 | "than", 126 | "too", 127 | "very", 128 | "s", 129 | "t", 130 | "can", 131 | "will", 132 | "just", 133 | "don", 134 | "don't", 135 | "should", 136 | "should've", 137 | "now", 138 | "d", 139 | "ll", 140 | "m", 141 | "o", 142 | "re", 143 | "ve", 144 | "y", 145 | "ain", 146 | "aren", 147 | "aren't", 148 | "couldn", 149 | "couldn't", 150 | "didn", 151 | "didn't", 152 | "doesn", 153 | "doesn't", 154 | "hadn", 155 | "hadn't", 156 | "hasn", 157 | "hasn't", 158 | "haven", 159 | "haven't", 160 | "isn", 161 | "isn't", 162 | "ma", 163 | "mightn", 164 | "mightn't", 165 | "mustn", 166 | "mustn't", 167 | "needn", 168 | "needn't", 169 | "shan", 170 | "shan't", 171 | "shouldn", 172 | "shouldn't", 173 | "wasn", 174 | "wasn't", 175 | "weren", 176 | "weren't", 177 | "won", 178 | "won't", 179 | "wouldn", 180 | "wouldn't", 181 | ] 182 | -------------------------------------------------------------------------------- /dataprep/data_connector/config_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for config downloading and maintaining 3 | """ 4 | from json import dump as jdump 5 | from pathlib import Path 6 | from shutil import rmtree 7 | from tempfile import gettempdir 8 | from typing import cast 9 | 10 | import requests 11 | 12 | META_URL = ( 13 | "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/_meta.json" 14 | ) 15 | TABLE_URL = ( 16 | "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/{}.json" 17 | ) 18 | GIT_REF_URL = "https://api.github.com/repos/sfu-db/DataConnectorConfigs/git/refs/heads" 19 | 20 | 21 | def config_directory() -> Path: 22 | """ 23 | Returns the config directory path 24 | """ 25 | tmp = gettempdir() 26 | return Path(tmp) / "dataprep" / "data_connector" 27 | 28 | 29 | def ensure_config(impdb: str) -> bool: 30 | """ 31 | Ensure the config for `impdb` is downloaded 32 | """ 33 | path = config_directory() 34 | obsolete = is_obsolete(impdb) 35 | 36 | if (path / impdb).exists() and not obsolete: 37 | return True 38 | else: 39 | download_config(impdb) 40 | return False 41 | 42 | 43 | def is_obsolete(impdb: str) -> bool: 44 | """ 45 | Test if the implicit db config files are obsolete 46 | and need to be re-downloaded. 
47 | """ 48 | path = config_directory() 49 | if not (path / impdb).exists(): 50 | return True 51 | elif not (path / impdb / "_hash").exists(): 52 | return True 53 | else: 54 | with open(path / impdb / "_hash", "r") as f: 55 | githash = f.read() 56 | 57 | sha = get_git_master_hash() 58 | 59 | return githash != sha 60 | 61 | 62 | def get_git_master_hash() -> str: 63 | """ 64 | Get current config files repo's hash 65 | """ 66 | refs = requests.get(GIT_REF_URL).json() 67 | (sha,) = [ref["object"]["sha"] for ref in refs if ref["ref"] == "refs/heads/master"] 68 | return cast(str, sha) 69 | 70 | 71 | def download_config(impdb: str) -> None: 72 | """ 73 | Download the config from Github into the temp directory. 74 | """ 75 | url = META_URL.format(impdb) 76 | meta = requests.get(url).json() 77 | tables = meta["tables"] 78 | 79 | sha = get_git_master_hash() 80 | # In case we push a new config version to github when the user is downloading 81 | while True: 82 | configs = {"_meta": meta} 83 | for table in tables: 84 | url = TABLE_URL.format(impdb, table) 85 | config = requests.get(url).json() 86 | configs[table] = config 87 | sha_check = get_git_master_hash() 88 | 89 | if sha_check == sha: 90 | break 91 | 92 | sha = sha_check 93 | 94 | path = config_directory() 95 | 96 | if (path / impdb).exists(): 97 | rmtree(path / impdb) 98 | 99 | (path / impdb).mkdir(parents=True) 100 | for fname, json in configs.items(): 101 | with (path / impdb / f"{fname}.json").open("w") as f: 102 | jdump(json, f) 103 | 104 | with (path / impdb / "_hash").open("w") as f: 105 | f.write(sha) 106 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from pathlib import Path 16 | from typing import cast 17 | 18 | import toml 19 | 20 | sys.path.insert(0, os.path.abspath("../../")) 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = "dataprep" 25 | copyright = "2020, SFU Database System Lab" 26 | author = "SFU Database System Lab" 27 | 28 | # The full version, including alpha/beta/rc tags 29 | def get_version() -> str: 30 | """ 31 | Get the library version from pyproject.toml 32 | """ 33 | path = Path(__file__).resolve().parents[2] / "pyproject.toml" 34 | pyproject = toml.loads(open(str(path)).read()) 35 | return cast(str, pyproject["tool"]["poetry"]["version"]) 36 | 37 | 38 | release = get_version() 39 | 40 | 41 | # -- General configuration --------------------------------------------------- 42 | 43 | # Add any Sphinx extension module names here, as strings. They can be 44 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 45 | # ones. 
46 | extensions = [ 47 | "sphinx.ext.todo", 48 | "sphinx.ext.viewcode", 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.napoleon", 51 | "nbsphinx", 52 | "sphinx_autodoc_typehints", 53 | ] 54 | 55 | autodoc_typehints = "description" 56 | # Napoleon settings 57 | napoleon_google_docstring = False 58 | napoleon_numpy_docstring = True 59 | napoleon_include_init_with_doc = False 60 | napoleon_include_private_with_doc = False 61 | napoleon_include_special_with_doc = False 62 | napoleon_use_admonition_for_examples = False 63 | napoleon_use_admonition_for_notes = False 64 | napoleon_use_admonition_for_references = False 65 | napoleon_use_ivar = False 66 | napoleon_use_param = True 67 | napoleon_use_rtype = True 68 | napoleon_use_keyword = True 69 | napoleon_custom_sections = None 70 | 71 | # autodoc_default_options = { 72 | # "members": True, 73 | # "member-order": "bysource", 74 | # "special-members": "__init__", 75 | # } 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ["_templates"] 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This pattern also affects html_static_path and html_extra_path. 83 | exclude_patterns = [] 84 | 85 | master_doc = "index" 86 | 87 | # -- Options for HTML output ------------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = "nature" 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ["_static"] 98 | -------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | build-docs: 2 | poetry run sphinx-build -M html docs/source docs/build 3 | 4 | publish-docs: build-docs 5 | touch docs/build/html/.nojekyll 6 | gh-pages --dotfiles --message "[skip ci] Updates" --dist docs/build/html 7 | 8 | gen-apidocs: 9 | poetry run sphinx-apidoc --ext-doctest --ext-autodoc --ext-mathjax -f -o docs/source dataprep 10 | 11 | black: 12 | poetry run black dataprep 13 | 14 | ci: format ci-black typeck test lint 15 | 16 | ci-black: 17 | poetry run black --check --quiet dataprep 18 | 19 | format: 20 | poetry run black dataprep 21 | 22 | typeck: ci-mypy 23 | 24 | test: 25 | poetry run pytest dataprep 26 | 27 | testf +ARGS="dataprep": 28 | poetry run pytest {{ARGS}} 29 | 30 | lint: 31 | poetry run pylint dataprep 32 | 33 | ci-mypy: 34 | poetry run mypy dataprep 35 | 36 | build: 37 | poetry build 38 | 39 | release version: 40 | #! /usr/bin/env bash 41 | 42 | # Sanity checks 43 | 44 | arr=(major minor patch) 45 | 46 | if [[ " ${arr[*]} " != *" {{version}} "* ]]; then 47 | echo "version must be one of 'major', 'minor', 'patch', got '{{version}}'"; 48 | exit 1; 49 | fi 50 | 51 | if [ ! -z "$(git status --porcelain)" ]; then echo "Git tree is not clean, commit first"; exit 1; fi 52 | 53 | if [ ! -z "$(git rev-parse --verify release)" ]; then echo "delete the existing release branch before new release"; exit 1; fi 54 | 55 | # Pre bump the version to get the next version number 56 | git checkout develop 57 | 58 | vstring="$(poetry version {{version}})" 59 | if [ $? 
-ne 0 ]; then 60 | echo $vstring; 61 | exit 1; 62 | fi 63 | 64 | from_version=$(echo "${vstring}" | sed -nr "s/^Bumping version from ([0-9]+\.[0-9]+\.[0-9]+) to ([0-9]+\.[0-9]+\.[0-9]+)$/\1/p") 65 | to_version=$(echo "${vstring}" | sed -nr "s/^Bumping version from ([0-9]+\.[0-9]+\.[0-9]+) to ([0-9]+\.[0-9]+\.[0-9]+)$/\2/p") 66 | 67 | git checkout pyproject.toml # clean up 68 | 69 | echo "Releasing from ${from_version} to ${to_version}?" 70 | select yn in "Yes" "No"; do 71 | case $yn in 72 | Yes ) break;; 73 | No ) git checkout pyproject.toml; git checkout develop; git branch -D release; exit;; 74 | esac 75 | done 76 | 77 | # Beginning of the real work! 78 | 79 | # Create new release branch 80 | git checkout -b "release/v${to_version}" develop 81 | 82 | poetry version {{version}} 83 | 84 | echo "Creating release commit" 85 | git add pyproject.toml 86 | semantic-release version --{{version}} 87 | 88 | # echo "Merge release/v${to_version} to master & develop" 89 | # git checkout master 90 | # git merge "release/v${to_version}" 91 | 92 | # git checkout develop 93 | # git merge "release/v${to_version}" 94 | 95 | echo "Push branch and tag to remote" 96 | git push origin "release/v${to_version}":master 97 | git push origin "release/v${to_version}":develop 98 | git push origin "release/v${to_version}" 99 | git push origin "v${to_version}" 100 | 101 | echo "Build artifacts" 102 | poetry build 103 | 104 | echo "Creating release draft" 105 | semantic-release changelog | sed "1iv${to_version}\n" | hub release create -d -a "dist/dataprep-${to_version}-py3-none-any.whl" -a "dist/dataprep-${to_version}.tar.gz" -F - "v${to_version}" 106 | 107 | 108 | 109 | 110 | @ensure-git-clean: 111 | if [ ! -z "$(git status --porcelain)" ]; then echo "Git tree is not clean, commit first"; exit 1; fi -------------------------------------------------------------------------------- /docs/source/data_connector.rst: -------------------------------------------------------------------------------- 1 | ================================================================================== 2 | dataprep.data_connector: fetching data from popular websites with a simplified API 3 | ================================================================================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 9 | Overview 10 | ========== 11 | data_connector is a component in the dataprep library that aims to simplify data access by providing a standard API set. 12 | The goal is to help users skip the complex API configuration. 13 | We illustrate how to use the data_connector library with Yelp. 14 | 15 | 16 | Initializing a connector class for a website 17 | ============================================= 18 | The first step is to initialize a Connector class with the configuration file location and access token specified (`How to get an access token? 19 | `_). 20 | Available configuration files can be manually downloaded here: `Configuration Files 21 | `_ or automatically downloaded on first use. 22 | To initialize a data_connector:: 23 | 24 | from dataprep.data_connector import Connector 25 | dc = Connector("./DataConnectorConfigs/yelp", auth_params={"access_token":access_token}) 26 | 27 | 28 | Getting the guideline of the connector with `Connector.info` 29 | ================================================================= 30 | | Connector's info method gives information and guidelines on using the connector. In the example below, the response shows three things. 31 | | a. There is one table in Yelp, i.e. Yelp.businesses. 32 | | b.
To query this table, the term and location parameters are required and the longitude and latitude parameters are optional (see Connector.query() section). 33 | | c. Examples of calling the methods in the Connector class. 34 | 35 | :: 36 | 37 | dc.info 38 | 39 | .. image:: _static/images/data_connector/info.png 40 | :align: center 41 | :width: 496 42 | :height: 215 43 | 44 | 45 | 46 | Understanding web data with `Connector.show_schema()` 47 | ============================================================ 48 | show_schema(table name) returns the schema of the web data as a dataframe. 49 | There are two columns in the response. 50 | The first column is the column name and the second is the datatype. 51 | 52 | :: 53 | 54 | dc.show_schema('businesses') 55 | 56 | 57 | .. image:: _static/images/data_connector/show_schema.png 58 | :align: center 59 | :width: 208 60 | :height: 458 61 | 62 | 63 | Getting web data with `Connector.query()` 64 | ================================================= 65 | The `query()` method downloads the website data. 66 | The parameters should meet the requirements in `Connector.info`. 67 | Usually the raw data is returned in JSON or XML format. 68 | data_connector reformats the data into a pandas dataframe for the convenience of downstream operations. 69 | 70 | :: 71 | 72 | df = dc.query('businesses', term="korean", location="seattle") 73 | df 74 | 75 | .. image:: _static/images/data_connector/query.png 76 | :align: center 77 | :width: 870 78 | :height: 491 79 | 80 | 81 | Advanced: writing your own data_connector configuration file 82 | ============================================================== 83 | A configuration file defines the information necessary to fetch data from a website, e.g. the request URL; the API authorization type; the parameters needed from the users (API key, search keyword, etc.); the returned data's schema. 84 | All of this information is reusable. 85 | To write a configuration file for your own needs or to modify an existing one, please refer to `Configuration Files 86 | `_. 87 | 88 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | jobs: 3 | install_dependencies: 4 | docker: 5 | - image: circleci/python:3.7.2 6 | steps: 7 | - &step_add_path 8 | run: 9 | name: Add python user PATH into PATH 10 | command: echo "export PATH=$PATH:$HOME/.local/bin" >> $BASH_ENV 11 | - &step_install_pipenv 12 | run: 13 | name: Install python tools 14 | command: pip install --user poetry==1.0.0b9 15 | - &step_inproject_venv 16 | run: 17 | name: Set venv inproject 18 | command: poetry config virtualenvs.in-project true 19 | - checkout 20 | - run: 21 | name: Install dependencies 22 | command: poetry install 23 | no_output_timeout: 1200 24 | - run: 25 | name: Print tool versions 26 | command: poetry run mypy --version && poetry run pylint --version && poetry run pytest --version && poetry run black --version 27 | - persist_to_workspace: 28 | root: . 29 | paths: .venv 30 | check: 31 | docker: 32 | - image: circleci/python:3.7.2 33 | steps: 34 | - *step_add_path 35 | - *step_install_pipenv 36 | - *step_inproject_venv 37 | - checkout 38 | - attach_workspace: 39 | at: .
40 | - run: 41 | name: Check if the code is formatted 42 | command: poetry run black --check --quiet dataprep 43 | - run: 44 | name: Type check the project 45 | command: poetry run mypy dataprep 46 | - run: 47 | name: Test the project 48 | command: poetry run pytest --cov=dataprep 49 | - run: 50 | name: Style check the project 51 | command: poetry run pylint dataprep 52 | - run: 53 | name: Update coverage data to codecov 54 | command: poetry run codecov 55 | docs-build: 56 | docker: 57 | - image: circleci/python:3.7.2 58 | steps: 59 | - run: 60 | name: Pandoc Installation 61 | command: curl -L https://github.com/jgm/pandoc/releases/download/2.9.2.1/pandoc-2.9.2.1-1-amd64.deb -o /tmp/pandoc.deb && sudo dpkg -i /tmp/pandoc.deb 62 | - *step_add_path 63 | - *step_install_pipenv 64 | - *step_inproject_venv 65 | - checkout 66 | - attach_workspace: 67 | at: . 68 | - run: 69 | name: Build docs 70 | command: poetry run sphinx-build -M html docs/source docs/build 71 | - persist_to_workspace: 72 | root: . 73 | paths: docs/build/html 74 | docs-deploy: 75 | docker: 76 | - image: node:8.10.0 77 | steps: 78 | - add_ssh_keys: 79 | fingerprints: 80 | - "b7:f1:2a:54:c8:90:80:78:ba:30:d9:9b:b8:7d:03:10" 81 | - checkout 82 | - attach_workspace: 83 | at: . 84 | - run: 85 | name: Install and configure dependencies 86 | command: | 87 | npm install -g --silent gh-pages@2.0.1 88 | git config user.email "ci@sfu.db" 89 | git config user.name "ci" 90 | - run: 91 | name: Disable jekyll builds 92 | command: touch docs/build/html/.nojekyll 93 | - run: 94 | name: Deploy docs to gh-pages branch 95 | command: gh-pages --dotfiles --message "[skip ci] Updates" --dist docs/build/html 96 | workflows: 97 | version: 2 98 | build_and_test: 99 | jobs: 100 | - install_dependencies 101 | - check: 102 | requires: 103 | - install_dependencies 104 | - docs-build: 105 | requires: 106 | - install_dependencies 107 | - docs-deploy: 108 | requires: 109 | - check 110 | - docs-build 111 | filters: 112 | branches: 113 | only: master -------------------------------------------------------------------------------- /docs/source/eda/plot_missing.rst: -------------------------------------------------------------------------------- 1 | ====================================================== 2 | `plot_missing`: analyzing the impact of missing values 3 | ====================================================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | Overview 9 | ======== 10 | 11 | The goal of `plot_missing` is to analyze the impact of missing values. The impact means the change of characteristics (e.g., histogram for numerical column or bar chart for categorical column) of the dataset before and after removing the rows with missing values. `plot_missing` mainly provides the following functionalities: 12 | 13 | 1. `plot_missing(df)`: plot the position of missing values. 14 | 2. `plot_missing(df, x)`: plot the impact on basic characteristics (histogram and bar chart) of missing values in column x to all other columns. 15 | 3. `plot_missing(df, x, y)`: zoom into column y, and plot the impact on more characteristics of missing values in column x to column y. 16 | 17 | In the following, we use several examples to demonstrate the functionalities. 18 | 19 | Loading dataset 20 | =============== 21 | We support two types of dataframe: pandas dataframe and dask dataframe. 
Here we load the well-known `Titanic` dataset into a pandas dataframe and use it to demonstrate our functionality:: 22 | 23 | import pandas as pd 24 | df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl", na_values = ['?']) 25 | 26 | Plotting the position of missing values via `plot_missing(df)` 27 | ============================================================== 28 | 29 | Given a dataset, we can plot the position of missing values via plot_missing(df). The dataset is divided into bins, and we use colored bins to represent the number of missing values. The more missing values a bin contains, the darker its color. By default, we show 50 columns and each column is divided into 100 bins. We also show the percentage of missing values for each column in the label. The following is an example:: 30 | 31 | from dataprep.eda import plot_missing 32 | plot_missing(df) 33 | 34 | .. raw:: html 35 | 36 | 37 | 38 | 39 | The impact on basic characteristics of missing values in column x via `plot_missing(df, x)` 40 | =========================================================================================== 41 | 42 | After we know the positions of the missing values, we can further analyze their impact. We provide `plot_missing(df, x)` to analyze the impact of missing values in column x. The impact means the change in the characteristics of the dataset before and after removing the missing values. Here, we consider two types of characteristics: the histogram for numerical columns and the bar chart for categorical columns. When calling `plot_missing(df, x)`, users can see how the histograms of numerical columns and the bar charts of categorical columns differ before and after removing the missing values of column x. The following shows an example:: 43 | 44 | plot_missing(df, "age") 45 | 46 | .. raw:: html 47 | 48 | 49 | 50 | 51 | The impact on more characteristics of missing values in column x on column y via `plot_missing(df, x, y)` 52 | ========================================================================================================= 53 | 54 | `plot_missing(df, x)` only considers two types of characteristics, i.e., histogram and bar chart, for all columns. If the user wants to zoom into a specific column and analyze the impact on more characteristics, they can call `plot_missing(df, x, y)`. `plot_missing(df, x, y)` plots the impact of the missing values in column x on column y. The output plot is different depending on whether y is a numerical or a categorical column. 55 | 56 | When y is a numerical column, `plot_missing(df, x, y)` shows the impact on the histogram, pdf, cdf, and box plot. The following shows an example:: 57 | 58 | plot_missing(df, "age", "fare") 59 | 60 | .. raw:: html 61 | 62 | 63 | 64 | When y is a categorical column, `plot_missing(df, x, y)` shows the impact on the bar chart. The following shows an example:: 65 | 66 | plot_missing(df, "age", "sex") 67 | 68 | .. raw:: html 69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ----------------- 4 | 5 | [![License]](LICENSE) [![Doc Badge]](https://sfu-db.github.io/dataprep/) [![Version]](https://pypi.org/project/dataprep/) [![Python Version]](https://pypi.org/project/dataprep/) [![Downloads]](https://pepy.tech/project/dataprep) [![Codecov]](https://codecov.io/gh/sfu-db/dataprep) ![Build Status] [![Chat]](https://discord.gg/xwbkFNk) 6 | 7 | Dataprep lets you prepare your data using a single library with a few lines of code. 8 | 9 | Currently, you can use `dataprep` to: 10 | * Collect data from common data sources (through `dataprep.data_connector`) 11 | * Do your exploratory data analysis (through `dataprep.eda`) 12 | * ...more modules are coming 13 | 14 | 15 | [Documentation] | [Mail List & Forum] 16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install dataprep 21 | ``` 22 | 23 | ## Examples & Usages 24 | 25 | The following examples can give you an impression of what dataprep can do: 26 | 27 | * [Documentation: Data Connector](https://sfu-db.github.io/dataprep/data_connector.html) 28 | * [Documentation: EDA](https://sfu-db.github.io/dataprep/eda/introduction.html) 29 | * [EDA Case Study: Titanic](https://sfu-db.github.io/dataprep/case_study/titanic.html) 30 | * [EDA Case Study: House Price](https://sfu-db.github.io/dataprep/case_study/house_price.html) 31 | 32 | ### EDA 33 | 34 | There are common tasks during the exploratory data analysis stage, 35 | like a quick look at the columnar distribution, or understanding the correlations 36 | between columns. 37 | 38 | The EDA module categorizes these EDA tasks into functions helping you finish EDA 39 | tasks with a single function call. 40 | 41 | * Want to understand the distributions for each DataFrame column? Use `plot`. 42 | 43 |
44 | 45 | * Want to understand the correlation between columns? Use `plot_correlation`. 46 | 47 |
48 | 49 | * Or, if you want to understand the impact of the missing values for each column, use `plot_missing`. 50 | 51 |
52 | 53 | * You can drill down to get more information by giving `plot`, `plot_correlation` and `plot_missing` a column name. E.g. for `plot_missing` (see the sketch below): 54 | 55 |
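For example, a minimal sketch of the calls behind the screenshots above (assuming `df` is a pandas DataFrame you have already loaded; `"age"` is a placeholder column name):

```python
from dataprep.eda import plot, plot_correlation, plot_missing

plot(df)                 # distribution / bar chart for every column
plot_correlation(df)     # correlation matrix between columns
plot_missing(df)         # positions and percentages of missing values
plot_missing(df, "age")  # drill down: impact of the missing values in one column
```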
56 | 57 | Don't forget to check out the [examples] folder for detailed demonstrations! 58 | 59 | ### Data Connector 60 | 61 | You can download Yelp business search results into a pandas DataFrame, 62 | using two lines of code, without taking a deep look into the Yelp documentation! 63 | Moreover, Data Connector will automatically do the pagination for you so that 64 | you can specify the desired count of the returned results without even considering the count-per-request restriction from the API. 65 | 66 |
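A sketch of those two lines, under stated assumptions: the config path, `term`, and `location` mirror the documentation; `access_token` is a Yelp API key you supply; and the `_count` keyword (asking for 120 records) is an assumed name for the pagination parameter, not verified here:

```python
from dataprep.data_connector import Connector

dc = Connector("./DataConnectorConfigs/yelp", auth_params={"access_token": access_token})
df = dc.query("businesses", term="korean", location="seattle", _count=120)  # _count: assumed pagination parameter
```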
67 | 68 | _The code requests 120 records even though Yelp restricts you to fetching only 50 per request._ 69 | 70 | ## Contribute 71 | 72 | There are many ways to contribute to Dataprep. 73 | 74 | * Submit bugs and help us verify fixes as they are checked in. 75 | * Review the source code changes. 76 | * Engage with other Dataprep users and developers on StackOverflow. 77 | * Help each other in the [Dataprep Community Discord](https://discord.gg/FXsK2P) and [Mail list & Forum]. 78 | * [![Twitter]](https://twitter.com/sfu_db) 79 | * Contribute bug fixes. 80 | * Provide use cases and write down your user experience. 81 | 82 | Please take a look at our [wiki] for development documentation! 83 | 84 | 85 | [Build Status]: https://img.shields.io/circleci/build/github/sfu-db/dataprep/master?style=flat-square&token=f68e38757f5c98771f46d1c7e700f285a0b9784d 86 | [Documentation]: https://sfu-db.github.io/dataprep/ 87 | [Mail list & Forum]: https://groups.google.com/forum/#!forum/dataprep 88 | [wiki]: https://github.com/sfu-db/dataprep/wiki 89 | [examples]: https://github.com/sfu-db/dataprep/tree/master/examples 90 | [Chat]: https://img.shields.io/discord/702765817154109472?style=flat-square 91 | [License]: https://img.shields.io/pypi/l/dataprep?style=flat-square 92 | [Downloads]: https://pepy.tech/badge/dataprep 93 | [Python Version]: https://img.shields.io/pypi/pyversions/dataprep?style=flat-square 94 | [Version]: https://img.shields.io/pypi/v/dataprep?style=flat-square 95 | [Codecov]: https://img.shields.io/codecov/c/github/sfu-db/dataprep?style=flat-square 96 | [Twitter]: https://img.shields.io/twitter/follow/sfu_db?style=social 97 | [Doc Badge]: https://img.shields.io/badge/dynamic/json?color=blue&label=docs&prefix=v&query=%24.info.version&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fdataprep%2Fjson&style=flat-square 98 | -------------------------------------------------------------------------------- /docs/source/eda/introduction.rst: -------------------------------------------------------------------------------- 1 | 2 | An introduction to exploratory data analysis with `dataprep.eda` 3 | ================================================================ 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | .. topic:: Section contents 9 | 10 | In this section, we introduce how to do exploratory data analysis with `dataprep.eda` and give several 11 | simple examples. 12 | 13 | Exploratory data analysis: functionality description 14 | ---------------------------------------------------- 15 | 16 | `Exploratory data analysis (EDA) `_ is the procedure of exploring the dataset and summarizing its main characteristics. The goal of the `dataprep.eda` module is to simplify this procedure and allow users to explore as many important characteristics as possible via only a few APIs. Each API allows users to analyze the dataset from high level to low level and from different perspectives. Specifically, we provide the following functionalities: 17 | 18 | * **analyzing basic characteristics via `plot`**: we provide an API `plot` that allows users to analyze the basic characteristics of the dataset. It plots the distribution or bar chart for each column to give users a basic sense of the dataset. If the user is interested in one or two specific columns, it provides more detailed plots for those columns when the column names are passed as parameters. 19 | 20 | * **analyzing correlation between columns via `plot_correlation`**: We provide an API `plot_correlation` to analyze the correlation between columns. It plots the correlation matrix between columns.
If the user is interested in the correlated columns for a specific column, e.g., the most correlated columns to column 'A', the API can provide a more detailed analysis when column names are passed as parameters. 21 | 22 | * **analyzing the impact of missing values via `plot_missing`**: We provide an API `plot_missing` to analyze the pattern and impact of missing values. At first glance, it shows the position of missing values, which allows the user to be aware of the data quality of each column or find any underlying pattern of missing values. To understand the impact of missing values from a specific column, the user can pass the column name as a parameter. It will compare the distribution of each column with and without missing values from the given column, so that the user can understand the impact of the missing values. 23 | 24 | In the following, we briefly introduce `plot`, `plot_correlation` and `plot_missing` and demonstrate their basic functionalities. 25 | 26 | .. _demo: 27 | 28 | Analyzing basic characteristics via `plot` 29 | ------------------------------------------ 30 | 31 | To analyze the basic characteristics of the dataset, such as the distribution of each column, the user can call `eda.plot`. It mainly provides the following functionalities: 32 | 33 | 1. plot(df): plot basic characteristics (the histogram and the bar chart) for all columns. 34 | 2. plot(df, x): zoom into column x and plot more refined characteristics. 35 | 3. plot(df, x, y): zoom into column x and column y, and plot more refined characteristics to explore their relationship. 36 | 37 | In the following, we show an example of `plot(df)`, which plots the histogram for each numerical column and the bar chart for each categorical column:: 38 | 39 | from dataprep.eda import plot 40 | import pandas as pd 41 | df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/phpMawTba", na_values = [' ?']) 42 | plot(df) 43 | 44 | .. raw:: html 45 | 46 | 47 | 48 | 49 | Analyzing correlation via `plot_correlation` 50 | -------------------------------------------- 51 | To analyze the correlation between columns, we provide `plot_correlation`. Its main functionalities can be summarized as follows: 52 | 53 | 1. `plot_correlation(df)`: plot the correlation matrix of all columns. 54 | 2. `plot_correlation(df, x)`: plot the most correlated columns to column x. 55 | 3. `plot_correlation(df, x, y)`: plot the scatter plot between column x and column y, as well as the regression line. Besides, the point that has the most impact on the correlation value can be identified by passing a parameter. 56 | 4. `plot_correlation(df, x, y, k, value_range)`: filter the result by correlation value or by top-k. 57 | 58 | In the following, we show an example of `plot_correlation(df)`, which plots the correlation matrix for `Pearson `_, `Spearman `_ and `Kendall Tau `_ correlation:: 59 | 60 | from dataprep.eda import plot_correlation 61 | import pandas as pd 62 | df = pd.read_csv("https://www.openml.org/data/get_csv/4965268/wine-quality-red.arff") 63 | plot_correlation(df) 64 | 65 | .. raw:: html 66 | 67 | 68 | 69 | 70 | Analyzing missing values via `plot_missing` 71 | ------------------------------------------- 72 | To analyze the pattern and impact of missing values, we provide `plot_missing`. Its main functionalities can be summarized as follows: 73 | 74 | 1. `plot_missing(df)`: plot the position of missing values. 75 | 2.
`plot_missing(df, x)`: plot the impact on basic characteristics (histogram and bar chart) of missing values in column x to all other columns. 76 | 3. `plot_missing(df, x, y)`: zoom into column y, and plot the impact on more characteristics of missing values in column x to column y. 77 | 78 | In the following, we show an example of `plot_missing(df)`, which shows the positions of missing values as well as the percentage of missing value in each column:: 79 | 80 | from dataprep.eda import plot_missing 81 | import pandas as pd 82 | df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl", na_values = ['?']) 83 | plot_missing(df) 84 | 85 | .. raw:: html 86 | 87 | -------------------------------------------------------------------------------- /examples/DataConnector_DBLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Connector for DBLP \n", 8 | "\n", 9 | "In this example, we will be going over how to use Data Connector with DBLP." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Preprocessing\n", 17 | "\n", 18 | "data_connector is a component in the dataprep library that aims to simplify the data access by providing a standard API set. The goal is to help the users skip the complex API configuration. In this tutorial, we demonstrate how to use data_connector library with DBLP.\n", 19 | "\n", 20 | "If you haven't installed dataprep, run command `pip install dataprep` or execute the following cell." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "># Run me if you'd like to install\n", 30 | ">!pip install dataprep" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Download and store the configuration files in dataprep. \n", 38 | "\n", 39 | "The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs) or automatically downloaded at usage. \n", 40 | "\n", 41 | "Store the configuration file in the dataprep folder. " 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Initialize data_connector\n", 49 | "\n", 50 | "To initialize run the following code. Unlike Yelp and Spotify, tokens and client information are not needed." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from dataprep.data_connector import Connector\n", 60 | "dc = Connector(\"./DataConnectorConfigs/DBLP\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Functionalities\n", 68 | "\n", 69 | "Data connector has several functions you can perform to gain insight on the data downloaded from DBLP." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Connector.info\n", 77 | "The info method gives information and guidelines of using the connector. There are 3 sections in the response and they are table, parameters and examples.\n", 78 | ">1. Table - The table(s) being accessed.\n", 79 | ">2. Parameters - Identifies which parameters can be used to call the method. 
For DBLP, there is no required **parameter**. \n", 80 | ">3. Examples - Shows how you can call the methods in the Connector class." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "dc.info()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Connector.show_schema\n", 97 | "The show_schema method returns the schema of the website data to be returned in a Dataframe. There are two columns in the response. The first column is the column name and the second is the datatype.\n", 98 | "\n", 99 | "As an example, let's see what is in the publication table." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "dc.show_schema(\"publication\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Connector.query\n", 116 | "The query method downloads the website data and displays it in a Dataframe. The parameters must meet the requirements as indicated in Connector.info for the operation to run.\n", 117 | "\n", 118 | "When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations.\n", 119 | "\n", 120 | "As an example, let's try to get the data from the \"publication\" table, searching for \"lee\" (this assumes the DBLP configuration names its search parameter `q`)." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df = dc.query(\"publication\", q=\"lee\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "From the query results, you can see how easy it is to download the publication data from DBLP into a pandas Dataframe.\n", 137 | "\n", 138 | "Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code.\n", 139 | "\n", 140 | "\n", 141 | ">1. dc = Connector(...)\n", 142 | ">2. dc.query(...)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# That's all for now. \n", 150 | "If you are interested in writing your own configuration file or modifying an existing one, refer to the [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs)."
151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.7.7" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 4 175 | } 176 | -------------------------------------------------------------------------------- /dataprep/eda/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous functions 2 | """ 3 | import logging 4 | from math import ceil 5 | from typing import Any, Union, Optional 6 | import dask.dataframe as dd 7 | import numpy as np 8 | import pandas as pd 9 | from bokeh.models import Legend 10 | from bokeh.plotting import Figure 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def is_notebook() -> Any: 16 | """ 17 | :return: whether it is running in jupyter notebook 18 | """ 19 | try: 20 | # pytype: disable=import-error 21 | from IPython import get_ipython # pylint: disable=import-outside-toplevel 22 | 23 | # pytype: enable=import-error 24 | 25 | shell = get_ipython().__class__.__name__ 26 | if shell == "ZMQInteractiveShell": 27 | return True 28 | return False 29 | except (NameError, ImportError): 30 | return False 31 | 32 | 33 | def to_dask(df: Union[pd.DataFrame, dd.DataFrame]) -> dd.DataFrame: 34 | """ 35 | Convert a dataframe to a dask dataframe. 36 | """ 37 | if isinstance(df, dd.DataFrame): 38 | return df 39 | 40 | df_size = df.memory_usage(deep=True).sum() 41 | npartitions = ceil(df_size / 128 / 1024 / 1024) 42 | return dd.from_pandas(df, npartitions=npartitions) 43 | 44 | 45 | def sample_n(arr: np.ndarray, n: int) -> np.ndarray: # pylint: disable=C0103 46 | """ 47 | Sample n values uniformly from the range of the `arr`, 48 | not from the distribution of `arr`'s elems. 49 | """ 50 | if len(arr) <= n: 51 | return arr 52 | 53 | subsel = np.linspace(0, len(arr) - 1, n) 54 | subsel = np.floor(subsel).astype(int) 55 | return arr[subsel] 56 | 57 | 58 | def relocate_legend(fig: Figure, loc: str) -> Figure: 59 | """ 60 | Relocate legend(s) from center to `loc` 61 | """ 62 | remains = [] 63 | targets = [] 64 | for layout in fig.center: 65 | if isinstance(layout, Legend): 66 | targets.append(layout) 67 | else: 68 | remains.append(layout) 69 | fig.center = remains 70 | for layout in targets: 71 | fig.add_layout(layout, loc) 72 | 73 | return fig 74 | 75 | 76 | def cut_long_name(name: str, max_len: int = 12) -> str: 77 | """ 78 | If the name is longer than `max_len`, 79 | cut it to `max_len` length and append "..." 80 | """ 81 | # Bug 136 Fixed 82 | name = str(name) 83 | if len(name) <= max_len: 84 | return name 85 | return f"{name[:max_len]}..." 86 | 87 | 88 | def fuse_missing_perc(name: str, perc: float) -> str: 89 | """ 90 | Append (x.y%) to the name if `perc` is not 0 91 | """ 92 | if perc == 0: 93 | return name 94 | 95 | return f"{name} ({perc:.1%})" 96 | 97 | 98 | def nullity_filter( 99 | df: pd.DataFrame, 100 | filter_type: Optional[str] = None, 101 | p_cut_off: int = 0, 102 | n_cut_off: int = 0, 103 | ) -> pd.DataFrame: 104 | """ 105 | This function is designed to filters a DataFrame according to its nullity, 106 | using some combination of 'top' and 'bottom' numerical 107 | and percentage values. 
108 | Percentages and numerical thresholds can be specified simultaneously. 109 | Parameters 110 | ---------- 111 | df 112 | The DataFrame whose columns are being filtered. 113 | filter_type 114 | The orientation of the filter being applied to the DataFrame. 115 | One of "top", "bottom", or None (default). 116 | The filter will simply return the DataFrame if you leave the filter_type 117 | argument unspecified or as None. 118 | p_cut_off 119 | A completeness ratio cut-off. 120 | If non-zero the filter will limit the DataFrame to columns with at least p_cut_off 121 | completeness. Input should be in the range [0, 1]. 122 | n_cut_off 123 | A numerical cut-off. If non-zero no more than this number of columns will be returned. 124 | Returns 125 | The nullity-filtered `DataFrame`. 126 | Examples 127 | ---------- 128 | To get a DataFrame with columns of at least 75% completeness but with no more than 5 columns 129 | >>> nullity_filter(df, filter_type='top', p_cut_off=.75, n_cut_off=5) 130 | """ 131 | 132 | if filter_type == "top": 133 | if p_cut_off: 134 | df = df.iloc[ 135 | :, [c >= p_cut_off for c in df.count(axis="rows").values / len(df)] 136 | ] 137 | if n_cut_off: 138 | df = df.iloc[ 139 | :, np.sort(np.argsort(df.count(axis="rows").values)[-n_cut_off:]) 140 | ] 141 | elif filter_type == "bottom": 142 | if p_cut_off: 143 | df = df.iloc[ 144 | :, [c <= p_cut_off for c in df.count(axis="rows").values / len(df)] 145 | ] 146 | if n_cut_off: 147 | df = df.iloc[ 148 | :, np.sort(np.argsort(df.count(axis="rows").values)[:n_cut_off]) 149 | ] 150 | return df 151 | 152 | 153 | def nullity_sort( 154 | df: pd.DataFrame, sort: Optional[str] = None, axis: str = "columns" 155 | ) -> pd.DataFrame: 156 | """ 157 | This function sorts a DataFrame according to its nullity, 158 | in either ascending or descending order. 159 | Parameters 160 | ---------- 161 | df 162 | the pandas data_frame object being sorted. 163 | sort 164 | the sorting method: either "ascending", "descending", or None (default). 165 | Returns 166 | the nullity-sorted DataFrame. 167 | """ 168 | if sort is None: 169 | return df 170 | 171 | if axis == "columns": 172 | if sort == "ascending": 173 | return df.iloc[np.argsort(df.count(axis="columns").values), :] 174 | elif sort == "descending": 175 | return df.iloc[np.flipud(np.argsort(df.count(axis="columns").values)), :] 176 | else: 177 | raise ValueError( 178 | 'The "sort" parameter must be set to "ascending" or "descending".' 179 | ) 180 | elif axis == "rows": 181 | if sort == "ascending": 182 | return df.iloc[:, np.argsort(df.count(axis="rows").values)] 183 | elif sort == "descending": 184 | return df.iloc[:, np.flipud(np.argsort(df.count(axis="rows").values))] 185 | else: 186 | raise ValueError( 187 | 'The "sort" parameter must be set to "ascending" or "descending".' 188 | ) 189 | else: 190 | raise ValueError('The "axis" parameter must be set to "rows" or "columns".') 191 | -------------------------------------------------------------------------------- /dataprep/data_connector/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines useful types in this library. 3 | """ 4 | from base64 import b64encode 5 | from enum import Enum 6 | from time import time 7 | from typing import Any, Dict, Optional, cast 8 | from sys import stderr 9 | import requests 10 | from jinja2 import Environment, UndefinedError 11 | 12 | from ..errors import UnreachableError 13 | 14 | 15 | class AuthorizationType(Enum): 16 | """Enum class that defines the supported authorization methods in this library.
17 | 18 | Note 19 | ---- 20 | 21 | * Bearer: requires 'access_token' presented in user params 22 | * OAuth2: requires 'client_id' and 'client_secret' in user params for 23 | 'ClientCredentials' grant type 24 | """ 25 | 26 | Bearer = "Bearer" 27 | OAuth2 = "OAuth2" 28 | 29 | 30 | class Authorization: 31 | """Class carries the authorization type and 32 | the corresponding parameter. 33 | """ 34 | 35 | auth_type: AuthorizationType 36 | params: Dict[str, str] 37 | storage: Dict[str, Any] 38 | 39 | def __init__(self, auth_type: AuthorizationType, params: Dict[str, str]) -> None: 40 | self.auth_type = auth_type 41 | self.params = params 42 | self.storage = {} 43 | 44 | def build(self, req_data: Dict[str, Any], params: Dict[str, Any]) -> None: 45 | """Populate some required fields to the request data. 46 | Complex logic may also happens in this function (e.g. start a server to do OAuth). 47 | """ 48 | if self.auth_type == AuthorizationType.Bearer: # pylint: disable=no-member 49 | req_data["headers"]["Authorization"] = f"Bearer {params['access_token']}" 50 | elif ( 51 | self.auth_type == AuthorizationType.OAuth2 52 | and self.params["grantType"] == "ClientCredentials" 53 | ): 54 | # TODO: Move OAuth to a separate authenticator 55 | if ( 56 | "access_token" not in self.storage 57 | or self.storage.get("expires_at", 0) < time() 58 | ): 59 | # Not yet authorized 60 | ckey = params["client_id"] 61 | csecret = params["client_secret"] 62 | b64cred = b64encode(f"{ckey}:{csecret}".encode("ascii")).decode() 63 | resp = requests.post( 64 | self.params["tokenServerUrl"], 65 | headers={"Authorization": f"Basic {b64cred}"}, 66 | data={"grant_type": "client_credentials"}, 67 | ).json() 68 | 69 | assert resp["token_type"].lower() == "bearer" 70 | access_token = resp["access_token"] 71 | self.storage["access_token"] = access_token 72 | if "expires_in" in resp: 73 | self.storage["expires_at"] = ( 74 | time() + resp["expires_in"] - 60 75 | ) # 60 seconds grace period to avoid clock lag 76 | 77 | req_data["headers"][ 78 | "Authorization" 79 | ] = f"Bearer {self.storage['access_token']}" 80 | 81 | # TODO: handle auto refresh 82 | elif ( 83 | self.auth_type == AuthorizationType.OAuth2 84 | and self.params["grantType"] == "AuthorizationCode" 85 | ): 86 | raise NotImplementedError 87 | 88 | 89 | class Fields: 90 | """A data structure that stores the fields information (e.g. headers, cookies, ...). 91 | This class is useful to populate concrete fields data with required variables provided. 92 | """ 93 | 94 | fields: Dict[str, Any] 95 | 96 | def __init__(self, fields_config: Dict[str, Any]) -> None: 97 | self.fields = fields_config 98 | 99 | def populate( # pylint: disable=too-many-branches 100 | self, jenv: Environment, params: Dict[str, Any] 101 | ) -> Dict[str, str]: 102 | """Populate a dict based on the fields definition and provided vars. 
103 | """ 104 | ret: Dict[str, str] = {} 105 | 106 | for key, def_ in self.fields.items(): 107 | from_key, to_key = key, key 108 | 109 | if isinstance(def_, bool): 110 | required = def_ 111 | value = params.get(from_key) 112 | if value is None and required: 113 | raise KeyError(from_key) 114 | remove_if_empty = False 115 | elif isinstance(def_, str): 116 | # is a template 117 | template: Optional[str] = def_ 118 | tmplt = jenv.from_string(cast(str, template)) 119 | value = tmplt.render(**params) 120 | remove_if_empty = False 121 | elif isinstance(def_, dict): 122 | template = def_.get("template") 123 | remove_if_empty = def_["removeIfEmpty"] 124 | to_key = def_.get("toKey") or to_key 125 | from_key = def_.get("fromKey") or from_key 126 | 127 | if template is None: 128 | required = def_["required"] 129 | value = params.get(from_key) 130 | if value is None and required: 131 | raise KeyError(from_key) 132 | else: 133 | tmplt = jenv.from_string(template) 134 | try: 135 | value = tmplt.render(**params) 136 | except UndefinedError: 137 | value = "" # This empty string will be removed if `remove_if_empty` is True 138 | else: 139 | raise UnreachableError() 140 | 141 | if value is not None: 142 | str_value = str(value) 143 | 144 | if not (remove_if_empty and not str_value): 145 | if to_key in ret: 146 | print(f"Param {key} conflicting with {to_key}", file=stderr) 147 | ret[to_key] = str_value 148 | continue 149 | return ret 150 | 151 | 152 | class Orient(Enum): 153 | """Different types of table orientations 154 | ref: (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html). 155 | Currently, DataConnector supports two different types of orientaions: 156 | 157 | 1. Split, which is column store. 158 | 2. Records, which is row store. 159 | 160 | Details can be found in the pandas page. 161 | """ 162 | 163 | Split = "split" 164 | Records = "records" 165 | -------------------------------------------------------------------------------- /docs/source/_static/images/plot_missing/df_x_cat.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Report 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
38 | 39 | 40 | 41 | 42 | 43 | 46 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /dataprep/eda/dtypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | In this module lives the type tree. 3 | """ 4 | 5 | 6 | from typing import Any, Dict, Optional, Union, Type 7 | 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import dask.dataframe as dd 12 | 13 | from ..errors import UnreachableError 14 | 15 | CATEGORICAL_NUMPY_DTYPES = [np.bool, np.object] 16 | CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype] 17 | CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES 18 | 19 | NUMERICAL_NUMPY_DTYPES = [np.number] 20 | NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES 21 | 22 | DATETIME_NUMPY_DTYPES = [np.datetime64] 23 | DATETIME_PANDAS_DTYPES = [pd.DatetimeTZDtype] 24 | DATETIME_DTYPES = DATETIME_NUMPY_DTYPES + DATETIME_PANDAS_DTYPES 25 | 26 | 27 | class DType: 28 | """ 29 | Root of Type Tree 30 | """ 31 | 32 | 33 | ############## Syntactic DTypes ############## 34 | class Categorical(DType): 35 | """ 36 | Type Categorical 37 | """ 38 | 39 | 40 | class Nominal(Categorical): 41 | """ 42 | Type Nominal, Subtype of Categorical 43 | """ 44 | 45 | 46 | class Ordinal(Categorical): 47 | """ 48 | Type Ordinal, Subtype of Categorical 49 | """ 50 | 51 | 52 | class Numerical(DType): 53 | """ 54 | Type Numerical 55 | """ 56 | 57 | 58 | class Continuous(Numerical): 59 | """ 60 | Type Continuous, Subtype of Numerical 61 | """ 62 | 63 | 64 | class Discrete(Numerical): 65 | """ 66 | Type Discrete, Subtype of Numerical 67 | """ 68 | 69 | 70 | ############## Semantic DTypes ############## 71 | 72 | 73 | class DateTime(Numerical): 74 | """ 75 | Type DateTime, Subtype of Numerical 76 | """ 77 | 78 | 79 | class Text(Nominal): 80 | """ 81 | Type Text, Subtype of Nominal 82 | """ 83 | 84 | 85 | ############## End of the Type Tree ############## 86 | 87 | DTypeOrStr = Union[DType, Type[DType], str, None] 88 | DTypeDict = Union[Dict[str, Union[DType, Type[DType], str]], None] 89 | DTypeDef = Union[Dict[str, Union[DType, Type[DType], str]], DType, Type[DType], None] 90 | 91 | 92 | def detect_dtype(col: dd.Series, known_dtype: Optional[DTypeDef] = None,) -> DType: 93 | """ 94 | Given a column, detect its type or transform its type according to users' specification 95 | 96 | Parameters 97 | ---------- 98 | col: dask.datafram.Series 99 | A dataframe column 100 | known_dtype: Optional[Union[Dict[str, Union[DType, str]], DType]], default None 101 | A dictionary or single DType given by users to specify the types for designated columns or 102 | all columns. E.g. known_dtype = {"a": Continuous, "b": "Nominal"} or 103 | known_dtype = {"a": Continuous(), "b": "nominal"} or 104 | known_dtype = Continuous() or known_dtype = "Continuous" or known_dtype = Continuous() 105 | """ 106 | if not known_dtype: 107 | return detect_without_known(col) 108 | 109 | if isinstance(known_dtype, dict): 110 | if col.name in known_dtype: 111 | dtype = normalize_dtype(known_dtype[col.name]) 112 | return map_dtype(dtype) 113 | 114 | elif isinstance(normalize_dtype(known_dtype), DType): 115 | return map_dtype(normalize_dtype(known_dtype)) 116 | 117 | return detect_without_known(col) 118 | 119 | 120 | def map_dtype(dtype: DType) -> DType: 121 | """ 122 | Currently, we want to keep our Type System flattened. 
123 | We will map Categorical() to Nominal() and Numerical() to Continuous() 124 | """ 125 | if ( 126 | isinstance(dtype, Categorical) is True 127 | and isinstance(dtype, Ordinal) is False 128 | and isinstance(dtype, Nominal) is False 129 | ): 130 | return Nominal() 131 | elif ( 132 | isinstance(dtype, Numerical) is True 133 | and isinstance(dtype, Continuous) is False 134 | and isinstance(dtype, Discrete) is False 135 | ): 136 | return Continuous() 137 | else: 138 | return dtype 139 | 140 | 141 | def detect_without_known(col: dd.Series) -> DType: 142 | """ 143 | This function detects dtypes of column when users didn't specify. 144 | """ 145 | if is_nominal(col.dtype): 146 | return Nominal() 147 | 148 | elif is_continuous(col.dtype): 149 | return Continuous() 150 | 151 | elif is_datetime(col.dtype): 152 | return DateTime() 153 | else: 154 | raise UnreachableError 155 | 156 | 157 | def is_dtype(dtype1: DType, dtype2: DType) -> bool: 158 | """ 159 | This function detects if dtype2 is dtype1. 160 | """ 161 | return isinstance(dtype1, dtype2.__class__) 162 | 163 | 164 | def normalize_dtype(dtype_repr: Any) -> DType: 165 | """ 166 | This function normalizes a dtype repr. 167 | """ 168 | normalized: DType 169 | str_dic = { 170 | "Categorical": Categorical, 171 | "Ordinal": Ordinal, 172 | "Nominal": Nominal, 173 | "Numerical": Numerical, 174 | "Continuous": Continuous, 175 | "Discrete": Discrete, 176 | "DateTime": DateTime, 177 | "Text": Text, 178 | } 179 | for str_dtype, dtype in str_dic.items(): 180 | if isinstance(dtype_repr, str): 181 | if dtype_repr.lower() == str_dtype.lower(): 182 | normalized = dtype() 183 | break 184 | 185 | elif isinstance(dtype_repr, dtype): 186 | normalized = dtype_repr 187 | break 188 | 189 | elif dtype_repr == dtype: 190 | normalized = dtype() 191 | break 192 | 193 | return normalized 194 | 195 | 196 | def is_nominal(dtype: Any) -> bool: 197 | """ 198 | Given a type, return if that type is a nominal type 199 | """ 200 | 201 | if is_continuous(dtype) or is_datetime(dtype): 202 | return False 203 | 204 | if isinstance(dtype, np.dtype): 205 | dtype = dtype.type 206 | 207 | return any(issubclass(dtype, c) for c in CATEGORICAL_NUMPY_DTYPES) 208 | else: 209 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 210 | 211 | 212 | def is_continuous(dtype: Any) -> bool: 213 | """ 214 | Given a type, return if that type is a continuous type 215 | """ 216 | dtype = dtype.type 217 | return any(issubclass(dtype, c) for c in NUMERICAL_NUMPY_DTYPES) 218 | 219 | 220 | def is_datetime(dtype: Any) -> bool: 221 | """ 222 | Given a type, return if that type is a datetime type 223 | """ 224 | if isinstance(dtype, np.dtype): 225 | dtype = dtype.type 226 | return any(issubclass(dtype, c) for c in DATETIME_NUMPY_DTYPES) 227 | else: 228 | return any(isinstance(dtype, c) for c in DATETIME_PANDAS_DTYPES) 229 | 230 | 231 | def is_pandas_categorical(dtype: Any) -> bool: 232 | """ 233 | Detect if a dtype is categorical and from pandas. 234 | """ 235 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 236 | -------------------------------------------------------------------------------- /examples/DataConnector_Yelp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Connector for Yelp \n", 8 | "\n", 9 | "In this example, we will be going over how to use Data Connector with Yelp." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Preprocessing\n", 17 | "\n", 18 | "data_connector is a component in the dataprep library that aims to simplify data access by providing a standard set of APIs. The goal is to help users skip complex API configuration. In this tutorial, we demonstrate how to use the data_connector library with Yelp.\n", 19 | "\n", 20 | "If you haven't installed dataprep, run the command `pip install dataprep` or execute the following cell." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# Run me if you'd like to install\n", 30 | "!pip install dataprep" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Obtaining an access token from Yelp\n", 38 | "\n", 39 | "To connect to Yelp, you need to generate a token. This token is a unique identifier of an application requesting access to Yelp's API. Once an application creates the token, it will act as your credential when making an API request. \n", 40 | "\n", 41 | "To receive an access token, the user needs to create a server-side application from Yelp. You can get a token by following the [Yelp documentation](https://www.yelp.com/developers/documentation/v3/authentication).\n", 42 | "Simply create an application and generate a key." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Store the token or API Key in a secure location as it will be used to provide you access to Yelp's restaurant data." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Download and store the configuration files in dataprep. \n", 57 | "\n", 58 | "The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs) or automatically downloaded at usage. \n", 59 | "\n", 60 | "Store the configuration files in the dataprep folder. " 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Initialize data_connector\n", 68 | "\n", 69 | "To initialize, run the following code. Copy and paste the Yelp API key into the **access_token** variable and ensure the connector path is correct. Once you have that running, you can use the built-in functions available in the connector." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from dataprep.data_connector import Connector\n", 79 | "access_token = \"insert_token_key\"\n", 80 | "dc = Connector(\"./DataConnectorConfigs/yelp\", auth_params={\"access_token\":access_token})" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Functionalities\n", 88 | "\n", 89 | "Data connector has several functions you can perform to gain insight into the data downloaded from Yelp." 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Connector.info\n", 97 | "The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples.\n", 98 | ">1. Table - The table(s) being accessed.\n", 99 | ">2. Parameters - Identifies which parameters can be used to call the method. 
For Yelp, the required parameters are **term** and **location**. \n", 100 | ">3. Examples - Shows how you can call the methods in the Connector class." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "dc.info()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Connector.show_schema\n", 117 | "The show_schema method returns the schema of the website data as a Dataframe. There are two columns in the response: the first column is the column name and the second is the datatype.\n", 118 | "\n", 119 | "As an example, let's see what is in the business table." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "dc.show_schema(\"business\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Connector.query\n", 136 | "The query method downloads the website data and displays it in a Dataframe. The parameters must meet the requirements as indicated in connector.info for the operation to run.\n", 137 | "\n", 138 | "When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations.\n", 139 | "\n", 140 | "As an example, let's try to get the data from the \"business\" table, providing the term \"city\" and location \"seattle\"." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "df = dc.query(\"businesses\", term=\"city\", location=\"seattle\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "From the query results, you can see how easy it is to download the restaurant data from Yelp into a pandas Dataframe. \n", 157 | "\n", 158 | "Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code.\n", 159 | "\n", 160 | ">1. dc = Connector(...)\n", 161 | ">2. dc.query(...)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# That's all for now. \n", 169 | "If you are interested in writing your own configuration file or modifying an existing one, refer to the [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs)." 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 3", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.7.7" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /dataprep/eda/basic/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot(df) function.
3 | """ 4 | 5 | from typing import Optional, Tuple, Union, Dict 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute 12 | from .render import render 13 | from ..report import Report 14 | from ..dtypes import DTypeDef 15 | 16 | __all__ = ["plot", "compute", "render"] 17 | 18 | 19 | def plot( 20 | df: Union[pd.DataFrame, dd.DataFrame], 21 | x: Optional[str] = None, 22 | y: Optional[str] = None, 23 | z: Optional[str] = None, 24 | *, 25 | bins: int = 10, 26 | ngroups: int = 10, 27 | largest: bool = True, 28 | nsubgroups: int = 5, 29 | timeunit: str = "auto", 30 | agg: str = "mean", 31 | sample_size: int = 1000, 32 | value_range: Optional[Tuple[float, float]] = None, 33 | yscale: str = "linear", 34 | tile_size: Optional[float] = None, 35 | dtype: Optional[DTypeDef] = None, 36 | top_words: Optional[int] = 30, 37 | stopword: Optional[bool] = True, 38 | lemmatize: Optional[bool] = False, 39 | stem: Optional[bool] = False, 40 | ) -> Report: 41 | """Generates plots for exploratory data analysis. 42 | 43 | If no columns are specified, the distribution of 44 | each coloumn is plotted. A histogram is plotted if the 45 | column contains numerical values, a bar chart is plotted 46 | if the column contains categorical values, a line chart is 47 | plotted if the column is of type datetime. 48 | 49 | If one column (x) is specified, the 50 | distribution of x is plotted in various ways. If x 51 | contains categorical values, a bar chart and pie chart are 52 | plotted. If x contains numerical values, a histogram, 53 | kernel density estimate plot, box plot, and qq plot are plotted. 54 | If x contains datetime values, a line chart is plotted. 55 | 56 | If two columns (x and y) are specified, plots depicting 57 | the relationship between the variables will be displayed. If 58 | x and y contain numerical values, a scatter plot, hexbin 59 | plot, and binned box plot are plotted. If one of x and y 60 | contain categorical values and the other contains numerical values, 61 | a box plot and multiline histogram are plotted. If x and y 62 | contain categorical vales, a nested bar chart, stacked bar chart, and 63 | heat map are plotted. If one of x and y contains datetime values 64 | and the other contains numerical values, a line chart and a box plot 65 | are shown. If one of x and y contains datetime values and the other 66 | contains categorical values, a multiline chart and a stacked box plot 67 | are shown. 68 | 69 | If x, y, and z are specified, they must be one each of type datetime, 70 | numerical, and categorical. A multiline chart containing an aggregate 71 | on the numerical column grouped by the categorical column over time is 72 | plotted. 73 | 74 | 75 | Parameters 76 | ---------- 77 | df 78 | Dataframe from which plots are to be generated 79 | x: Optional[str], default None 80 | A valid column name from the dataframe 81 | y: Optional[str], default None 82 | A valid column name from the dataframe 83 | z: Optional[str], default None 84 | A valid column name from the dataframe 85 | bins: int, default 10 86 | For a histogram or box plot with numerical x axis, it defines 87 | the number of equal-width bins to use when grouping. 88 | ngroups: int, default 10 89 | When grouping over a categorical column, it defines the 90 | number of groups to show in the plot. Ie, the number of 91 | bars to show in a bar chart. 
92 | largest: bool, default True 93 | If true, when grouping over a categorical column, the groups 94 | with the largest count will be output. If false, the groups 95 | with the smallest count will be output. 96 | nsubgroups: int, default 5 97 | If x and y are categorical columns, ngroups refers to 98 | how many groups to show from column x, and nsubgroups refers to 99 | how many subgroups to show from column y in each group in column x. 100 | timeunit: str, default "auto" 101 | Defines the time unit to group values over for a datetime column. 102 | It can be "year", "quarter", "month", "week", "day", "hour", 103 | "minute", or "second". With default value "auto", it will use the 104 | time unit such that the resulting number of groups is closest to 15. 105 | agg: str, default "mean" 106 | Specify the aggregate to use when aggregating over a numerical 107 | column 108 | sample_size: int, default 1000 109 | Sample size for the scatter plot 110 | value_range: Optional[Tuple[float, float]], default None 111 | The lower and upper bounds on the range of a numerical column. 112 | Applies when column x is specified and column y is unspecified. 113 | yscale 114 | The scale to show on the y axis. Can be "linear" or "log". 115 | tile_size: Optional[float], default None 116 | Size of the tile for the hexbin plot. Measured from the middle 117 | of a hexagon to its left or right corner. 118 | dtype: str or DType or dict of str or dict of DType, default None 119 | Specify Data Types for designated column or all columns. 120 | E.g. dtype = {"a": Continuous, "b": "Nominal"} or 121 | dtype = {"a": Continuous(), "b": "nominal"} 122 | or dtype = Continuous() or dtype = "Continuous" 123 | top_words: int, default 30 124 | Specify the number of words to show in the wordcloud and 125 | word frequency bar chart 126 | stopword: bool, default True 127 | Eliminate the stopwords in the text data for plotting wordcloud and 128 | word frequency bar chart 129 | lemmatize: bool, default False 130 | Lemmatize the words in the text data for plotting wordcloud and 131 | word frequency bar chart 132 | stem: bool, default False 133 | Apply Porter stemming to the text data for plotting wordcloud and 134 | word frequency bar chart 135 | Examples 136 | -------- 137 | >>> import pandas as pd 138 | >>> from dataprep.eda import * 139 | >>> iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 140 | >>> plot(iris) 141 | >>> plot(iris, "petal_length", bins=20, value_range=(1,5)) 142 | >>> plot(iris, "petal_width", "species") 143 | """ 144 | # pylint: disable=too-many-locals,line-too-long 145 | 146 | intermediate = compute( 147 | df, 148 | x=x, 149 | y=y, 150 | z=z, 151 | bins=bins, 152 | ngroups=ngroups, 153 | largest=largest, 154 | nsubgroups=nsubgroups, 155 | timeunit=timeunit.lower(), 156 | agg=agg, 157 | sample_size=sample_size, 158 | value_range=value_range, 159 | dtype=dtype, 160 | top_words=top_words, 161 | stopword=stopword, 162 | lemmatize=lemmatize, 163 | stem=stem, 164 | ) 165 | figure = render(intermediate, yscale=yscale, tile_size=tile_size) 166 | 167 | return Report(figure) 168 | -------------------------------------------------------------------------------- /examples/EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "If you haven't installed dataprep, run the command `pip install dataprep` or execute the following cell" 8 | ] 9
| }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Run me if you'd like to install\n", 17 | "!pip install dataprep" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from dataprep.eda import plot, plot_correlation, plot_missing" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Load data" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv(\"https://s3-us-west-2.amazonaws.com/dataprep.dsl/datasets/suicide-rate.csv\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Plot the distribution of each column in the dataframe. \n", 51 | "For a numeric column, show a histogram. For a categorical column, show a bar chart." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df[\"year\"] = df[\"year\"].astype(\"category\")\n", 61 | "plot(df)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# Show the plots of the given column. If the column is numeric, show a kernel density plot, box plot and qqnorm plot.\n", 69 | "If the column is categorical, show a bar plot and pie plot." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "plot(df, \"sex\")\n", 79 | "plot(df, \"gdp_per_capita\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Show the plots of the relationship of the given two columns. \n", 87 | "* For numeric-categorical, show the box plot for each category.\n", 88 | "* For numeric-numeric, show the heatmap\n", 89 | "* For categorical-categorical, show the bar chart of col_x for each category of col_y" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "plot(df, \"suicides\", \"sex\")\n", 99 | "plot(df, \"population\", \"suicides\")\n", 100 | "plot(df, \"country\", \"generation\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Show correlation matrix plots using each method (pearson, kendall, spearman)\n", 108 | "If k is specified, in each matrix plot, only show the top-k positive cells and set the color of other cells to white. (Do you want to know the top-k negative cells?)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df_without_missing = df.dropna(axis='columns')\n", 118 | "plot_correlation(df_without_missing)\n", 119 | "plot_correlation(df_without_missing, k=1)\n", 120 | "plot_correlation(df_without_missing, value_range=(0,1))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Show the 3 cols that correspond to x in the correlation matrix (pearson, kendall, spearman)\n", 128 | "if k is specified, sort the result based on correlation and 
show the 3 cols that correspond to the top-k correlation values" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "plot_correlation(df_without_missing, \"suicides\")\n", 138 | "plot_correlation(df_without_missing, \"suicides\", k=2)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# if value_range is specified, show the correlation values in value_range." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "plot_correlation(df_without_missing, \"suicides\", value_range=[-1, 0.3])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "# if no correlation is in the range, show a blank figure." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "plot_correlation(df_without_missing, \"suicides\", value_range=[-1, -0.8])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "plot_correlation(df_without_missing, x=\"population\", y=\"suicides_no\")\n", 180 | "plot_correlation(df_without_missing, x=\"population\", y=\"suicides\", k=5)\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Show the location/position and percentage of missing data" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "plot_missing(df, num_bins=100)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "## If one wants to remove the rows whose x is missing, \n", 204 | "show the impact of the removed rows on the other columns. " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "plot_missing(df, 'HDI_for_year')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## If one wants to remove the rows whose x is missing, show the impact of the removed rows on column y. 
" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "plot_missing(df, 'HDI_for_year', 'population')\n", 230 | "plot_missing(df, 'HDI_for_year', 'sex')\n", 231 | "plot_missing(df, 'HDI_for_year', \"country\")" 232 | ] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.7.5" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 4 256 | } 257 | -------------------------------------------------------------------------------- /docs/source/DC_DBLP_tut.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================================== 3 | Tutorial - Data Connector for DBLP 4 | ================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | data_connector is a component in the dataprep library that aims to simplify the data access by providing a standard API set. 13 | The goal is to help the users skip the complex API configuration. In this tutorial, we demonstrate how to use data_connector library with DBLP. 14 | 15 | Preprocessing 16 | ================ 17 | If you haven't installed dataprep, run command pip install dataprep or execute the following cell. 18 | 19 | :: 20 | 21 | !pip install dataprep 22 | 23 | 24 | Download and store the configuration files in dataprep 25 | ================================================================ 26 | The configuration files are used to construct the parameters and initial setup for the API. The available configuration files can be manually downloaded here: `Configuration Files 27 | `_ or automatically downloaded at usage. 28 | 29 | 30 | 31 | To automatically download at usage, click on the clipboard button, unsure you are cloning with HTTPS. Go into your terminal, and find an appropriate locate to store the configuration files. 32 | When you decided on a location, enter the command ``git clone https://github.com/sfu-db/DataConnectorConfigs.git``. This will clone the git repository to the desired location; as a suggestion store it with the dataprep folder. 33 | 34 | 35 | From here you can proceed with the next steps. 36 | 37 | .. image:: _static/images/tutorial/dc_git.png 38 | :align: center 39 | :width: 1000 40 | :height: 500 41 | 42 | 43 | .. image:: _static/images/tutorial/dc_git_clone.png 44 | :align: center 45 | :width: 725 46 | :height: 125 47 | 48 | 49 | Below the configuration file are stored with dataprep. 50 | 51 | .. image:: _static/images/tutorial/Config_destination.png 52 | :align: center 53 | :width: 586 54 | :height: 132 55 | 56 | 57 | 58 | Initialize data_connector 59 | ============================= 60 | To initialize, run the following code. 61 | 62 | :: 63 | 64 | from dataprep.data_connector import Connector 65 | dc = Connector("./DataConnectorConfigs/DBLP") 66 | 67 | Functionalities 68 | =================== 69 | Data connector has several functions you can perform to gain insight on the data downloaded from DBLP. 
70 | 71 | Connector.info 72 | ------------------ 73 | | The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples. 74 | | 75 | | a. Table - The table(s) being accessed. 76 | | b. Parameters - Identifies which parameters can be used to call the method. For DBLP, there is no required **parameter**. 77 | | c. Examples - Shows how you can call the methods in the Connector class. 78 | 79 | 80 | :: 81 | 82 | dc.info() 83 | 84 | .. image:: _static/images/tutorial/dc_dblp_info.png 85 | :align: center 86 | :width: 300 87 | :height: 200 88 | 89 | Parameters 90 | ********************** 91 | | A parameter is a piece of information you supply to a query right as you run it. The parameters for DBLP are **q**, **h**, and **f**, and they are described below. 92 | | 93 | | a. **q** - Optional - The query string used to search for author profiles, conferences, journals, or individual publications in the database. 94 | | b. **h** - Optional - Maximum number of search results (hits) to return. 95 | | c. **f** - Optional - The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, this parameter can be used for pagination of search results. 96 | 97 | There are additional parameters to query with DBLP. If you are interested in reading up on the other available parameters and setting up your own config files, please read this `DBLP link 98 | `_ and this `Configuration Files link 99 | `_. 100 | 101 | 102 | Connector.show_schema 103 | -------------------------- 104 | The show_schema method returns the schema of the website data in a Dataframe format. There are two columns in the response: the first is the column name and the second is the datatype. 105 | As an example, let's see what is in the publication table. 106 | 107 | :: 108 | 109 | dc.show_schema("publication") 110 | 111 | .. image:: _static/images/tutorial/dc_dblp_show_schema.png 112 | :align: center 113 | :width: 212 114 | :height: 295 115 | 116 | Connector.query 117 | ------------------ 118 | The query method downloads the website data. The parameters must meet the requirements as indicated in connector.info for the operation to run. 119 | 120 | When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations. 121 | 122 | As an example, let's try to get the data from the "publication" table, providing the search query "lee". 123 | 124 | :: 125 | 126 | dc.query("publication", q="lee") 127 | 128 | .. image:: _static/images/tutorial/dc_dblp_query.png 129 | :align: center 130 | :width: 1000 131 | :height: 500 132 | 133 | From the query results, you can see how easy it is to download the publication data from DBLP into a pandas Dataframe. 134 | Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code. 135 | 136 | :: 137 | 138 | dc = Connector(...) 139 | dc.query(...) 140 | 141 | Pagination 142 | =================== 143 | | Another feature available in the config files is pagination. Pagination is the process of dividing results into discrete pages and allowing visitors to switch between them. It lets you control the total number of records a query returns. 144 | | 145 | | To use pagination, you need to include **_count** in your query. 
The **_count** parameter represents the number of records a user would like to return, which can be larger than the maximum number of records the API itself returns per call. Users can still fetch multiple pages of records by using parameters like limit and offset; however, this requires users to understand how pagination works across different websites' APIs. 146 | | 147 | 148 | :: 149 | 150 | dc.query("publication", q = "lee", _count = 200) 151 | 152 | .. image:: _static/images/tutorial/dc_dblp_pagination.png 153 | :align: center 154 | :width: 1000 155 | :height: 500 156 | 157 | Pagination does not work concurrently with the **h** parameter in a query; you need to select either **h** or **_count**. 158 | 159 | All publications of one specific author 160 | ========================================================= 161 | | In the query, **q** is a generic search parameter that finds author profiles, conferences, journals, or individual publications in the database. As a parameter, **q** is not great when trying to find specific authors and their work. To solve this issue, you can query by the author's first and last name. 162 | | 163 | | To fetch all publications of one specific author, you need to include **first_name="______"**, **last_name="______"** in your query. 164 | 165 | :: 166 | 167 | dc.query("publication", first_name = "Jeff", last_name = "Hawkins") 168 | 169 | .. image:: _static/images/tutorial/dc_dblp_author.png 170 | :align: center 171 | :width: 1000 172 | :height: 500 173 | 174 | That's all for now. 175 | =================== 176 | Please visit the other tutorials that are available if you are interested in setting up a different data connector. 177 | If you are interested in writing your own configuration file or modifying an existing one, refer to the `Configuration Files 178 | `_. -------------------------------------------------------------------------------- /docs/source/eda/plot_correlation.rst: -------------------------------------------------------------------------------- 1 | ============================================================= 2 | `plot_correlation`: analyzing the correlation between columns 3 | ============================================================= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 9 | Overview 10 | ======== 11 | 12 | The goal of `plot_correlation` is to analyze the correlation between columns. It provides the following functionalities: 13 | 14 | 1. `plot_correlation(df)`: plot the correlation matrix of all columns. 15 | 2. `plot_correlation(df, x)`: plot the most correlated columns to column x. 16 | 3. `plot_correlation(df, x, y)`: plot the scatter plot between column x and column y, as well as the regression line. In addition, the points that have the most impact on the correlation value can be identified by passing a parameter. 17 | 4. `plot_correlation(df, x, y, k, value_range)`: filter the result by correlation value or by top-k. 18 | 19 | .. 20 | The following table summarizes the output plots for different settings of x and y. 
21 | 22 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 23 | | | **plot_correlation(df, x, y)** | | 24 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 25 | | **x** | **y** | **output plots** | 26 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 27 | | None | None | n*n correlation matrix for Pearson, Spearman and KendallTau correlation, where n is min(50, len(df.columns)) | 28 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 29 | | Numerical | None | n*1 correlation matrix for Pearson, Spearman and KendallTau correlation | 30 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 31 | | Categorical | None | TODO | 32 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 33 | | Numerical | Numerical | `scatter plot `_ with regression line | 34 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 35 | | Numerical | Categorical | TODO | 36 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 37 | | Categorical | Numerical | TODO | 38 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 39 | | Categorical | Categorical | TODO | 40 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 41 | 42 | In the following, we use several examples to demonstrate the functionalities. 43 | 44 | 45 | Loading dataset 46 | =============== 47 | We support two types of dataframe: pandas dataframe and dask dataframe. Here we load the well-known `wine quality` dataset into a pandas dataframe and use it to demonstrate our functionality:: 48 | 49 | import pandas as pd 50 | df = pd.read_csv("https://www.openml.org/data/get_csv/4965268/wine-quality-red.arff") 51 | 52 | 53 | Plotting correlation matrix via `plot_correlation(df)` 54 | ====================================================== 55 | 56 | After getting a dataset, we can plot the correlation matrix of all columns by calling `plot_correlation(df)`. We will compute three types of correlations (`Pearson `_, `Spearman `_ and `KendallTau `_) and generate a correlation matrix for each of them. In the matrix, each cell represents the correlation value of two columns. The following shows an example:: 57 | 58 | from dataprep.eda import plot_correlation 59 | plot_correlation(df) 60 | 61 | 62 | .. raw:: html 63 | 64 | 65 | 66 | 67 | Finding the most correlated columns via `plot_correlation(df, x)` 68 | ================================================================= 69 | 70 | After getting the correlation matrix, the user may zoom into a column and explore how other columns correlate with it. To achieve this goal, we provide `plot_correlation(df, x)`. 
It computes the correlations (`Pearson `_, `Spearman `_ and `KendallTau `_) of the column of interest x with all other columns and sorts them based on the correlation values. This way, the user can see which columns are most correlated or uncorrelated with column x. The following shows an example:: 71 | 72 | plot_correlation(df, "alcohol") 73 | 74 | .. raw:: html 75 | 76 | 77 | 78 | 79 | Exploring the correlation between two columns via `plot_correlation(df, x, y)` 80 | =============================================================================== 81 | 82 | Furthermore, we provide `plot_correlation(df, x, y)` to allow the user to analyze the correlation between two columns. It plots a scatter plot of columns x and y, along with a regression line. The following shows an example:: 83 | 84 | plot_correlation(df, "alcohol", "pH") 85 | 86 | .. raw:: html 87 | 88 | 89 | 90 | 91 | In addition, when the user passes the parameter k, it identifies the k points that have the largest impact on the correlation value. Impact means that after removing the k points, the correlation value will increase the most (positive influence) or decrease the most (negative influence). The following shows an example:: 92 | 93 | plot_correlation(df, "alcohol", "pH", k = 2) 94 | 95 | .. raw:: html 96 | 97 | 98 | 99 | 100 | Filtering the result by top-k and value range filter 101 | ==================================================== 102 | 103 | We provide two types of filters to filter the result: top-k and value range. They can be applied to `plot_correlation(df)` and `plot_correlation(df, x)` by passing the parameters `k` and `value_range`. After applying the top-k filter, only the top-k correlation values will be shown. With the value range filter, only the correlation values in a given range will be shown. 104 | 105 | The following shows an example of applying the top-k filter in `plot_correlation(df)`:: 106 | 107 | plot_correlation(df, k = 3) 108 | 109 | .. raw:: html 110 | 111 | 112 | 113 | 114 | The following shows an example of applying the value range filter in `plot_correlation(df, x)`:: 115 | 116 | plot_correlation(df, "alcohol", value_range=[0.1, 1]) 117 | 118 | .. raw:: html 119 | 120 | 121 | -------------------------------------------------------------------------------- /docs/source/DC_Yelp_tut.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================================== 3 | Tutorial - Data Connector for Yelp 4 | ================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | data_connector is a component in the dataprep library that aims to simplify data access by providing a standard set of APIs. 13 | The goal is to help users skip complex API configuration. In this tutorial, we demonstrate how to use the 14 | data_connector library with Yelp. 15 | 16 | 17 | Preprocessing 18 | ================ 19 | If you haven't installed dataprep, run the command ``pip install dataprep`` or execute the following cell. 20 | 21 | :: 22 | 23 | !pip install dataprep 24 | 25 | Obtaining an access token from Yelp 26 | ============================================= 27 | To connect to Yelp, you need to generate a token. This token is a unique identifier of an application requesting access to 28 | Yelp's API. Once an application creates the token, it will act as your credential when making an API request.
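Because the token acts as your credential, it is best kept out of your source code. Below is a minimal sketch of one way to do this; the environment variable name ``YELP_API_KEY`` is just an example for this sketch, not part of dataprep.

::

    import os

    # Read the API key from an environment variable instead of hard-coding it.
    access_token = os.environ["YELP_API_KEY"]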
29 | 30 | To receive an access token, the user needs to create a server-side application from Yelp. This can be done by 31 | visiting the `Yelp API documentation 32 | `_, entering some information about its use, and generating a key. 33 | 34 | .. image:: _static/images/tutorial/Yelp_authentication.png 35 | :align: center 36 | :width: 700 37 | :height: 500 38 | 39 | Store the token or API Key in a secure location as it will be used to provide you access to Yelp's restaurant data. 40 | 41 | .. image:: _static/images/tutorial/Yelp_API_Key.png 42 | :align: center 43 | :width: 700 44 | :height: 400 45 | 46 | Download and store the configuration files in dataprep 47 | ================================================================ 48 | The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: `Configuration Files 49 | `_ or automatically downloaded at usage. 50 | 51 | 52 | 53 | To download the configuration files manually, click on the clipboard button, ensuring you are cloning with HTTPS. Go into your terminal and find an appropriate location to store the configuration files. 54 | Once you have decided on a location, enter the command ``git clone https://github.com/sfu-db/DataConnectorConfigs.git``. This will clone the git repository to the desired location; as a suggestion, store it with the dataprep folder. 55 | 56 | 57 | From here you can proceed with the next steps. 58 | 59 | .. image:: _static/images/tutorial/dc_git.png 60 | :align: center 61 | :width: 1000 62 | :height: 500 63 | 64 | 65 | .. image:: _static/images/tutorial/dc_git_clone.png 66 | :align: center 67 | :width: 725 68 | :height: 125 69 | 70 | 71 | Below, the configuration files are stored with dataprep. 72 | 73 | .. image:: _static/images/tutorial/Config_destination.png 74 | :align: center 75 | :width: 586 76 | :height: 132 77 | 78 | 79 | 80 | Initialize data_connector 81 | ============================= 82 | To initialize, run the following code. Copy and paste the Yelp API key into the **access_token** variable and ensure the connector path is correct. Once you have that running, you can use the built-in functions available in the connector. 83 | 84 | :: 85 | 86 | from dataprep.data_connector import Connector 87 | access_token = "insert_token_key" 88 | dc = Connector("./DataConnectorConfigs/yelp", _auth={"access_token":access_token}) 89 | 90 | Functionalities 91 | =================== 92 | Data connector has several functions you can perform to gain insight into the data downloaded from Yelp. 93 | 94 | Connector.info 95 | ------------------ 96 | | The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples. 97 | | 98 | | a. Table - The table(s) being accessed. 99 | | b. Parameters - Identifies which parameters can be used to call the method. 100 | | c. Examples - Shows how you can call the methods in the Connector class. 101 | 102 | :: 103 | 104 | dc.info() 105 | 106 | .. image:: _static/images/tutorial/dc_show.png 107 | :align: center 108 | :width: 400 109 | :height: 165 110 | 111 | Parameters 112 | ********************** 113 | | A parameter is a piece of information you supply to a query right as you run it. The parameters for Yelp's business query can either be required or optional. The required parameters are **term** and **location**, while the optional parameters are **latitude**, **longitude** and **limit**. The parameters are described below. 
114 | | 115 | | a. **term** - Required - Search term, for example "food" or "restaurants". The term may also be business names, such as "Starbucks". 116 | | b. **location** - Required - The geographic area to search for businesses, for example "Seattle". 117 | | c. **latitude** - Optional - Latitude of the location you want to search nearby. 118 | | d. **longitude** - Optional - Longitude of the location you want to search nearby. 119 | | e. **limit** - Optional - Number of business results to return. By default, it will return 20. Maximum is 50. 120 | 121 | There are additional parameters to query with Yelp. If you are interested in reading up on the other available parameters and setting up your own config files, please read this `Yelp link 122 | `_ and this `Configuration Files link 123 | `_. 124 | 125 | Connector.show_schema 126 | ----------------------------- 127 | The show_schema method returns the schema of the website data in a Dataframe format. There are two columns in the response: the first is the column name and the second is the datatype. 128 | As an example, let's see what is in the business table. 129 | 130 | :: 131 | 132 | dc.show_schema("business") 133 | 134 | .. image:: _static/images/tutorial/dc_schema.png 135 | :align: center 136 | :width: 202 137 | :height: 404 138 | 139 | Connector.query 140 | ------------------ 141 | The query method downloads the website data. The parameters must meet the requirements as indicated in connector.info for the operation to run. 142 | 143 | When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations. 144 | 145 | As an example, let's try to get the data from the "business" table, providing the term "city" and location "seattle". 146 | 147 | :: 148 | 149 | dc.query("businesses", term = "city", location = "seattle", limit = 10) 150 | 151 | .. image:: _static/images/tutorial/dc_yelp_query.png 152 | :align: center 153 | :width: 1000 154 | :height: 460 155 | 156 | From the query results, you can see how easy it is to download the restaurant data from Yelp into a pandas Dataframe. 157 | Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code. 158 | 159 | :: 160 | 161 | dc = Connector(...) 162 | dc.query(...) 163 | 164 | Pagination 165 | =================== 166 | | Another feature available in the config files is pagination. Pagination is the process of dividing results into discrete pages and allowing visitors to switch between them. It lets you control the total number of records a query returns. 167 | | 168 | | To use pagination, you need to include **_count** in your query. The **_count** parameter represents the number of records a user would like to return, which can be larger than the maximum number of records the API itself returns per call. Users can still fetch multiple pages of records by using parameters like limit and offset; however, this requires users to understand how pagination works across different websites' APIs. 169 | | 170 | 171 | :: 172 | 173 | dc.query("business", term = "city", location = "seattle", _count = 200) 174 | 175 | .. image:: _static/images/tutorial/dc_yelp_query_pag.png 176 | :align: center 177 | :width: 1000 178 | :height: 500 179 | 180 | Pagination does not work concurrently with the **limit** parameter in a query; you need to select either **limit** or **_count**. 181 | 182 | 183 | That's all for now. 
184 | =================== 185 | Please visit the other tutorials that are available if you are interested in setting up a different data connector. 186 | If you are interested in writing your own configuration file or modifying an existing one, refer to the `Configuration Files 187 | `_. 188 | 189 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot_correlation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing the plot_correlation(df, x, y) function. 3 | """ 4 | import random 5 | from time import time 6 | 7 | import dask.array as da 8 | import dask.dataframe as dd 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | 13 | from ...eda.correlation import compute_correlation, plot_correlation 14 | from ...eda.correlation.compute import ( 15 | kendall_tau_1xn, 16 | kendall_tau_nxn, 17 | pearson_1xn, 18 | pearson_nxn, 19 | spearman_1xn, 20 | spearman_nxn, 21 | ) 22 | from ...eda.utils import to_dask 23 | 24 | 25 | @pytest.fixture(scope="module") # type: ignore 26 | def simpledf() -> dd.DataFrame: 27 | df = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"]) 28 | df = pd.concat([df, pd.Series(["a"] * 100)], axis=1) 29 | df.columns = ["a", "b", "c", "d"] 30 | df = to_dask(df) 31 | 32 | return df 33 | 34 | 35 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 36 | compute_correlation(simpledf) 37 | plot_correlation(simpledf) 38 | 39 | 40 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 41 | compute_correlation(simpledf, k=1) 42 | plot_correlation(simpledf, k=1) 43 | 44 | 45 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 46 | compute_correlation(simpledf, x="a") 47 | plot_correlation(simpledf, x="a") 48 | 49 | 50 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 51 | compute_correlation(simpledf, x="a", value_range=(0.5, 0.8)) 52 | plot_correlation(simpledf, x="a", value_range=(0.5, 0.8)) 53 | 54 | 55 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 56 | compute_correlation(simpledf, x="a", k=1) 57 | plot_correlation(simpledf, x="a", k=1) 58 | 59 | 60 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 61 | compute_correlation(simpledf, x="a", k=0) 62 | plot_correlation(simpledf, x="a", k=0) 63 | 64 | 65 | def test_sanity_compute_7(simpledf: dd.DataFrame) -> None: 66 | compute_correlation(simpledf, x="b", y="a") 67 | plot_correlation(simpledf, x="b", y="a") 68 | 69 | 70 | def test_sanity_compute_8(simpledf: dd.DataFrame) -> None: 71 | compute_correlation(simpledf, x="b", y="a", k=1) 72 | plot_correlation(simpledf, x="b", y="a", k=1) 73 | 74 | 75 | def test_sanity_compute_9(simpledf: dd.DataFrame) -> None: 76 | compute_correlation(simpledf, value_range=(0.3, 0.7)) 77 | plot_correlation(simpledf, value_range=(0.3, 0.7)) 78 | 79 | 80 | @pytest.mark.xfail # type: ignore 81 | def test_sanity_compute_fail_2(simpledf: dd.DataFrame) -> None: 82 | compute_correlation(simpledf, k=3, value_range=(0.3, 0.7)) 83 | plot_correlation(simpledf, k=3, value_range=(0.3, 0.7)) 84 | 85 | 86 | @pytest.mark.xfail # type: ignore 87 | def test_sanity_compute_fail_3(simpledf: dd.DataFrame) -> None: 88 | compute_correlation(simpledf, x="a", value_range=(0.5, 0.8), k=3) 89 | plot_correlation(simpledf, x="a", value_range=(0.5, 0.8), k=3) 90 | 91 | 92 | @pytest.mark.xfail # type: ignore 93 | def test_sanity_compute_fail_4(simpledf: dd.DataFrame) -> None: 94 | compute_correlation(simpledf, y="a") 95 | 
plot_correlation(simpledf, y="a") 96 | 97 | 98 | @pytest.mark.xfail # type: ignore 99 | def test_sanity_compute_fail_5(simpledf: dd.DataFrame) -> None: 100 | compute_correlation(simpledf, x="d") 101 | plot_correlation(simpledf, x="d") 102 | 103 | 104 | @pytest.mark.xfail # type: ignore 105 | def test_sanity_compute_fail_6(simpledf: dd.DataFrame) -> None: 106 | compute_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8)) 107 | plot_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8)) 108 | 109 | 110 | @pytest.mark.xfail # type: ignore 111 | def test_sanity_compute_fail_7(simpledf: dd.DataFrame) -> None: 112 | compute_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8), k=3) 113 | plot_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8), k=3) 114 | 115 | 116 | def test_compute_pearson() -> None: 117 | array = np.random.rand(100, 10) 118 | darray = da.from_array(array) 119 | a = pearson_nxn(darray).compute() 120 | b = pd.DataFrame(data=array).corr("pearson").values 121 | assert np.isclose(a, b).all() 122 | 123 | for i in range(array.shape[1]): 124 | _, a = pearson_1xn(darray[:, i], darray) 125 | assert np.isclose(a, np.sort(b[:, i])).all() 126 | 127 | 128 | def test_compute_spearman() -> None: 129 | array = np.random.rand(100, 10) 130 | darray = da.from_array(array) 131 | a = spearman_nxn(darray).compute() 132 | b = pd.DataFrame(data=array).corr("spearman").values 133 | assert np.isclose(a, b).all() 134 | 135 | for i in range(array.shape[1]): 136 | _, a = spearman_1xn(darray[:, i], darray) 137 | assert np.isclose(a, np.sort(b[:, i])).all() 138 | 139 | 140 | def test_compute_kendall() -> None: 141 | array = np.random.rand(100, 10) 142 | darray = da.from_array(array) 143 | a = kendall_tau_nxn(darray).compute() 144 | b = pd.DataFrame(data=array).corr("kendall").values 145 | assert np.isclose(a, b).all() 146 | 147 | for i in range(array.shape[1]): 148 | _, a = kendall_tau_1xn(darray[:, i], darray) 149 | assert np.isclose(a, np.sort(b[:, i])).all() 150 | 151 | 152 | # def test_plot_corr_df() -> None: # pylint: disable=too-many-locals 153 | # """ 154 | # :return: 155 | # """ 156 | # data = np.random.rand(100, 20) 157 | # df_data = pd.DataFrame(data) 158 | 159 | # start_p_pd = time() 160 | # res = df_data.corr(method="pearson") 161 | # end_p_pd = time() 162 | # print("pd pearson time: ", str(end_p_pd - start_p_pd) + " s") 163 | 164 | # start_p = time() 165 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 166 | # end_p = time() 167 | # print("our pearson time: ", str(end_p - start_p) + " s") 168 | # assert np.isclose(res, intermediate.result["corr_p"]).all() 169 | 170 | # start_s_pd = time() 171 | # res = df_data.corr(method="spearman") 172 | # end_s_pd = time() 173 | # print("pd spearman time: ", str(end_s_pd - start_s_pd) + " s") 174 | 175 | # start_s = time() 176 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 177 | # end_s = time() 178 | # print("our spearman time: ", str(end_s - start_s) + " s") 179 | # assert np.isclose(res, intermediate.result["corr_s"]).all() 180 | 181 | # start_k_pd = time() 182 | # res = df_data.corr(method="kendall") 183 | # end_k_pd = time() 184 | # print("pd kendall time: ", str(end_k_pd - start_k_pd) + " s") 185 | 186 | # start_k = time() 187 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 188 | # end_k = time() 189 | # print("our kendall time: ", str(end_k - start_k) + " s") 190 | # assert np.isclose(res, intermediate.result["corr_k"]).all() 191 | 192
| 193 | # def test_plot_corr_df_k() -> None: 194 | # """ 195 | # :return: 196 | # """ 197 | # data = np.random.rand(100, 20) 198 | # df_data = pd.DataFrame(data) 199 | # k = 5 200 | # res = df_data.corr(method="pearson") 201 | # row, _ = np.shape(res) 202 | # res_re = np.reshape(np.triu(res, 1), (row * row,)) 203 | # idx = np.argsort(np.absolute(res_re)) 204 | # mask = np.zeros(shape=(row * row,)) 205 | # for i in range(k): 206 | # mask[idx[-i - 1]] = 1 207 | # res = np.multiply(res_re, mask) 208 | # res = np.reshape(res, (row, row)) 209 | # res = res.T 210 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True, k=k) 211 | # assert np.isclose(intermediate.result["corr_p"], res).all() 212 | # assert np.isclose(intermediate.result["mask_p"], mask).all() 213 | 214 | 215 | # def test_plot_corr_df_x_k() -> None: 216 | # """ 217 | # :return: 218 | # """ 219 | # df_data = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 220 | # df_data["b"] = df_data["a"] + np.random.normal(0, 10, 100) 221 | # df_data["c"] = df_data["a"] + np.random.normal(0, 10, 100) 222 | # df_data["d"] = df_data["a"] + np.random.normal(0, 10, 100) 223 | # x_name = "b" 224 | # k = 3 225 | # name_list = list(df_data.columns.values) 226 | # idx_name = name_list.index(x_name) 227 | # res_p = df_data.corr(method="pearson").values 228 | # res_p[idx_name][idx_name] = -1 229 | # res_s = df_data.corr(method="spearman").values 230 | # res_s[idx_name][idx_name] = -1 231 | # res_k = df_data.corr(method="kendall").values 232 | # res_k[idx_name][idx_name] = -1 233 | # _, _ = plot_correlation(df=df_data, x=x_name, return_intermediate=True, k=k) 234 | 235 | 236 | # def test_plot_corr_df_x_y_k() -> None: 237 | # """ 238 | # :return: 239 | # """ 240 | # df_data = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 241 | # df_data["b"] = df_data["a"] + np.random.normal(0, 10, 100) 242 | # df_data["c"] = df_data["a"] + np.random.normal(0, 10, 100) 243 | # df_data["d"] = df_data["a"] + np.random.normal(0, 10, 100) 244 | # x_name = "b" 245 | # y_name = "c" 246 | # k = 3 247 | # _ = plot_correlation( 248 | # df=df_data, x=x_name, y=y_name, return_intermediate=False, k=k, 249 | # ) 250 | 251 | # letters = ["a", "b", "c"] 252 | # df_data_cat = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 253 | # df_data_cat["b"] = pd.Categorical([random.choice(letters) for _ in range(100)]) 254 | # df_data_cat["c"] = pd.Categorical([random.choice(letters) for _ in range(100)]) 255 | # _, intermediate = plot_correlation( 256 | # df=df_data_cat, x="b", y="c", return_intermediate=True 257 | # ) 258 | # assert np.isclose( 259 | # pd.crosstab(df_data_cat["b"], df_data_cat["c"]).values, 260 | # intermediate.result["cross_table"], 261 | # ).all() 262 | -------------------------------------------------------------------------------- /dataprep/eda/correlation/render.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the visualization for the 3 | plot_correlation(df) function 4 | """ 5 | import math 6 | from typing import List, Optional, Sequence, Tuple 7 | 8 | import numpy as np 9 | from bokeh.models import ( 10 | BasicTicker, 11 | CategoricalColorMapper, 12 | ColorBar, 13 | FactorRange, 14 | HoverTool, 15 | Legend, 16 | LegendItem, 17 | LinearColorMapper, 18 | PrintfTickFormatter, 19 | ) 20 | from bokeh.models.annotations import Title 21 | from bokeh.models.widgets import Panel, Tabs 22 | from bokeh.plotting import Figure, figure 23 | 24 | from ..intermediate import Intermediate 25
| from ..palette import BIPALETTE, BRG 26 | 27 | __all__ = ["render_correlation"] 28 | 29 | 30 | def render_correlation( 31 | itmdt: Intermediate, 32 | plot_width: int = 500, 33 | plot_height: int = 500, 34 | palette: Optional[Sequence[str]] = None, 35 | ) -> Figure: 36 | """ 37 | Render a correlation plot 38 | 39 | Parameters 40 | ---------- 41 | itmdt 42 | plot_width 43 | The width of the plot 44 | plot_height 45 | The height of the plot 46 | palette 47 | The palette to use. By default (None), 48 | the palette will be automatically chosen based on different visualization types. 49 | 50 | Returns 51 | ------- 52 | Figure 53 | The bokeh Figure instance. 54 | """ 55 | if itmdt.visual_type is None: 56 | visual_elem = Figure() 57 | elif itmdt.visual_type == "correlation_heatmaps": 58 | visual_elem = render_correlation_heatmaps( 59 | itmdt, plot_width, plot_height, palette or BIPALETTE 60 | ) 61 | elif itmdt.visual_type == "correlation_single_heatmaps": 62 | visual_elem = render_correlation_single_heatmaps( 63 | itmdt, plot_width, plot_height, palette or BIPALETTE 64 | ) 65 | elif itmdt.visual_type == "correlation_scatter": 66 | visual_elem = render_scatter(itmdt, plot_width, plot_height, palette or BRG) 67 | else: 68 | raise NotImplementedError(f"Unknown visual type {itmdt.visual_type}") 69 | 70 | return visual_elem 71 | 72 | 73 | # def _vis_cross_table(intermediate: Intermediate, params: Dict[str, Any]) -> Figure: 74 | # """ 75 | # :param intermediate: An object to encapsulate the 76 | # intermediate results. 77 | # :return: A figure object 78 | # """ 79 | # result = intermediate.result 80 | # hv.extension("bokeh", logo=False) 81 | # cross_matrix = result["cross_table"] 82 | # x_cat_list = result["x_cat_list"] 83 | # y_cat_list = result["y_cat_list"] 84 | # data = [] 85 | # for i, _ in enumerate(x_cat_list): 86 | # for j, _ in enumerate(y_cat_list): 87 | # data.append((x_cat_list[i], y_cat_list[j], cross_matrix[i, j])) 88 | # tooltips = [("z", "@z")] 89 | # hover = HoverTool(tooltips=tooltips) 90 | # heatmap = hv.HeatMap(data) 91 | # heatmap.opts( 92 | # tools=[hover], 93 | # colorbar=True, 94 | # width=params["width"], 95 | # toolbar="above", 96 | # title="cross_table", 97 | # ) 98 | # fig = hv.render(heatmap, backend="bokeh") 99 | # _discard_unused_visual_elems(fig) 100 | # return fig 101 | 102 | ########## HeatMaps ########## 103 | def tweak_figure(fig: Figure) -> None: 104 | """ 105 | Set some common attributes for a figure 106 | """ 107 | fig.grid.grid_line_color = None 108 | fig.axis.axis_line_color = None 109 | fig.axis.major_tick_line_color = None 110 | fig.axis.major_label_text_font_size = "9pt" 111 | fig.axis.major_label_standoff = 0 112 | fig.xaxis.major_label_orientation = math.pi / 3 113 | 114 | 115 | def render_correlation_heatmaps( 116 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 117 | ) -> Tabs: 118 | """ 119 | Render correlation heatmaps into tabs 120 | """ 121 | tabs: List[Panel] = [] 122 | tooltips = [("x", "@x"), ("y", "@y"), ("correlation", "@correlation{1.11}")] 123 | axis_range = itmdt["axis_range"] 124 | 125 | for method, df in itmdt["data"].items(): 126 | # in case of numerical column names 127 | df = df.copy() 128 | df["x"] = df["x"].apply(str) 129 | df["y"] = df["y"].apply(str) 130 | 131 | mapper, color_bar = create_color_mapper(palette) 132 | x_range = FactorRange(*axis_range) 133 | y_range = FactorRange(*reversed(axis_range)) 134 | fig = Figure( 135 | x_range=x_range, 136 | y_range=y_range, 137 | plot_width=plot_width, 138
plot_height=plot_height, 139 | x_axis_location="below", 140 | tools="hover", 141 | toolbar_location=None, 142 | tooltips=tooltips, 143 | background_fill_color="#fafafa", 144 | ) 145 | 146 | tweak_figure(fig) 147 | 148 | fig.rect( 149 | x="x", 150 | y="y", 151 | width=1, 152 | height=1, 153 | source=df, 154 | fill_color={"field": "correlation", "transform": mapper}, 155 | line_color=None, 156 | ) 157 | 158 | fig.add_layout(color_bar, "right") 159 | 160 | tab = Panel(child=fig, title=method) 161 | tabs.append(tab) 162 | 163 | tabs = Tabs(tabs=tabs) 164 | return tabs 165 | 166 | 167 | def render_correlation_single_heatmaps( 168 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 169 | ) -> Tabs: 170 | """ 171 | Render correlation heatmaps, but with a single column 172 | """ 173 | tabs: List[Panel] = [] 174 | tooltips = [("y", "@y"), ("correlation", "@correlation{1.11}")] 175 | 176 | for method, df in itmdt["data"].items(): 177 | mapper, color_bar = create_color_mapper(palette) 178 | 179 | x_range = FactorRange(*df["x"].unique()) 180 | y_range = FactorRange(*df["y"].unique()) 181 | fig = figure( 182 | x_range=x_range, 183 | y_range=y_range, 184 | plot_width=plot_width, 185 | plot_height=plot_height, 186 | x_axis_location="below", 187 | tools="hover", 188 | toolbar_location=None, 189 | tooltips=tooltips, 190 | ) 191 | 192 | tweak_figure(fig) 193 | 194 | fig.rect( 195 | x="x", 196 | y="y", 197 | width=1, 198 | height=1, 199 | source=df, 200 | fill_color={"field": "correlation", "transform": mapper}, 201 | line_color=None, 202 | ) 203 | 204 | fig.add_layout(color_bar, "right") 205 | 206 | tab = Panel(child=fig, title=method) 207 | tabs.append(tab) 208 | 209 | tabs = Tabs(tabs=tabs) 210 | return tabs 211 | 212 | 213 | def create_color_mapper(palette: Sequence[str]) -> Tuple[LinearColorMapper, ColorBar]: 214 | """ 215 | Create a color mapper and a colorbar for heatmap 216 | """ 217 | mapper = LinearColorMapper(palette=palette, low=-1, high=1) 218 | colorbar = ColorBar( 219 | color_mapper=mapper, 220 | major_label_text_font_size="8pt", 221 | ticker=BasicTicker(), 222 | formatter=PrintfTickFormatter(format="%.2f"), 223 | label_standoff=6, 224 | border_line_color=None, 225 | location=(0, 0), 226 | ) 227 | return mapper, colorbar 228 | 229 | 230 | ######### Scatter ######### 231 | def render_scatter( 232 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 233 | ) -> Figure: 234 | """ 235 | Render a scatter plot with a regression line and, possibly, the most influential points 236 | """ 237 | 238 | # pylint: disable=too-many-locals 239 | 240 | df = itmdt["data"] 241 | xcol, ycol, *maybe_label = df.columns 242 | 243 | tooltips = [(xcol, f"@{{{xcol}}}"), (ycol, f"@{{{ycol}}}")] 244 | 245 | fig = Figure( 246 | plot_width=plot_width, 247 | plot_height=plot_height, 248 | toolbar_location=None, 249 | title=Title(text="Scatter Plot & Regression", align="center"), 250 | tools=[], 251 | x_axis_label=xcol, 252 | y_axis_label=ycol, 253 | ) 254 | 255 | # Scatter 256 | scatter = fig.scatter(x=df.columns[0], y=df.columns[1], source=df) 257 | if maybe_label: 258 | assert len(maybe_label) == 1 259 | mapper = CategoricalColorMapper(factors=["=", "+", "-"], palette=palette) 260 | scatter.glyph.fill_color = {"field": maybe_label[0], "transform": mapper} 261 | scatter.glyph.line_color = {"field": maybe_label[0], "transform": mapper} 262 | 263 | # Regression line 264 | coeff_a, coeff_b = itmdt["coeffs"] 265 | line_x = np.asarray([df.iloc[:, 0].min(), df.iloc[:, 0].max()]) 266
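    # Evaluate the fitted line y = coeff_a * x + coeff_b at the two x extremes; two points are enough to draw the straight regression segment.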
| line_y = coeff_a * line_x + coeff_b 267 | fig.line(x=line_x, y=line_y, line_width=3) 268 | 269 | # The tooltips were not passed to the Figure above because we only want them on the scatter renderer 270 | hover = HoverTool(tooltips=tooltips, renderers=[scatter]) 271 | fig.add_tools(hover) 272 | 273 | # Add legends 274 | if maybe_label: 275 | nidx = df.index[df[maybe_label[0]] == "-"][0] 276 | pidx = df.index[df[maybe_label[0]] == "+"][0] 277 | 278 | legend = Legend( 279 | items=[ 280 | LegendItem( 281 | label="Most Influential (-)", renderers=[scatter], index=nidx 282 | ), 283 | LegendItem( 284 | label="Most Influential (+)", renderers=[scatter], index=pidx 285 | ), 286 | ], 287 | margin=0, 288 | padding=0, 289 | ) 290 | 291 | fig.add_layout(legend, place="right") 292 | return fig 293 | -------------------------------------------------------------------------------- /dataprep/data_connector/implicit_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module defines ImplicitDatabase and ImplicitTable, 3 | where ImplicitDatabase is a conceptual model that describes 4 | a website and ImplicitTable describes an API endpoint. 5 | """ 6 | from io import StringIO 7 | from json import load as jload 8 | from json import loads as jloads 9 | from pathlib import Path 10 | from typing import Any, Dict, List, NamedTuple, Optional, Union 11 | 12 | import jsonschema 13 | import pandas as pd 14 | from jsonpath2 import Path as JPath 15 | from lxml import etree # pytype: disable=import-error 16 | from requests import Response 17 | 18 | from ..errors import UnreachableError 19 | from .schema import CONFIG_SCHEMA 20 | from .types import Authorization, AuthorizationType, Fields, Orient 21 | 22 | _TYPE_MAPPING = { 23 | "int": int, 24 | "string": str, 25 | "float": float, 26 | "boolean": bool, 27 | } 28 | 29 | 30 | class SchemaField(NamedTuple): 31 | """ 32 | Schema of one table field 33 | """ 34 | 35 | target: str 36 | type: str 37 | description: Optional[str] 38 | 39 | 40 | class Pagination: 41 | """ 42 | Schema of Pagination field 43 | """ 44 | 45 | type: str 46 | count_key: str 47 | max_count: int 48 | anchor_key: Optional[str] 49 | cursor_id: Optional[str] 50 | cursor_key: Optional[str] 51 | 52 | def __init__(self, pdef: Dict[str, Any]) -> None: 53 | 54 | self.type = pdef["type"] 55 | self.max_count = pdef["max_count"] 56 | self.count_key = pdef["count_key"] 57 | self.anchor_key = pdef.get("anchor_key") 58 | self.cursor_id = pdef.get("cursor_id") 59 | self.cursor_key = pdef.get("cursor_key") 60 | 61 | 62 | class ImplicitTable: # pylint: disable=too-many-instance-attributes 63 | """ 64 | The ImplicitTable class abstracts the request to and the response from a RESTful API, 65 | so that the remote API can be treated as a database table.
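    A minimal usage sketch (the table name and the config dict here are hypothetical; a real config must validate against the data_connector schema.json)::

        table = ImplicitTable("search", config)  # config: Dict[str, Any] loaded from a table config file
        df = table.from_response(resp)  # resp: a requests.Response from the remote endpoint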
66 | """ 67 | 68 | name: str 69 | config: Dict[str, Any] 70 | # Request related 71 | method: str 72 | url: str 73 | authorization: Optional[Authorization] = None 74 | headers: Optional[Fields] = None 75 | params: Optional[Fields] = None 76 | body_ctype: str 77 | body: Optional[Fields] = None 78 | cookies: Optional[Fields] = None 79 | pag_params: Optional[Pagination] = None 80 | 81 | # Response related 82 | ctype: str 83 | table_path: str 84 | schema: Dict[str, SchemaField] 85 | orient: Orient 86 | 87 | def __init__(self, name: str, config: Dict[str, Any]) -> None: 88 | jsonschema.validate( 89 | config, CONFIG_SCHEMA 90 | ) # This will throw errors if validate failed 91 | self.name = name 92 | self.config = config 93 | 94 | request_def = config["request"] 95 | 96 | self.method = request_def["method"] 97 | self.url = request_def["url"] 98 | 99 | if "authorization" in request_def: 100 | auth_def = request_def["authorization"] 101 | if isinstance(auth_def, str): 102 | auth_type = AuthorizationType[auth_def] 103 | auth_params: Dict[str, str] = {} 104 | elif isinstance(auth_def, dict): 105 | auth_type = AuthorizationType[auth_def.pop("type")] 106 | auth_params = {**auth_def} 107 | else: 108 | raise NotImplementedError 109 | self.authorization = Authorization(auth_type=auth_type, params=auth_params) 110 | 111 | if "pagination" in request_def: 112 | self.pag_params = Pagination(request_def["pagination"]) 113 | 114 | for key in ["headers", "params", "cookies"]: 115 | if key in request_def: 116 | setattr(self, key, Fields(request_def[key])) 117 | 118 | if "body" in request_def: 119 | body_def = request_def["body"] 120 | self.body_ctype = body_def["ctype"] 121 | self.body = Fields(body_def["content"]) 122 | 123 | response_def = config["response"] 124 | self.ctype = response_def["ctype"] 125 | self.table_path = response_def["tablePath"] 126 | self.schema = { 127 | name: SchemaField(def_["target"], def_["type"], def_.get("description")) 128 | for name, def_ in response_def["schema"].items() 129 | } 130 | self.orient = Orient(response_def["orient"]) 131 | 132 | def from_response(self, resp: Response) -> pd.DataFrame: 133 | """ 134 | Create a dataframe from a http response. 135 | """ 136 | if self.ctype == "application/json": 137 | rows = self.from_json(resp.text) 138 | elif self.ctype == "application/xml": 139 | rows = self.from_xml(resp.text) 140 | else: 141 | raise UnreachableError 142 | 143 | return pd.DataFrame(rows) 144 | 145 | def from_json(self, data: str) -> Dict[str, List[Any]]: 146 | """ 147 | Create rows from json string. 148 | """ 149 | data = jloads(data) 150 | table_data = {} 151 | root = self.table_path 152 | 153 | if self.orient == Orient.Records: 154 | data_rows = [ 155 | row_node.current_value for row_node in JPath.parse_str(root).match(data) 156 | ] 157 | 158 | for column_name, column_def in self.schema.items(): 159 | column_target = column_def.target 160 | column_type = column_def.type 161 | 162 | target_matcher = JPath.parse_str(column_target) 163 | 164 | col: List[Any] = [] 165 | for data_row in data_rows: 166 | maybe_cell_value = [ 167 | m.current_value for m in target_matcher.match(data_row) 168 | ] 169 | 170 | if not maybe_cell_value: # If no match 171 | col.append(None) 172 | elif len(maybe_cell_value) == 1 and column_type != "object": 173 | (cell_value,) = maybe_cell_value 174 | if cell_value is not None: 175 | # Even we have value matched, 176 | # the value might be None so we don't do type conversion. 
| 177 | cell_value = _TYPE_MAPPING[column_type](cell_value) 178 | col.append(cell_value) 179 | else: 180 | assert ( 181 | column_type == "object" 182 | ), f"{column_name}: {maybe_cell_value} is not {column_type}" 183 | col.append(maybe_cell_value) 184 | 185 | table_data[column_name] = col 186 | else: 187 | # TODO: split orient 188 | raise NotImplementedError 189 | 190 | return table_data 191 | 192 | def from_xml(self, data: str) -> Dict[str, List[Any]]: 193 | """ 194 | Create rows from an XML string. 195 | """ 196 | table_data = {} 197 | 198 | data = data.replace('<?xml version="1.0" encoding="UTF-8"?>', "") # strip the XML declaration: lxml cannot parse str input that carries an encoding declaration 199 | 200 | root = etree.parse(StringIO(data)) 201 | data_rows = root.xpath(self.table_path) 202 | 203 | if self.orient.value == Orient.Records.value: 204 | for column_name, column_def in self.schema.items(): 205 | column_target = column_def.target 206 | column_type = column_def.type 207 | 208 | col: List[Any] = [] 209 | for data_row in data_rows: 210 | maybe_cell_value = data_row.xpath(column_target) 211 | 212 | if not maybe_cell_value: 213 | col.append(None) 214 | elif len(maybe_cell_value) == 1 and column_type != "object": 215 | (cell_value,) = maybe_cell_value 216 | if cell_value is not None: 217 | # Even when a value is matched, 218 | # it might be None, in which case we skip the type conversion. 219 | cell_value = _TYPE_MAPPING[column_type](cell_value) 220 | col.append(cell_value) 221 | else: 222 | assert ( 223 | column_type == "object" 224 | ), f"{column_name}: {maybe_cell_value} is not {column_type}" 225 | col.append(maybe_cell_value) 226 | 227 | table_data[column_name] = col 228 | else: 229 | # TODO: split orient 230 | raise NotImplementedError 231 | 232 | return table_data 233 | 234 | 235 | class ImplicitDatabase: 236 | """ 237 | A website that provides data can be treated as a database, represented 238 | as ImplicitDatabase in DataConnector. 239 | """ 240 | 241 | name: str 242 | tables: Dict[str, ImplicitTable] 243 | 244 | def __init__(self, config_path: Union[str, Path]) -> None: 245 | path = Path(config_path) 246 | 247 | self.name = path.name 248 | self.tables = {} 249 | 250 | for table_config_path in path.iterdir(): 251 | if not table_config_path.is_file(): 252 | # ignore configs that are not files 253 | continue 254 | if table_config_path.name == "_meta.json": 255 | # ignore the meta file 256 | continue 257 | if table_config_path.suffix != ".json": 258 | # ignore non-json files 259 | continue 260 | 261 | with open(table_config_path) as f: 262 | table_config = jload(f) 263 | 264 | table = ImplicitTable(table_config_path.stem, table_config) 265 | if table.name in self.tables: 266 | raise RuntimeError(f"Duplicated table name {table.name}") 267 | self.tables[table.name] = table 268 | -------------------------------------------------------------------------------- /docs/source/eda/plot.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================================================== 3 | `plot`: analyzing basic characteristics of a dataset 4 | ==================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | The goal of `plot` is to explore the basic characteristics of a dataset. It generates different plots to reveal the characteristics of the columns of interest. It mainly provides the following functionalities: 13 | 14 | 1. plot(df): plot basic characteristics (the histogram and the bar chart) for all columns. 15 | 2. plot(df, x): zoom into column x and plot more refined characteristics. 16 | 3. 
plot(df, x, y): zoom into column x and column y, and plot more refined characteristics to explore their relationship. 17 | 18 | 19 | The plots generated by the `plot` function differ for numerical and categorical columns. The following table summarizes the output plots for different settings of x and y. 20 | 21 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 22 | | | **plot(df,x,y)** | | 23 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 24 | | **x** | **y** | **output plots** | 25 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 26 | | None | None | `histogram `_ or `bar chart `_ for each column | 27 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 28 | | Numerical | None | `histogram `_, `kde plot `_, `box plot `_, `qq-norm plot `_ | 29 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 30 | | Categorical | None | `bar chart `_, `pie chart `_ | 31 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 32 | | Numerical | Numerical | `scatter plot `_, `hexbin plot `_, `box plot `_ | 33 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 34 | | Numerical | Categorical | `box plot `_, `line plot `_ | 35 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 36 | | Categorical | Numerical | `box plot `_, `line plot `_ | 37 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 38 | | Categorical | Categorical | `nested bar chart `_, `stacked bar chart `_, `heat map `_ | 39 | 
+-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 40 | 41 | Next, we use several examples to demonstrate these functionalities. 42 | 43 | 44 | Loading dataset 45 | =============== 46 | We support two types of dataframes: pandas and dask. Here we load the well-known `adult` dataset into a pandas dataframe and use it to demonstrate our functionality:: 47 | 48 | import pandas as pd 49 | df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/phpMawTba", na_values = [' ?']) 50 | 51 | Basic exploration for all columns via `plot(df)` 52 | ================================================ 53 | 54 | After loading a dataset, we can do a rough exploration by calling `plot(df)`. It plots a histogram for each numerical column and a bar chart for each categorical column. The number of bins shown (for histograms) and the number of categories shown (for bar charts) are both customizable. Additionally, if a column contains missing values, they are ignored when generating the plot, but the percentage of missing values is shown in the title. The following shows an example of `plot(df)`:: 55 | 56 | from dataprep.eda import plot 57 | plot(df) 58 | 59 | 60 | .. raw:: html 61 | 62 | 63 | 64 | 65 | Zooming into a column via `plot(df, x)` 66 | ======================================= 67 | 68 | After getting the basic information of the dataset, we can zoom into a column of interest and explore it further by calling `plot(df, x)`, where x is that column. The output of `plot(df, x)` differs for numerical and categorical columns. 69 | 70 | When x is a numerical column, it plots a histogram, kde plot, box plot and qq-norm plot. The following shows an example:: 71 | 72 | plot(df, "age") 73 | 74 | .. raw:: html 75 | 76 | 77 | 78 | 79 | When x is a categorical column, it plots a bar chart and a pie chart. The following shows an example:: 80 | 81 | plot(df, "education") 82 | 83 | .. raw:: html 84 | 85 | 86 | 87 | 88 | Zooming into two columns via `plot(df, x, y)` 89 | ============================================= 90 | 91 | Furthermore, we provide `plot(df, x, y)` to explore the relationship between two columns of interest, x and y. The output depends on the column types of x and y. 92 | 93 | When x and y are both numerical columns, it plots a `scatter plot `_, `hexbin plot `_ and `box plot `_. The following shows an example:: 94 | 95 | plot(df, "age", "hours-per-week") 96 | 97 | .. raw:: html 98 | 99 | 100 | 101 | 102 | When x and y are both categorical columns, it plots a `nested bar chart `_, `stacked bar chart `_ and `heat map `_. The following shows an example:: 103 | 104 | plot(df, "education", "marital-status") 105 | 106 | .. raw:: html 107 | 108 | 109 | 110 | 111 | When one of x and y is a numerical column and the other is a categorical column, it plots a `box plot `_ and a `line plot `_. The following shows an example:: 112 | 113 | plot(df, "age", "education") 114 | # or plot(df, "education", "age") 115 | 116 | ..
raw:: html 117 | 118 | -------------------------------------------------------------------------------- /dataprep/data_connector/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://example.com/root.json", 4 | "type": "object", 5 | "title": "The config for a data connector", 6 | "required": [ 7 | "version", 8 | "request", 9 | "response" 10 | ], 11 | "additionalProperties": false, 12 | "properties": { 13 | "version": { 14 | "$id": "#/properties/version", 15 | "type": "number", 16 | "title": "The Version Schema", 17 | "description": "The version number of the schema", 18 | "default": 1, 19 | "minimum": 1 20 | }, 21 | "request": { 22 | "$id": "#/properties/request", 23 | "type": "object", 24 | "title": "The Request Schema", 25 | "description": "", 26 | "required": [ 27 | "url", 28 | "method" 29 | ], 30 | "properties": { 31 | "url": { 32 | "$id": "#/properties/request/properties/url", 33 | "type": "string", 34 | "title": "The Url Schema", 35 | "description": "The Url of the API endpoint. This can also be a Jinja template", 36 | "default": "", 37 | "examples": [ 38 | "http://example.com/api" 39 | ], 40 | "format": "uri" 41 | }, 42 | "method": { 43 | "$id": "#/properties/request/properties/method", 44 | "type": "string", 45 | "title": "The Method Schema", 46 | "examples": [ 47 | "GET" 48 | ], 49 | "enum": [ 50 | "GET", 51 | "POST", 52 | "PUT" 53 | ] 54 | }, 55 | "authorization": { 56 | "$ref": "#/definitions/authorization" 57 | }, 58 | "headers": { 59 | "$ref": "#/definitions/fields" 60 | }, 61 | "params": { 62 | "$ref": "#/definitions/fields" 63 | }, 64 | "pagination": { 65 | "$id": "#/properties/request/properties/pagination", 66 | "type": "object", 67 | "properties": { 68 | "type": { 69 | "type": "string" 70 | }, 71 | "max_count": { 72 | "type": "integer" 73 | }, 74 | "anchor_key": { 75 | "type": "string", 76 | "optional": true 77 | }, 78 | "count_key": { 79 | "type": "string" 80 | }, 81 | "cursor_id": { 82 | "type": "string", 83 | "optional": true 84 | }, 85 | "cursor_key": { 86 | "type": "string", 87 | "optional": true 88 | } 89 | }, 90 | "required": [ 91 | "count_key", 92 | "type", 93 | "max_count" 94 | ], 95 | "additionalProperties": false 96 | }, 97 | "body": { 98 | "$id": "#/properties/request/properties/body", 99 | "type": "object", 100 | "title": "The Body Schema", 101 | "properties": { 102 | "ctype": { 103 | "$id": "#/properties/request/properties/body/properties/ctype", 104 | "type": "string", 105 | "title": "The content type schema", 106 | "default": "application/json", 107 | "enum": [ 108 | "application/x-www-form-urlencoded", 109 | "application/json" 110 | ] 111 | }, 112 | "content": { 113 | "$ref": "#/definitions/fields" 114 | } 115 | } 116 | }, 117 | "cookies": { 118 | "$ref": "#/definitions/fields" 119 | } 120 | }, 121 | "additionalProperties": false 122 | }, 123 | "response": { 124 | "$id": "#/properties/response", 125 | "type": "object", 126 | "title": "The Response Schema", 127 | "required": [ 128 | "ctype", 129 | "tablePath", 130 | "schema" 131 | ], 132 | "properties": { 133 | "ctype": { 134 | "$id": "#/properties/response/properties/ctype", 135 | "type": "string", 136 | "title": "The Response Content Type Schema", 137 | "default": "application/json", 138 | "enum": [ 139 | "application/x-www-form-urlencoded", 140 | "application/json", 141 | "application/xml" 142 | ] 143 | }, 144 | "tablePath": { 145 | "$id": 
"#/properties/response/properties/tablePath", 146 | "type": "string", 147 | "title": "The Path to the Table Object", 148 | "default": "" 149 | }, 150 | "schema": { 151 | "$ref": "#/definitions/schema" 152 | }, 153 | "orient": { 154 | "$id": "#/properties/response/properties/orient", 155 | "type": "string", 156 | "title": "The Orient for the Table", 157 | "default": "records", 158 | "enum": [ 159 | "split", 160 | "records" 161 | ] 162 | } 163 | }, 164 | "additionalProperties": false 165 | }, 166 | "additionalProperties": false 167 | }, 168 | "definitions": { 169 | "fields": { 170 | "$id": "#/definitions/fields", 171 | "type": "object", 172 | "title": "Spec for Fields Definition", 173 | "additionalProperties": { 174 | "oneOf": [ 175 | { 176 | "type": "string" 177 | }, 178 | { 179 | "type": "boolean" 180 | }, 181 | { 182 | "type": "object", 183 | "required": [ 184 | "required", 185 | "removeIfEmpty" 186 | ], 187 | "properties": { 188 | "required": { 189 | "type": "boolean", 190 | "default": false 191 | }, 192 | "fromKey": { 193 | "type": "string" 194 | }, 195 | "toKey": { 196 | "type": "string" 197 | }, 198 | "template": { 199 | "type": "string" 200 | }, 201 | "removeIfEmpty": { 202 | "type": "boolean", 203 | "default": false 204 | }, 205 | "additionalProperties": false 206 | } 207 | } 208 | ] 209 | } 210 | }, 211 | "authorization": { 212 | "$id": "#/definitions/authorization", 213 | "oneOf": [ 214 | { 215 | "type": "object", 216 | "required": [ 217 | "type", 218 | "grantType", 219 | "tokenServerUrl" 220 | ], 221 | "properties": { 222 | "type": { 223 | "type": "string", 224 | "enum": [ 225 | "OAuth2" 226 | ] 227 | }, 228 | "grantType": { 229 | "type": "string", 230 | "enum": [ 231 | "ClientCredentials", 232 | "AuthorizationCode" 233 | ] 234 | }, 235 | "tokenServerUrl": { 236 | "type": "string" 237 | } 238 | }, 239 | "additionalProperties": false 240 | }, 241 | { 242 | "type": "string", 243 | "enum": [ 244 | "Bearer" 245 | ] 246 | } 247 | ] 248 | }, 249 | "schema": { 250 | "$id": "#/definitions/schema", 251 | "type": "object", 252 | "title": "Spec for table definition", 253 | "additionalProperties": { 254 | "type": "object", 255 | "title": "Spec for schema item", 256 | "properties": { 257 | "target": { 258 | "type": "string" 259 | }, 260 | "type": { 261 | "type": "string", 262 | "enum": [ 263 | "string", 264 | "int", 265 | "float", 266 | "boolean", 267 | "object" 268 | ] 269 | }, 270 | "description": { 271 | "type": "string" 272 | } 273 | }, 274 | "required": [ 275 | "target", 276 | "type" 277 | ], 278 | "additionalProperties": false 279 | } 280 | } 281 | } 282 | } --------------------------------------------------------------------------------