├── dataprep ├── tests │ ├── __init__.py │ ├── data_connector │ │ ├── __init__.py │ │ └── test_integration.py │ └── eda │ │ ├── __init__.py │ │ ├── test.py │ │ ├── test_plot_missing.py │ │ ├── test_plot.py │ │ ├── test_report.py │ │ └── test_plot_correlation.py ├── eda │ ├── outlier │ │ ├── __init__.py │ │ └── computation.py │ ├── palette.py │ ├── __init__.py │ ├── missing │ │ └── __init__.py │ ├── intermediate.py │ ├── correlation │ │ ├── __init__.py │ │ └── render.py │ ├── report.py │ ├── utils.py │ ├── dtypes.py │ └── basic │ │ └── __init__.py ├── assets │ ├── ellipse.jpg │ └── english_stopwords.py ├── data_connector │ ├── __init__.py │ ├── schema.py │ ├── errors.py │ ├── config_manager.py │ ├── types.py │ ├── implicit_database.py │ └── schema.json ├── __init__.py └── errors.py ├── .coveragerc ├── poetry.toml ├── docs ├── source │ ├── case_study │ │ ├── titanic.ipynb │ │ └── house_price.ipynb │ ├── _static │ │ └── images │ │ │ ├── tutorial │ │ │ ├── .DS_Store │ │ │ ├── URI_.png │ │ │ ├── dc_git.png │ │ │ ├── App_find.png │ │ │ ├── Node_js.png │ │ │ ├── dc_query.png │ │ │ ├── dc_show.png │ │ │ ├── SFU_Spotify.png │ │ │ ├── dc_schema.png │ │ │ ├── App.js_config.png │ │ │ ├── ID_and_secret.png │ │ │ ├── Yelp_API_Key.png │ │ │ ├── dc_dblp_info.png │ │ │ ├── dc_dblp_query.png │ │ │ ├── dc_git_clone.png │ │ │ ├── dc_yelp_query.png │ │ │ ├── Spotify_git_page.png │ │ │ ├── Spotify_server.png │ │ │ ├── dc_dblp_author.png │ │ │ ├── dc_spotify_info.png │ │ │ ├── dc_spotify_query.png │ │ │ ├── Config_destination.png │ │ │ ├── Spotify_dashboard.png │ │ │ ├── dc_dblp_pagination.png │ │ │ ├── dc_yelp_query_pag.png │ │ │ ├── Yelp_authentication.png │ │ │ ├── dc_dblp_show_schema.png │ │ │ ├── dc_spotify_query_pag.png │ │ │ ├── Spotify_authentication.png │ │ │ └── dc_spotify_show_schema.png │ │ │ ├── data_connector │ │ │ ├── info.png │ │ │ ├── query.png │ │ │ └── show_schema.png │ │ │ └── plot_missing │ │ │ └── df_x_cat.html │ ├── dataprep.rst │ ├── dataprep.eda.rst │ ├── index.rst │ ├── dataprep.data_connector.rst │ ├── conf.py │ ├── data_connector.rst │ ├── eda │ │ ├── plot_missing.rst │ │ ├── introduction.rst │ │ ├── plot_correlation.rst │ │ └── plot.rst │ ├── DC_DBLP_tut.rst │ └── DC_Yelp_tut.rst ├── Makefile └── make.bat ├── assets ├── logo.png ├── plot(df).png ├── data_connector.png ├── plot_missing(df).png ├── plot_correlation(df).png └── plot_missing(df,x).png ├── codecov.yaml ├── mypy.ini ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── pull_request_template.md ├── pytype.cfg ├── LICENSE ├── .gitignore ├── pyproject.toml ├── Justfile ├── .circleci └── config.yml ├── README.md └── examples ├── DataConnector_DBLP.ipynb ├── DataConnector_Yelp.ipynb └── EDA.ipynb /dataprep/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataprep/eda/outlier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=dataprep -------------------------------------------------------------------------------- /dataprep/tests/data_connector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /dataprep/tests/eda/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | EDA Tests 3 | """ 4 | -------------------------------------------------------------------------------- /docs/source/case_study/titanic.ipynb: -------------------------------------------------------------------------------- 1 | ../../../examples/titanic.ipynb -------------------------------------------------------------------------------- /docs/source/case_study/house_price.ipynb: -------------------------------------------------------------------------------- 1 | ../../../examples/house_price.ipynb -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/logo.png -------------------------------------------------------------------------------- /assets/plot(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot(df).png -------------------------------------------------------------------------------- /assets/data_connector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/data_connector.png -------------------------------------------------------------------------------- /assets/plot_missing(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_missing(df).png -------------------------------------------------------------------------------- /dataprep/assets/ellipse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/dataprep/assets/ellipse.jpg -------------------------------------------------------------------------------- /assets/plot_correlation(df).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_correlation(df).png -------------------------------------------------------------------------------- /assets/plot_missing(df,x).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/assets/plot_missing(df,x).png -------------------------------------------------------------------------------- /dataprep/data_connector/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DataConnector 3 | """ 4 | from .connector import Connector 5 | 6 | __all__ = ["Connector"] 7 | -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/.DS_Store -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/URI_.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/URI_.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_git.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_git.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/App_find.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/App_find.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Node_js.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Node_js.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_show.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/info.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/SFU_Spotify.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/SFU_Spotify.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/App.js_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/App.js_config.png -------------------------------------------------------------------------------- 
/docs/source/_static/images/tutorial/ID_and_secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/ID_and_secret.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Yelp_API_Key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Yelp_API_Key.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_info.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_git_clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_git_clone.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_yelp_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_yelp_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_git_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_git_page.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_server.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_author.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_author.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_info.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_query.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_query.png -------------------------------------------------------------------------------- /docs/source/_static/images/data_connector/show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/data_connector/show_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Config_destination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Config_destination.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_dashboard.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_pagination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_pagination.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_yelp_query_pag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_yelp_query_pag.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Yelp_authentication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Yelp_authentication.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_dblp_show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_dblp_show_schema.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_query_pag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_query_pag.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/Spotify_authentication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/Spotify_authentication.png -------------------------------------------------------------------------------- /docs/source/_static/images/tutorial/dc_spotify_show_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/dataprep/develop/docs/source/_static/images/tutorial/dc_spotify_show_schema.png 
-------------------------------------------------------------------------------- /docs/source/dataprep.rst: -------------------------------------------------------------------------------- 1 | dataprep package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataprep.data_connector 10 | dataprep.eda -------------------------------------------------------------------------------- /dataprep/data_connector/schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains the loaded config schema. 3 | """ 4 | from json import load as jload 5 | from pathlib import Path 6 | 7 | with open(f"{Path(__file__).parent}/schema.json", "r") as f: 8 | CONFIG_SCHEMA = jload(f) 9 | -------------------------------------------------------------------------------- /dataprep/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataprep 3 | ======== 4 | 5 | Dataprep lets you prepare your data using a single library with a few lines of code. 6 | """ 7 | import logging 8 | 9 | DEFAULT_PARTITIONS = 1 10 | 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | __version__ = "0.2.8" 14 | -------------------------------------------------------------------------------- /dataprep/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library-wide errors 3 | """ 4 | 5 | 6 | class DataprepError(Exception): 7 | """ 8 | Base exception, used library-wide 9 | """ 10 | 11 | 12 | class UnreachableError(DataprepError): 13 | """ 14 | Error indicating some path of the code is unreachable. 15 | """ 16 | -------------------------------------------------------------------------------- /dataprep/eda/palette.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines palettes used for EDA.
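PALETTE is a 20-color categorical palette (Bokeh's Category20), BIPALETTE is a reversed red-blue diverging colormap, and BRG is a small blue/red/green cycle.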
3 | """ 4 | # pylint: disable=no-name-in-module 5 | from bokeh.palettes import Category20 # type: ignore 6 | from holoviews.plotting.util import process_cmap 7 | 8 | PALETTE = Category20[20] 9 | BIPALETTE = list(reversed(process_cmap("RdBu"))) 10 | BRG = ["#1f78b4", "#d62728", "#2ca02c"] 11 | -------------------------------------------------------------------------------- /codecov.yaml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | parsers: 10 | gcov: 11 | branch_detection: 12 | conditional: yes 13 | loop: yes 14 | method: no 15 | macro: no 16 | 17 | comment: 18 | layout: "reach,diff,flags,tree" 19 | behavior: default 20 | require_changes: no -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | 2 | 3 | [mypy] 4 | ignore_missing_imports = True 5 | ignore_errors = False 6 | warn_unused_configs = True 7 | disallow_subclassing_any = True 8 | disallow_any_generics = True 9 | disallow_untyped_calls = True 10 | disallow_untyped_defs = True 11 | disallow_incomplete_defs = True 12 | check_untyped_defs = True 13 | disallow_untyped_decorators = True 14 | no_implicit_optional = True 15 | warn_redundant_casts = True 16 | warn_unused_ignores = False 17 | warn_return_any = True 18 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime as DateTime 2 | from datetime import timedelta as TimeDelta 3 | 4 | import pandas as pd 5 | 6 | from ...eda.dtypes import is_nominal, is_continuous 7 | 8 | 9 | def test_dtypes() -> None: 10 | df = pd.DataFrame(data=[["a", "c", False]], columns=["S", "C", "B"]) 11 | df["C"] = df["C"].astype("category") 12 | 13 | for col in df.columns: 14 | assert is_nominal(df[col].dtype) 15 | 16 | df = pd.DataFrame( 17 | data=[[complex(3, 1), 1, 1.1, TimeDelta(1), DateTime.now(),]], 18 | columns=["IM", "I", "F", "TD", "DT"], 19 | ) 20 | 21 | for col in df.columns: 22 | assert is_continuous(df[col].dtype) 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'triage required, type: enhancement' 6 | assignees: dovahcrow 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /dataprep/tests/data_connector/test_integration.py: -------------------------------------------------------------------------------- 1 | from ...data_connector import Connector 2 | from os import environ 3 | 4 | 5 | def test_data_connector() -> None: 6 | token = environ["DATAPREP_DATA_CONNECTOR_YELP_TOKEN"] 7 | dc = Connector("yelp", _auth={"access_token": token}) 8 | df = dc.query("businesses", term="ramen", location="vancouver") 9 | 10 | assert len(df) > 0 11 | 12 | dc.info() 13 | 14 | schema = dc.show_schema("businesses") 15 | 16 | assert len(schema) > 0 17 | 18 | df = dc.query("businesses", _count=120, term="ramen", location="vancouver") 19 | 20 | assert len(df) == 120 21 | 22 | df = dc.query("businesses", _count=10000, term="ramen", location="vancouver") 23 | 24 | assert len(df) < 1000 25 | -------------------------------------------------------------------------------- /docs/source/dataprep.eda.rst: -------------------------------------------------------------------------------- 1 | dataprep.eda package 2 | ==================== 3 | 4 | .. .. automodule:: dataprep.eda 5 | .. :noindex: 6 | 7 | Plot* functions 8 | --------------- 9 | .. autofunction:: dataprep.eda.basic.plot 10 | .. autofunction:: dataprep.eda.correlation.plot_correlation 11 | .. autofunction:: dataprep.eda.missing.plot_missing 12 | 13 | Other functions 14 | --------------- 15 | 16 | .. autofunction:: dataprep.eda.basic.compute 17 | .. autofunction:: dataprep.eda.basic.render 18 | .. autofunction:: dataprep.eda.correlation.compute.compute_correlation 19 | .. autofunction:: dataprep.eda.correlation.render.render_correlation 20 | .. autofunction:: dataprep.eda.missing.compute.compute_missing 21 | .. autofunction:: dataprep.eda.missing.render.render_missing 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pytype.cfg: -------------------------------------------------------------------------------- 1 | # NOTE: All relative paths are relative to the location of this file. 2 | 3 | [pytype] 4 | 5 | # Space-separated list of files or directories to exclude. 6 | exclude = 7 | **/*_test.py 8 | **/test_*.py 9 | 10 | # Space-separated list of files or directories to process. 11 | inputs = 12 | . 13 | 14 | # Keep going past errors to analyze as many files as possible. 15 | keep_going = False 16 | 17 | # All pytype output goes here. 18 | output = .pytype 19 | 20 | # Paths to source code directories, separated by ':'. 21 | pythonpath = 22 | . 23 | 24 | # Python version (major.minor) of the target code. 25 | python_version = 3.7 26 | 27 | # Comma separated list of error names to ignore. 28 | disable = 29 | pyi-error 30 | 31 | # Don't report errors. 32 | report_errors = True 33 | 34 | # Experimental: solve unknown types to label with structural types. 35 | protocols = False 36 | 37 | # Experimental: Only load submodules that are explicitly imported. 38 | strict_import = False 39 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. dataprep documentation master file, created by 2 | sphinx-quickstart on Wed Nov 6 13:56:43 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to dataprep's documentation! 7 | ==================================== 8 | 9 | EDA 10 | --- 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | eda/introduction 15 | eda/plot 16 | eda/plot_correlation 17 | eda/plot_missing 18 | 19 | Data Connector 20 | -------------- 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | data_connector 25 | DC_DBLP_tut 26 | DC_Yelp_tut 27 | DC_Spotify_tut 28 | 29 | Case Study 30 | ---------- 31 | .. toctree:: 32 | :maxdepth: 2 33 | 34 | case_study/titanic.ipynb 35 | case_study/house_price.ipynb 36 | 37 | API Documentation 38 | ----------------- 39 | 40 | .. 
toctree:: 41 | :maxdepth: 2 42 | 43 | dataprep 44 | 45 | Indices and tables 46 | ================== 47 | 48 | * :ref:`genindex` 49 | * :ref:`modindex` 50 | * :ref:`search` 51 | -------------------------------------------------------------------------------- /dataprep/eda/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | dataprep.eda 3 | ============ 4 | """ 5 | import tempfile 6 | 7 | from bokeh.io import output_file, output_notebook 8 | from .basic import compute, plot, render 9 | from .correlation import compute_correlation, plot_correlation, render_correlation 10 | from .missing import compute_missing, plot_missing, render_missing 11 | from .utils import is_notebook 12 | from .dtypes import ( 13 | DType, 14 | Categorical, 15 | Nominal, 16 | Ordinal, 17 | Numerical, 18 | Continuous, 19 | Discrete, 20 | DateTime, 21 | Text, 22 | ) 23 | 24 | __all__ = [ 25 | "plot_correlation", 26 | "compute_correlation", 27 | "render_correlation", 28 | "compute_missing", 29 | "render_missing", 30 | "plot_missing", 31 | "plot", 32 | "compute", 33 | "render", 34 | "DType", 35 | "Categorical", 36 | "Nominal", 37 | "Ordinal", 38 | "Numerical", 39 | "Continuous", 40 | "Discrete", 41 | "DateTime", 42 | "Text", 43 | ] 44 | 45 | 46 | if is_notebook(): 47 | output_notebook(hide_banner=True) 48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'type: bug, triage required' 6 | assignees: dovahcrow 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | Or: 21 | 22 | ```python 23 | paste your code here 24 | ``` 25 | 26 | **Expected behavior** 27 | A clear and concise description of what you expected to happen. 28 | 29 | **Screenshots** 30 | If applicable, add screenshots to help explain your problem. 31 | 32 | **Desktop (please complete the following information):** 33 | - OS: [e.g. Windows] 34 | - Browser [e.g. chrome, safari] 35 | - Platform [Jupyter Notebook, Jupyter Lab, Google Colab, VSCode, Python script] 36 | - Platform Version [e.g. 1.0] 37 | - Python Version [e.g. 3.7.2] 38 | - Dataprep Version [e.g. 0.2.2] 39 | 40 | **Additional context** 41 | Add any other context about the problem here. 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 sfu-db 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/source/dataprep.data_connector.rst: -------------------------------------------------------------------------------- 1 | dataprep.data\_connector package 2 | ================================ 3 | 4 | .. .. automodule:: dataprep.data_connector 5 | .. :members: 6 | .. :undoc-members: 7 | .. :show-inheritance: 8 | 9 | Connector 10 | --------- 11 | 12 | .. autoclass:: dataprep.data_connector.Connector 13 | :members: 14 | :inherited-members: 15 | 16 | 17 | 18 | .. Submodules 19 | .. ---------- 20 | 21 | .. dataprep.data\_connector.connector module 22 | .. ----------------------------------------- 23 | 24 | .. .. automodule:: dataprep.data_connector.connector 25 | .. :members: 26 | .. :undoc-members: 27 | .. :show-inheritance: 28 | 29 | .. dataprep.data\_connector.schema module 30 | .. -------------------------------------- 31 | 32 | .. .. automodule:: dataprep.data_connector.schema 33 | .. :members: 34 | .. :undoc-members: 35 | .. :show-inheritance: 36 | 37 | .. dataprep.data\_connector.types module 38 | .. ------------------------------------- 39 | 40 | .. .. automodule:: dataprep.data_connector.types 41 | .. :members: 42 | .. :undoc-members: 43 | .. :show-inheritance: 44 | 45 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. 4 | 5 | # How Has This Been Tested? 6 | 7 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. 8 | 9 | # Snapshots: 10 | 11 | Include snapshots for easier review. 12 | 13 | # Checklist: 14 | 15 | - [ ] My code follows the style guidelines of this project 16 | - [ ] I have already squashed the commits and made the commit message conform to the project standard. 17 | - [ ] I have already marked the commit with "BREAKING CHANGE" or "Fixes #" if needed. 18 | - [ ] I have performed a self-review of my own code 19 | - [ ] I have commented my code, particularly in hard-to-understand areas 20 | - [ ] I have made corresponding changes to the documentation 21 | - [ ] My changes generate no new warnings 22 | - [ ] I have added tests that prove my fix is effective or that my feature works 23 | - [ ] New and existing unit tests pass locally with my changes 24 | - [ ] Any dependent changes have been merged and published in downstream modules 25 | -------------------------------------------------------------------------------- /dataprep/data_connector/errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module defines errors used in this library.
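RequestError wraps an API response whose status code is not 200, while UniversalParameterOverridden signals that a user-supplied query parameter was overridden by a universal one.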
3 | """ 4 | from ..errors import DataprepError 5 | 6 | 7 | class RequestError(DataprepError): 8 | """ 9 | An error indicating the status code of the API response 10 | is not 200. 11 | """ 12 | 13 | status_code: int 14 | message: str 15 | 16 | def __init__(self, status_code: int, message: str) -> None: 17 | """ 18 | Constructor 19 | 20 | Parameters 21 | ---------- 22 | status_code : int 23 | The http status code 24 | message : str 25 | The message from the response 26 | """ 27 | 28 | super().__init__() 29 | 30 | self.status_code = status_code 31 | self.message = message 32 | 33 | def __str__(self) -> str: 34 | return f"RequestError: status={self.status_code}, message={self.message}" 35 | 36 | 37 | class UniversalParameterOverridden(Exception): 38 | """ 39 | The parameter is overridden by the universal parameter 40 | """ 41 | 42 | param: str 43 | uparam: str 44 | 45 | def __init__(self, param: str, uparam: str) -> None: 46 | super().__init__() 47 | self.param = param 48 | self.uparam = uparam 49 | 50 | def __str__(self) -> str: 51 | return f"the parameter {self.param} is overridden by {self.uparam}" 52 | -------------------------------------------------------------------------------- /dataprep/eda/outlier/computation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module containing functions for computing outliers. 3 | """ 4 | 5 | 6 | import dask.dataframe as dd 7 | 8 | from ..intermediate import Intermediate 9 | 10 | DEFAULT_PARTITIONS = 1 11 | 12 | 13 | def _calc_num_outlier(df: dd.DataFrame, col_x: str) -> Intermediate: 14 | """ 15 | calculate outliers based on the MAD method for numerical values. 16 | :param df: the input dataframe 17 | :param col_x: the column of df (univariate outlier detection) 18 | :return: dict(index: value) of outliers 19 | """ 20 | data_df = dd.from_dask_array(df[col_x].to_dask_array(), columns=["data"]) 21 | median = data_df["data"].quantile(0.5) 22 | MAD = abs(data_df["data"] - median).quantile(0.5) # pylint: disable=invalid-name 23 | data_df["z_score"] = (0.6745 * (data_df["data"] - median)) / MAD 24 | res_df = data_df[data_df["z_score"] > 3.5].drop("z_score", axis=1) 25 | result = {"outliers_index": list(res_df["data"].index.compute())} 26 | raw_data = {"df": df, "col_x": col_x} 27 | return Intermediate(result, raw_data) 28 | 29 | 30 | def _calc_cat_outlier(df: dd.DataFrame, col_x: str, threshold: int = 1) -> Intermediate: 31 | """ 32 | calculate outliers based on the threshold for categorical values.
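A category counts as an outlier when its frequency is at most the given threshold.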
33 | :param df: the input dataframe 34 | :param col_x: the column of df (univariate outlier detection) 35 | :return: dict(index: value) of outliers 36 | """ 37 | groups = df.groupby([col_x]).size() 38 | result = {"outlier_index": list(groups[groups <= threshold].index.compute())} 39 | raw_data = {"df": df, "col_x": col_x, "threshold": threshold} 40 | return Intermediate(result, raw_data) 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pytype 107 | .pytype/ 108 | 109 | # editors 110 | .vscode 111 | .idea 112 | notebooks/ 113 | bfg.jar 114 | profiling 115 | .coverage -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot_missing.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module tests the plot_missing(df, x, y) function.
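The fixture below deliberately nulls out half of column "a" so every test has missing values to visualize.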
3 | """ 4 | import dask.dataframe as dd 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | from ...eda.dtypes import Numerical 10 | from ...eda.missing import compute_missing, render_missing 11 | from ...eda.utils import to_dask 12 | 13 | 14 | @pytest.fixture(scope="module") # type: ignore 15 | def simpledf() -> dd.DataFrame: 16 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 17 | 18 | df = pd.concat( 19 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 20 | ) 21 | 22 | df.columns = ["a", "b", "c", "d"] 23 | idx = np.arange(1000) 24 | np.random.shuffle(idx) 25 | df.iloc[idx[:500], 0] = None 26 | 27 | ddf = to_dask(df) 28 | 29 | return ddf 30 | 31 | 32 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 33 | itmdt = compute_missing(simpledf) 34 | render_missing(itmdt) 35 | 36 | 37 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 38 | itmdt = compute_missing(simpledf, x="a") 39 | render_missing(itmdt) 40 | 41 | 42 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 43 | itmdt = compute_missing(simpledf, x="d") 44 | render_missing(itmdt) 45 | 46 | 47 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 48 | itmdt = compute_missing(simpledf, x="a", y="b") 49 | render_missing(itmdt) 50 | 51 | 52 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 53 | itmdt = compute_missing(simpledf, x="a", y="d") 54 | render_missing(itmdt) 55 | 56 | 57 | def test_specify_column_type(simpledf: dd.DataFrame) -> None: 58 | itmdt = compute_missing(simpledf, x="b", dtype={"a": Numerical()}) 59 | render_missing(itmdt) 60 | 61 | 62 | @pytest.mark.xfail # type: ignore 63 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 64 | compute_missing(simpledf, y="b") 65 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | module for testing plot(df, x, y) function. 
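The fixture mixes numerical, categorical, datetime, and constant columns so that the different plot types are all exercised.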
3 | """ 4 | import logging 5 | 6 | import dask.dataframe as dd 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | 11 | from ...eda import plot 12 | from ...eda.dtypes import Nominal 13 | from ...eda.utils import to_dask 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | 17 | 18 | @pytest.fixture(scope="module") # type: ignore 19 | def simpledf() -> dd.DataFrame: 20 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 21 | 22 | df = pd.concat( 23 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 24 | ) 25 | df = pd.concat( 26 | [ 27 | df, 28 | pd.Series( 29 | np.random.choice( 30 | ["2020/03/29", "2020/01/10", "2019/11/21"], 1000, replace=True 31 | ) 32 | ), 33 | ], 34 | axis=1, 35 | ) 36 | df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1) 37 | df.columns = ["a", "b", "c", "d", "e", "f"] 38 | df["e"] = pd.to_datetime(df["e"]) 39 | 40 | idx = np.arange(1000) 41 | np.random.shuffle(idx) 42 | df.iloc[idx[:500], 0] = None 43 | 44 | ddf = to_dask(df) 45 | 46 | return ddf 47 | 48 | 49 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 50 | plot(simpledf, "a") 51 | 52 | 53 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 54 | plot(simpledf, "e") 55 | 56 | 57 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 58 | plot(simpledf) 59 | 60 | 61 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 62 | plot(simpledf, "d", "e") 63 | 64 | 65 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 66 | plot(simpledf, "a", "e") 67 | 68 | 69 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 70 | plot(simpledf, "f") 71 | 72 | 73 | def test_specify_column_type(simpledf: dd.DataFrame) -> None: 74 | plot(simpledf, dtype={"a": Nominal()}) 75 | plot(simpledf, dtype=Nominal()) 76 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_report.py: -------------------------------------------------------------------------------- 1 | """ 2 | module for testing the report saving and inline rendering of the plot functions.
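Each test saves the generated report into a temporary directory and also renders it inline via _repr_html_.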
3 | """ 4 | import logging 5 | from datetime import datetime as DateTime 6 | from tempfile import TemporaryDirectory 7 | 8 | import dask.dataframe as dd 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | 13 | from ...eda import plot, plot_correlation, plot_missing 14 | from ...eda.utils import to_dask 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | 19 | @pytest.fixture(scope="module") # type: ignore 20 | def simpledf() -> dd.DataFrame: 21 | df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"]) 22 | 23 | df = pd.concat( 24 | [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1 25 | ) 26 | df = pd.concat( 27 | [df, pd.Series(np.random.choice([list("a"), set("b"),], 1000, replace=True)),], 28 | axis=1, 29 | ) 30 | df = pd.concat( 31 | [ 32 | df, 33 | pd.Series( 34 | np.random.choice( 35 | [DateTime(6, 4, 1), pd.to_datetime("today")], 1000, replace=True 36 | ) 37 | ), 38 | ], 39 | axis=1, 40 | ) 41 | 42 | df.columns = ["a", "b", "c", "d", "e", "f"] 43 | 44 | idx = np.arange(1000) 45 | np.random.shuffle(idx) 46 | df.iloc[idx[:500], 0] = None 47 | 48 | ddf = to_dask(df) 49 | 50 | return ddf 51 | 52 | 53 | def test_plot_report(simpledf: dd.DataFrame) -> None: 54 | report = plot(simpledf) 55 | with TemporaryDirectory() as dname: 56 | report.save(filename=f"{dname}/plot_report.html") 57 | report._repr_html_() 58 | 59 | 60 | def test_plot_correlation_report(simpledf: dd.DataFrame) -> None: 61 | report = plot_correlation(simpledf) 62 | with TemporaryDirectory() as dname: 63 | report.save(filename=f"{dname}/plot_correlation_report.html") 64 | report._repr_html_() 65 | 66 | 67 | def test_plot_missing_report(simpledf: dd.DataFrame) -> None: 68 | report = plot_missing(simpledf) 69 | with TemporaryDirectory() as dname: 70 | report.save(filename=f"{dname}/plot_missing_report.html") 71 | report._repr_html_() 72 | -------------------------------------------------------------------------------- /dataprep/eda/missing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot_missing(df) function. 3 | """ 4 | 5 | from typing import Optional, Union 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute_missing 12 | from .render import render_missing 13 | from ..report import Report 14 | from ..dtypes import DTypeDef 15 | 16 | __all__ = ["render_missing", "compute_missing", "plot_missing"] 17 | 18 | 19 | def plot_missing( 20 | df: Union[pd.DataFrame, dd.DataFrame], 21 | x: Optional[str] = None, 22 | y: Optional[str] = None, 23 | *, 24 | bins: int = 30, 25 | ncols: int = 30, 26 | ndist_sample: int = 100, 27 | dtype: Optional[DTypeDef] = None, 28 | ) -> Report: 29 | """ 30 | This function is designed to deal with missing values. 31 | It supports three call forms: plot_missing(df), plot_missing(df, x), 32 | and plot_missing(df, x, y). 33 | 34 | Parameters 35 | ---------- 36 | df 37 | the pandas data_frame for which plots are calculated for each column 38 | x 39 | a valid column name of the data frame 40 | y 41 | a valid column name of the data frame 42 | ncols 43 | The number of columns in the figure 44 | bins 45 | The number of bins in the histograms 46 | ndist_sample 47 | The number of sample points 48 | dtype: str or DType or dict of str or dict of DType, default None 49 | Specify Data Types for designated column or all columns. 50 | E.g. dtype = {"a": Continuous, "b": "Nominal"} or
51 | dtype = {"a": Continuous(), "b": "nominal"} 52 | or dtype = Continuous() or dtype = "Continuous" 53 | 54 | Examples 55 | -------- 56 | >>> from dataprep.eda import plot_missing 57 | >>> import pandas as pd 58 | >>> df = pd.read_csv("suicide-rate.csv") 59 | >>> plot_missing(df, "HDI_for_year") 60 | >>> plot_missing(df, "HDI_for_year", "population") 61 | """ 62 | itmdt = compute_missing( 63 | df, x, y, dtype=dtype, bins=bins, ncols=ncols, ndist_sample=ndist_sample 64 | ) 65 | fig = render_missing(itmdt) 66 | return Report(fig) 67 | -------------------------------------------------------------------------------- /dataprep/eda/intermediate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Intermediate class 3 | """ 4 | from typing import Any, Dict, Tuple, Union 5 | 6 | import pandas as pd 7 | 8 | 9 | class Intermediate(Dict[str, Any]): 10 | """ 11 | This class contains intermediate results. 12 | """ 13 | 14 | visual_type: str 15 | 16 | def __init__(self, *args: Any, **kwargs: Any): 17 | if ( 18 | len(args) == 1 19 | and isinstance(args[0], dict) 20 | and len(kwargs) == 1 21 | and "visual_type" in kwargs 22 | ): 23 | super().__init__(args[0]) 24 | self.visual_type = kwargs["visual_type"] 25 | elif len(args) == 0: 26 | visual_type = kwargs.pop("visual_type") 27 | super().__init__(**kwargs) 28 | self.visual_type = visual_type 29 | else: 30 | assert False, "Unsupported initialization" 31 | 32 | 33 | class ColumnsMetadata: 34 | """ 35 | Container for storing each column's metadata 36 | """ 37 | 38 | metadata: pd.DataFrame 39 | 40 | def __init__(self) -> None: 41 | self.metadata = pd.DataFrame() 42 | self.metadata.index.name = "Column Name" 43 | 44 | def __setitem__(self, key: Tuple[str, str], val: Any) -> None: 45 | col, vtype = key 46 | if ( 47 | isinstance(val, (tuple, list, dict)) 48 | and vtype 49 | not in self.metadata.columns # pylint: disable=unsupported-membership-test 50 | ): 51 | self.metadata[vtype] = pd.Series(dtype="object") 52 | 53 | self.metadata.loc[col, vtype] = val 54 | 55 | def __getitem__(self, key: Union[str, Tuple[str, str]]) -> Any: 56 | if isinstance(key, tuple): 57 | col, vtype = key 58 | return self.metadata.loc[col, vtype] 59 | else: 60 | return ColumnMetadata(self.metadata.loc[key]) 61 | 62 | 63 | class ColumnMetadata: 64 | """ 65 | Container for storing a single column's metadata. 66 | This is immutable. 67 | """ 68 | 69 | metadata: pd.Series 70 | 71 | def __init__(self, meta: pd.Series) -> None: 72 | self.metadata = meta 73 | 74 | def __getitem__(self, key: str) -> Any: 75 | return self.metadata.loc[key] 76 | -------------------------------------------------------------------------------- /dataprep/eda/correlation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot_correlation(df) function.
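plot_correlation computes an intermediate result via compute_correlation, renders it with render_correlation, and returns the figure wrapped in a Report.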
3 | """ 4 | 5 | from typing import Any, List, Optional, Tuple, Union 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute_correlation 12 | from .render import render_correlation 13 | from ..report import Report 14 | 15 | __all__ = ["render_correlation", "compute_correlation", "plot_correlation"] 16 | 17 | 18 | def plot_correlation( 19 | df: Union[pd.DataFrame, dd.DataFrame], 20 | x: Optional[str] = None, 21 | y: Optional[str] = None, 22 | *, 23 | value_range: Optional[Tuple[float, float]] = None, 24 | k: Optional[int] = None, 25 | ) -> Report: 26 | """ 27 | This function calculates the correlation between columns. 28 | It supports three call forms: plot_correlation(df), plot_correlation(df, x), 29 | and plot_correlation(df, x, y). 30 | Parameters such as k and value_range let you restrict the output to your needs. 31 | 32 | Parameters 33 | ---------- 34 | df 35 | The pandas data_frame for which plots are calculated for each column 36 | x 37 | A valid column name of the data frame 38 | y 39 | A valid column name of the data frame 40 | value_range 41 | The range of correlation values to display 42 | k 43 | Choose the top-k elements 44 | 45 | Examples 46 | -------- 47 | >>> from dataprep.eda import plot_correlation 48 | >>> import pandas as pd 49 | >>> df = pd.read_csv("suicide-rate.csv") 50 | >>> plot_correlation(df) 51 | >>> plot_correlation(df, k=6) 52 | >>> plot_correlation(df, "suicides") 53 | >>> plot_correlation(df, "suicides", k=3) 54 | >>> plot_correlation(df, "suicides", value_range=[-1, 0.3]) 55 | >>> plot_correlation(df, "suicides", value_range=[-1, 0.3], k=2) 56 | >>> plot_correlation(df, x="population", y="suicides_no") 57 | >>> plot_correlation(df, x="population", y="suicides", k=5) 58 | 59 | Note 60 | ---- 61 | This function only supports numerical or categorical data, 62 | and it is better to drop None, NaN and Null values before using it 63 | """ 64 | 65 | intermediate = compute_correlation(df, x=x, y=y, value_range=value_range, k=k) 66 | figure = render_correlation(intermediate) 67 | 68 | return Report(figure) 69 | -------------------------------------------------------------------------------- /dataprep/eda/report.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the Report class.
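A Report wraps a Bokeh layout so the result of a plot function can be saved to an HTML file or rendered inline in a notebook (e.g. plot(df).save(filename="report.html"), as in the tests above).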
3 | """ 4 | 5 | from pathlib import Path 6 | from tempfile import NamedTemporaryFile 7 | 8 | from bokeh.io import save 9 | from bokeh.models import LayoutDOM 10 | from bokeh.resources import CDN 11 | from IPython.display import HTML, display 12 | from jinja2 import Template 13 | 14 | INLINE_TEMPLATE = Template( 15 | """ 16 | {% from macros import embed %} 17 | {% block inner_body %} 18 | {% block contents %} 19 | {% for doc in docs %} 20 | {{ embed(doc) if doc.elementid }} 21 | {% for root in doc.roots %} 22 | {% block root scoped %} 23 | {{ embed(root) | indent(10) }} 24 | {% endblock %} 25 | {% endfor %} 26 | {% endfor %} 27 | {% endblock %} 28 | {{ plot_script | indent(8) }} 29 | {% endblock %} 30 | """ 31 | ) 32 | 33 | 34 | class Report: 35 | """ 36 | This class creates a customized Report object for the plot* functions 37 | """ 38 | 39 | to_render: LayoutDOM 40 | 41 | def __init__(self, to_render: LayoutDOM) -> None: 42 | self.to_render = to_render 43 | 44 | def save(self, filename: str) -> None: 45 | """ 46 | Save the report to the named HTML file. 47 | """ 48 | save( 49 | self.to_render, 50 | filename=filename, 51 | resources=CDN, 52 | title="DataPrep.EDA Report", 53 | ) 54 | 55 | def _repr_html_(self) -> str: 56 | # Windows forbids opening the file twice; as a result bokeh cannot 57 | # write to the still-open temporary file. 58 | with NamedTemporaryFile(suffix=".html", delete=False) as tmpf: 59 | pass 60 | 61 | save( 62 | self.to_render, 63 | filename=tmpf.name, 64 | resources=CDN, 65 | template=INLINE_TEMPLATE, 66 | title="DataPrep.EDA Report", 67 | ) 68 | with open(tmpf.name, "r") as f: 69 | output_html = f.read() 70 | 71 | # Delete the temporary file 72 | Path(tmpf.name).unlink() 73 | 74 | # Fix for bokeh: bokeh wrongly calls the "waiting for bokeh to load" function 75 | # inside "Bokeh.safely", which causes a "Bokeh not found" error because 76 | # Bokeh is not even loaded yet! 77 | patched_html = output_html.replace( 78 | "Bokeh.safely", 79 | "var __dataprep_bokeh_fix = (f) => document.Bokeh === undefined ? setTimeout(f, 1000) : f(); __dataprep_bokeh_fix", # pylint: disable=line-too-long 80 | ) 81 | # embed into report template created by us here 82 | return patched_html 83 | 84 | def show(self) -> None: 85 | """ 86 | Render the report. This is useful when calling plot in a for loop.
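Example (illustrative, assuming a DataFrame df):
>>> from dataprep.eda import plot
>>> for column in ["a", "b"]:
...     plot(df, column).show()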
87 | """ 88 | display(HTML(self._repr_html_())) 89 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "dataprep" 3 | version = "0.2.8" 4 | description = "Dataprep: Data Preparation in Python" 5 | authors = ["SFU Database System Lab "] 6 | maintainers = [ 7 | "Weiyuan Wu ", 8 | "Jinglin Peng ", 9 | "Pei Wang ", 10 | "Brandon Lockhart ", 11 | "Song Bian " 12 | ] 13 | 14 | license = "MIT" 15 | 16 | readme = "README.md" # Markdown files are supported 17 | 18 | repository = "https://github.com/sfu-db/dataprep" 19 | homepage = "https://github.com/sfu-db/dataprep" 20 | 21 | keywords = ["dataprep", "eda", "data connector", "data science", "exploratory data analysis", "data exploration"] 22 | 23 | classifiers = [ 24 | "Development Status :: 4 - Beta", 25 | "Topic :: Software Development :: Build Tools", 26 | "Environment :: Console", 27 | "Operating System :: OS Independent", 28 | "Intended Audience :: Science/Research", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Financial and Insurance Industry", 31 | "Intended Audience :: Healthcare Industry", 32 | "Topic :: Scientific/Engineering", 33 | "Framework :: IPython", 34 | ] 35 | 36 | [tool.poetry.dependencies] 37 | python = "^3.6.1" 38 | 39 | # Dependencies for EDA 40 | dask = { version = "~2.13", extras = [ "complete" ]} 41 | pandas = "~1.0" 42 | numpy = "~1.18" 43 | scipy = "~1.4" 44 | holoviews = "~1.13" 45 | bokeh = "~2.1" 46 | 47 | # Dependencies for DataConnector 48 | jsonschema = "~3.2" 49 | requests = "~2.23" 50 | jinja2 = "~2.11" 51 | jsonpath2 = "~0.4" 52 | lxml = "~4.5" 53 | nltk = "^3.5" 54 | pillow = "^7.1.2" 55 | wordcloud = "^1.7.0" 56 | 57 | [tool.poetry.dev-dependencies] 58 | pylint = "~2.4" 59 | pytest = "~5.4" 60 | mypy = "~0.770" 61 | black = "19.10b0" 62 | nbsphinx = "~0.5" 63 | sphinx = "^3" 64 | toml = "^0.10.0" 65 | rstcheck = "^3.3.1" 66 | sphinx-autobuild = "^0.7.1" 67 | pytest-cov = "^2.8.1" 68 | codecov = "^2.0.22" 69 | sphinx-autodoc-typehints = "^1.10.3" 70 | ipython = "^7.13.0" 71 | rope = "^0.16.0" 72 | 73 | [tool.black] 74 | line-length = 88 75 | target-version = ['py36', 'py37'] 76 | exclude = ''' 77 | ( 78 | /( 79 | \.eggs 80 | | \.git 81 | | \.pytype 82 | | \.pytest_cache 83 | | build 84 | | dist 85 | )/ 86 | ) 87 | ''' 88 | 89 | [tool.semantic_release] 90 | version_variable = "dataprep/__init__.py:__version__" 91 | version_source = "tag" 92 | commit_subject = "v{version}" 93 | commit_message = "Bump to v{version}" 94 | commit_author = "Weiyuan Wu " 95 | branch = "master" 96 | commit_version_number = true 97 | 98 | [build-system] 99 | requires = ["poetry>=1"] 100 | build-backend = "poetry.masonry.api" 101 | -------------------------------------------------------------------------------- /dataprep/assets/english_stopwords.py: -------------------------------------------------------------------------------- 1 | english_stopwords = [ 2 | "i", 3 | "me", 4 | "my", 5 | "myself", 6 | "we", 7 | "our", 8 | "ours", 9 | "ourselves", 10 | "you", 11 | "you're", 12 | "you've", 13 | "you'll", 14 | "you'd", 15 | "your", 16 | "yours", 17 | "yourself", 18 | "yourselves", 19 | "he", 20 | "him", 21 | "his", 22 | "himself", 23 | "she", 24 | "she's", 25 | "her", 26 | "hers", 27 | "herself", 28 | "it", 29 | "it's", 30 | "its", 31 | "itself", 32 | "they", 33 | "them", 34 | "their", 35 | "theirs", 36 | "themselves", 37 | "what", 38 | "which", 39 | "who", 40 | 
"whom", 41 | "this", 42 | "that", 43 | "that'll", 44 | "these", 45 | "those", 46 | "am", 47 | "is", 48 | "are", 49 | "was", 50 | "were", 51 | "be", 52 | "been", 53 | "being", 54 | "have", 55 | "has", 56 | "had", 57 | "having", 58 | "do", 59 | "does", 60 | "did", 61 | "doing", 62 | "a", 63 | "an", 64 | "the", 65 | "and", 66 | "but", 67 | "if", 68 | "or", 69 | "because", 70 | "as", 71 | "until", 72 | "while", 73 | "of", 74 | "at", 75 | "by", 76 | "for", 77 | "with", 78 | "about", 79 | "against", 80 | "between", 81 | "into", 82 | "through", 83 | "during", 84 | "before", 85 | "after", 86 | "above", 87 | "below", 88 | "to", 89 | "from", 90 | "up", 91 | "down", 92 | "in", 93 | "out", 94 | "on", 95 | "off", 96 | "over", 97 | "under", 98 | "again", 99 | "further", 100 | "then", 101 | "once", 102 | "here", 103 | "there", 104 | "when", 105 | "where", 106 | "why", 107 | "how", 108 | "all", 109 | "any", 110 | "both", 111 | "each", 112 | "few", 113 | "more", 114 | "most", 115 | "other", 116 | "some", 117 | "such", 118 | "no", 119 | "nor", 120 | "not", 121 | "only", 122 | "own", 123 | "same", 124 | "so", 125 | "than", 126 | "too", 127 | "very", 128 | "s", 129 | "t", 130 | "can", 131 | "will", 132 | "just", 133 | "don", 134 | "don't", 135 | "should", 136 | "should've", 137 | "now", 138 | "d", 139 | "ll", 140 | "m", 141 | "o", 142 | "re", 143 | "ve", 144 | "y", 145 | "ain", 146 | "aren", 147 | "aren't", 148 | "couldn", 149 | "couldn't", 150 | "didn", 151 | "didn't", 152 | "doesn", 153 | "doesn't", 154 | "hadn", 155 | "hadn't", 156 | "hasn", 157 | "hasn't", 158 | "haven", 159 | "haven't", 160 | "isn", 161 | "isn't", 162 | "ma", 163 | "mightn", 164 | "mightn't", 165 | "mustn", 166 | "mustn't", 167 | "needn", 168 | "needn't", 169 | "shan", 170 | "shan't", 171 | "shouldn", 172 | "shouldn't", 173 | "wasn", 174 | "wasn't", 175 | "weren", 176 | "weren't", 177 | "won", 178 | "won't", 179 | "wouldn", 180 | "wouldn't", 181 | ] 182 | -------------------------------------------------------------------------------- /dataprep/data_connector/config_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for config downloading and maintaining 3 | """ 4 | from json import dump as jdump 5 | from pathlib import Path 6 | from shutil import rmtree 7 | from tempfile import gettempdir 8 | from typing import cast 9 | 10 | import requests 11 | 12 | META_URL = ( 13 | "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/_meta.json" 14 | ) 15 | TABLE_URL = ( 16 | "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/{}.json" 17 | ) 18 | GIT_REF_URL = "https://api.github.com/repos/sfu-db/DataConnectorConfigs/git/refs/heads" 19 | 20 | 21 | def config_directory() -> Path: 22 | """ 23 | Returns the config directory path 24 | """ 25 | tmp = gettempdir() 26 | return Path(tmp) / "dataprep" / "data_connector" 27 | 28 | 29 | def ensure_config(impdb: str) -> bool: 30 | """ 31 | Ensure the config for `impdb` is downloaded 32 | """ 33 | path = config_directory() 34 | obsolete = is_obsolete(impdb) 35 | 36 | if (path / impdb).exists() and not obsolete: 37 | return True 38 | else: 39 | download_config(impdb) 40 | return False 41 | 42 | 43 | def is_obsolete(impdb: str) -> bool: 44 | """ 45 | Test if the implicit db config files are obsolete 46 | and need to be re-downloaded. 
47 | """ 48 | path = config_directory() 49 | if not (path / impdb).exists(): 50 | return True 51 | elif not (path / impdb / "_hash").exists(): 52 | return True 53 | else: 54 | with open(path / impdb / "_hash", "r") as f: 55 | githash = f.read() 56 | 57 | sha = get_git_master_hash() 58 | 59 | return githash != sha 60 | 61 | 62 | def get_git_master_hash() -> str: 63 | """ 64 | Get current config files repo's hash 65 | """ 66 | refs = requests.get(GIT_REF_URL).json() 67 | (sha,) = [ref["object"]["sha"] for ref in refs if ref["ref"] == "refs/heads/master"] 68 | return cast(str, sha) 69 | 70 | 71 | def download_config(impdb: str) -> None: 72 | """ 73 | Download the config from Github into the temp directory. 74 | """ 75 | url = META_URL.format(impdb) 76 | meta = requests.get(url).json() 77 | tables = meta["tables"] 78 | 79 | sha = get_git_master_hash() 80 | # In case we push a new config version to github when the user is downloading 81 | while True: 82 | configs = {"_meta": meta} 83 | for table in tables: 84 | url = TABLE_URL.format(impdb, table) 85 | config = requests.get(url).json() 86 | configs[table] = config 87 | sha_check = get_git_master_hash() 88 | 89 | if sha_check == sha: 90 | break 91 | 92 | sha = sha_check 93 | 94 | path = config_directory() 95 | 96 | if (path / impdb).exists(): 97 | rmtree(path / impdb) 98 | 99 | (path / impdb).mkdir(parents=True) 100 | for fname, json in configs.items(): 101 | with (path / impdb / f"{fname}.json").open("w") as f: 102 | jdump(json, f) 103 | 104 | with (path / impdb / "_hash").open("w") as f: 105 | f.write(sha) 106 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from pathlib import Path 16 | from typing import cast 17 | 18 | import toml 19 | 20 | sys.path.insert(0, os.path.abspath("../../")) 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = "dataprep" 25 | copyright = "2020, SFU Database System Lab" 26 | author = "SFU Database System Lab" 27 | 28 | # The full version, including alpha/beta/rc tags 29 | def get_version() -> str: 30 | """ 31 | Get the library version from pyproject.toml 32 | """ 33 | path = Path(__file__).resolve().parents[2] / "pyproject.toml" 34 | pyproject = toml.loads(open(str(path)).read()) 35 | return cast(str, pyproject["tool"]["poetry"]["version"]) 36 | 37 | 38 | release = get_version() 39 | 40 | 41 | # -- General configuration --------------------------------------------------- 42 | 43 | # Add any Sphinx extension module names here, as strings. They can be 44 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 45 | # ones. 
46 | extensions = [ 47 | "sphinx.ext.todo", 48 | "sphinx.ext.viewcode", 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.napoleon", 51 | "nbsphinx", 52 | "sphinx_autodoc_typehints", 53 | ] 54 | 55 | autodoc_typehints = "description" 56 | # Napoleon settings 57 | napoleon_google_docstring = False 58 | napoleon_numpy_docstring = True 59 | napoleon_include_init_with_doc = False 60 | napoleon_include_private_with_doc = False 61 | napoleon_include_special_with_doc = False 62 | napoleon_use_admonition_for_examples = False 63 | napoleon_use_admonition_for_notes = False 64 | napoleon_use_admonition_for_references = False 65 | napoleon_use_ivar = False 66 | napoleon_use_param = True 67 | napoleon_use_rtype = True 68 | napoleon_use_keyword = True 69 | napoleon_custom_sections = None 70 | 71 | # autodoc_default_options = { 72 | # "members": True, 73 | # "member-order": "bysource", 74 | # "special-members": "__init__", 75 | # } 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ["_templates"] 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This pattern also affects html_static_path and html_extra_path. 83 | exclude_patterns = [] 84 | 85 | master_doc = "index" 86 | 87 | # -- Options for HTML output ------------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = "nature" 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ["_static"] 98 | -------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | build-docs: 2 | poetry run sphinx-build -M html docs/source docs/build 3 | 4 | publish-docs: build-docs 5 | touch docs/build/html/.nojekyll 6 | gh-pages --dotfiles --message "[skip ci] Updates" --dist docs/build/html 7 | 8 | gen-apidocs: 9 | poetry run sphinx-apidoc --ext-doctest --ext-autodoc --ext-mathjax -f -o docs/source dataprep 10 | 11 | black: 12 | poetry run black dataprep 13 | 14 | ci: format ci-black typeck test lint 15 | 16 | ci-black: 17 | poetry run black --check --quiet dataprep 18 | 19 | format: 20 | poetry run black dataprep 21 | 22 | typeck: ci-mypy 23 | 24 | test: 25 | poetry run pytest dataprep 26 | 27 | testf +ARGS="dataprep": 28 | poetry run pytest {{ARGS}} 29 | 30 | lint: 31 | poetry run pylint dataprep 32 | 33 | ci-mypy: 34 | poetry run mypy dataprep 35 | 36 | build: 37 | poetry build 38 | 39 | release version: 40 | #! /usr/bin/env bash 41 | 42 | # Sanity checks 43 | 44 | arr=(major minor patch) 45 | 46 | if [[ " ${arr[*]} " != *" {{version}} "* ]]; then 47 | echo "version must be one of 'major', 'minor', 'patch', got '{{version}}'"; 48 | exit 1; 49 | fi 50 | 51 | if [ ! -z "$(git status --porcelain)" ]; then echo "Git tree is not clean, commit first"; exit 1; fi 52 | 53 | if [ ! -z "$(git rev-parse --verify release)" ]; then echo "delete the existing release branch before new release"; exit 1; fi 54 | 55 | # Pre bump the version to get the next version number 56 | git checkout develop 57 | 58 | vstring="$(poetry version {{version}})" 59 | if [ $? 
-ne 0 ]; then 60 | echo $vstring; 61 | exit 1; 62 | fi 63 | 64 | from_version=$(echo "${vstring}" | sed -nr "s/^Bumping version from ([0-9]+\.[0-9]+\.[0-9]+) to ([0-9]+\.[0-9]+\.[0-9]+)$/\1/p") 65 | to_version=$(echo "${vstring}" | sed -nr "s/^Bumping version from ([0-9]+\.[0-9]+\.[0-9]+) to ([0-9]+\.[0-9]+\.[0-9]+)$/\2/p") 66 | 67 | git checkout pyproject.toml # clean up 68 | 69 | echo "Releasing from ${from_version} to ${to_version}?" 70 | select yn in "Yes" "No"; do 71 | case $yn in 72 | Yes ) break;; 73 | No ) git checkout pyproject.toml; git checkout develop; git branch -D release; exit;; 74 | esac 75 | done 76 | 77 | # Beginning of the real work! 78 | 79 | # Create new release branch 80 | git checkout -b "release/v${to_version}" develop 81 | 82 | poetry version {{version}} 83 | 84 | echo "Creating release commit" 85 | git add pyproject.toml 86 | semantic-release version --{{version}} 87 | 88 | # echo "Merge release/v${to_version} to master & develop" 89 | # git checkout master 90 | # git merge "release/v${to_version}" 91 | 92 | # git checkout develop 93 | # git merge "release/v${to_version}" 94 | 95 | echo "Push branch and tag to remote" 96 | git push origin "release/v${to_version}":master 97 | git push origin "release/v${to_version}":develop 98 | git push origin "release/v${to_version}" 99 | git push origin "v${to_version}" 100 | 101 | echo "Build artifacts" 102 | poetry build 103 | 104 | echo "Creating release draft" 105 | semantic-release changelog | sed "1iv${to_version}\n" | hub release create -d -a "dist/dataprep-${to_version}-py3-none-any.whl" -a "dist/dataprep-${to_version}.tar.gz" -F - "v${to_version}" 106 | 107 | 108 | 109 | 110 | @ensure-git-clean: 111 | if [ ! -z "$(git status --porcelain)" ]; then echo "Git tree is not clean, commit first"; exit 1; fi -------------------------------------------------------------------------------- /docs/source/data_connector.rst: -------------------------------------------------------------------------------- 1 | ================================================================================== 2 | dataprep.data_connector: fetching data from popular websites with a simplified API 3 | ================================================================================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 9 | Overview 10 | ========== 11 | data_connector is a component in the dataprep library that aims to simplify data access by providing a standard API set. 12 | The goal is to help users skip the complex API configuration. 13 | We illustrate how to use the data_connector library with Yelp. 14 | 15 | 16 | Initializing a connector class for a website 17 | ============================================= 18 | The first step is to initialize a Connector class with the configuration file location and access token specified (`How to get an access token? 19 | `_). 20 | Available configuration files can be manually downloaded here: `Configuration Files 21 | `_ or automatically downloaded on first use. 22 | To initialize a data_connector:: 23 | 24 | from dataprep.data_connector import Connector 25 | dc = Connector("./DataConnectorConfigs/yelp", auth_params={"access_token":access_token}) 26 | 27 | 28 | Getting the guideline of the connector with `Connector.info` 29 | ================================================================= 30 | | Connector's info method gives information and guidelines on using the connector. In the example below, the response shows three things. 31 | | a. There is one table in Yelp, i.e. Yelp.businesses. 32 | | b.
To query this table, the term and location parameters are required and the longitude and latitude parameters are optional (see Connector.query() section). 33 | | c. Examples of calling the methods in the Connector class. 34 | 35 | :: 36 | 37 | dc.info 38 | 39 | .. image:: _static/images/data_connector/info.png 40 | :align: center 41 | :width: 496 42 | :height: 215 43 | 44 | 45 | 46 | Understanding web data with `Connector.show_schema()` 47 | ============================================================ 48 | show_schema(table name) returns the schema of the web data as a dataframe. 49 | There are two columns in the response. 50 | The first column is the column name and the second is the datatype. 51 | 52 | :: 53 | 54 | dc.show_schema('businesses') 55 | 56 | 57 | .. image:: _static/images/data_connector/show_schema.png 58 | :align: center 59 | :width: 208 60 | :height: 458 61 | 62 | 63 | Getting web data with `Connector.query()` 64 | ================================================= 65 | The `query()` method downloads the website data. 66 | The parameters should meet the requirements in `Connector.info`. 67 | Usually the raw data is returned in JSON or XML format. 68 | data_connector reformats the data into a pandas dataframe for the convenience of downstream operations. 69 | 70 | :: 71 | 72 | df = dc.query('businesses', term="korean", location="seattle") 73 | df 74 | 75 | .. image:: _static/images/data_connector/query.png 76 | :align: center 77 | :width: 870 78 | :height: 491 79 | 80 | 81 | Advanced: writing your own data_connector configuration file 82 | ============================================================== 83 | A configuration file defines the information necessary to fetch data from a website, e.g. the request URL; the API authorization type; the parameters needed from the users (API key, search keyword, etc.); the returned data's schema. 84 | All of this information is reusable. 85 | To write a configuration file for your own needs or to modify an existing one, please refer to `Configuration Files 86 | `_. 87 | 88 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | jobs: 3 | install_dependencies: 4 | docker: 5 | - image: circleci/python:3.7.2 6 | steps: 7 | - &step_add_path 8 | run: 9 | name: Add python user PATH into PATH 10 | command: echo "export PATH=$PATH:$HOME/.local/bin" >> $BASH_ENV 11 | - &step_install_pipenv 12 | run: 13 | name: Install python tools 14 | command: pip install --user poetry==1.0.0b9 15 | - &step_inproject_venv 16 | run: 17 | name: Set venv inproject 18 | command: poetry config virtualenvs.in-project true 19 | - checkout 20 | - run: 21 | name: Install dependencies 22 | command: poetry install 23 | no_output_timeout: 1200 24 | - run: 25 | name: Print tool versions 26 | command: poetry run mypy --version && poetry run pylint --version && poetry run pytest --version && poetry run black --version 27 | - persist_to_workspace: 28 | root: . 29 | paths: .venv 30 | check: 31 | docker: 32 | - image: circleci/python:3.7.2 33 | steps: 34 | - *step_add_path 35 | - *step_install_pipenv 36 | - *step_inproject_venv 37 | - checkout 38 | - attach_workspace: 39 | at: .
40 | - run: 41 | name: Check if the code is formatted 42 | command: poetry run black --check --quiet dataprep 43 | - run: 44 | name: Type check the project 45 | command: poetry run mypy dataprep 46 | - run: 47 | name: Test the project 48 | command: poetry run pytest --cov=dataprep 49 | - run: 50 | name: Style check the project 51 | command: poetry run pylint dataprep 52 | - run: 53 | name: Update coverage data to codecov 54 | command: poetry run codecov 55 | docs-build: 56 | docker: 57 | - image: circleci/python:3.7.2 58 | steps: 59 | - run: 60 | name: Pandoc Installation 61 | command: curl -L https://github.com/jgm/pandoc/releases/download/2.9.2.1/pandoc-2.9.2.1-1-amd64.deb -o /tmp/pandoc.deb && sudo dpkg -i /tmp/pandoc.deb 62 | - *step_add_path 63 | - *step_install_pipenv 64 | - *step_inproject_venv 65 | - checkout 66 | - attach_workspace: 67 | at: . 68 | - run: 69 | name: Build docs 70 | command: poetry run sphinx-build -M html docs/source docs/build 71 | - persist_to_workspace: 72 | root: . 73 | paths: docs/build/html 74 | docs-deploy: 75 | docker: 76 | - image: node:8.10.0 77 | steps: 78 | - add_ssh_keys: 79 | fingerprints: 80 | - "b7:f1:2a:54:c8:90:80:78:ba:30:d9:9b:b8:7d:03:10" 81 | - checkout 82 | - attach_workspace: 83 | at: . 84 | - run: 85 | name: Install and configure dependencies 86 | command: | 87 | npm install -g --silent gh-pages@2.0.1 88 | git config user.email "ci@sfu.db" 89 | git config user.name "ci" 90 | - run: 91 | name: Disable jekyll builds 92 | command: touch docs/build/html/.nojekyll 93 | - run: 94 | name: Deploy docs to gh-pages branch 95 | command: gh-pages --dotfiles --message "[skip ci] Updates" --dist docs/build/html 96 | workflows: 97 | version: 2 98 | build_and_test: 99 | jobs: 100 | - install_dependencies 101 | - check: 102 | requires: 103 | - install_dependencies 104 | - docs-build: 105 | requires: 106 | - install_dependencies 107 | - docs-deploy: 108 | requires: 109 | - check 110 | - docs-build 111 | filters: 112 | branches: 113 | only: master -------------------------------------------------------------------------------- /docs/source/eda/plot_missing.rst: -------------------------------------------------------------------------------- 1 | ====================================================== 2 | `plot_missing`: analyzing the impact of missing values 3 | ====================================================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | Overview 9 | ======== 10 | 11 | The goal of `plot_missing` is to analyze the impact of missing values. The impact means the change of characteristics (e.g., histogram for numerical column or bar chart for categorical column) of the dataset before and after removing the rows with missing values. `plot_missing` mainly provides the following functionalities: 12 | 13 | 1. `plot_missing(df)`: plot the position of missing values. 14 | 2. `plot_missing(df, x)`: plot the impact on basic characteristics (histogram and bar chart) of missing values in column x to all other columns. 15 | 3. `plot_missing(df, x, y)`: zoom into column y, and plot the impact on more characteristics of missing values in column x to column y. 16 | 17 | In the following, we use several examples to demonstrate the functionalities. 18 | 19 | Loading dataset 20 | =============== 21 | We support two types of dataframe: pandas dataframe and dask dataframe. 
Here we load the well-known `Titanic` dataset into a pandas dataframe and use it to demonstrate our functionality:: 22 | 23 | import pandas as pd 24 | df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl", na_values = ['?']) 25 | 26 | Plotting the position of missing values via `plot_missing(df)` 27 | ============================================================== 28 | 29 | Given a dataset, we can plot the position of missing values via plot_missing(df). The dataset is divided into bins, and we use colored bins to represent the number of missing values. The more missing values a bin contains, the darker its color. By default, we show 50 columns and each column is divided into 100 bins. We also show the percentage of missing values for each column in the label. The following is an example:: 30 | 31 | from dataprep.eda import plot_missing 32 | plot_missing(df) 33 | 34 | .. raw:: html 35 | 36 | 37 | 38 | 39 | The impact on basic characteristics of missing values in column x via `plot_missing(df, x)` 40 | =========================================================================================== 41 | 42 | After we know the positions of the missing values, we can further analyze their impact. We provide `plot_missing(df, x)` to analyze the impact of missing values in column x. The impact means the change in the characteristics of the dataset before and after removing the missing values. Here, we consider two types of characteristics: the histogram for numerical columns and the bar chart for categorical columns. When calling `plot_missing(df, x)`, users can see how the histograms of numerical columns and the bar charts of categorical columns differ before and after removing the missing values of column x. The following shows an example:: 43 | 44 | plot_missing(df, "age") 45 | 46 | .. raw:: html 47 | 48 | 49 | 50 | 51 | The impact on more characteristics of missing values in column x on column y via `plot_missing(df, x, y)` 52 | ========================================================================================================= 53 | 54 | `plot_missing(df, x)` only considers two types of characteristics, i.e., histogram and bar chart, for all columns. If the user wants to zoom into a specific column and analyze the impact on more characteristics, they can call `plot_missing(df, x, y)`. `plot_missing(df, x, y)` plots the impact of the missing values in column x on column y. The output plot is different depending on whether y is a numerical or a categorical column. 55 | 56 | When y is a numerical column, `plot_missing(df, x, y)` shows the impact on the histogram, pdf, cdf, and box plot. The following shows an example:: 57 | 58 | plot_missing(df, "age", "fare") 59 | 60 | .. raw:: html 61 | 62 | 63 | 64 | When y is a categorical column, `plot_missing(df, x, y)` shows the impact on the bar chart. The following shows an example:: 65 | 66 | plot_missing(df, "age", "sex") 67 | 68 | .. raw:: html 69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ----------------- 4 | 5 | [![License]](LICENSE) [![Doc Badge]](https://sfu-db.github.io/dataprep/) [![Version]](https://pypi.org/project/dataprep/) [![Python Version]](https://pypi.org/project/dataprep/) [![Downloads]](https://pepy.tech/project/dataprep) [![Codecov]](https://codecov.io/gh/sfu-db/dataprep) ![Build Status] [![Chat]](https://discord.gg/xwbkFNk) 6 | 7 | Dataprep lets you prepare your data using a single library with a few lines of code. 8 | 9 | Currently, you can use `dataprep` to: 10 | * Collect data from common data sources (through `dataprep.data_connector`) 11 | * Do your exploratory data analysis (through `dataprep.eda`) 12 | * ...more modules are coming 13 | 14 | 15 | [Documentation] | [Mail List & Forum] 16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install dataprep 21 | ``` 22 | 23 | ## Examples & Usages 24 | 25 | The following examples can give you an impression of what dataprep can do: 26 | 27 | * [Documentation: Data Connector](https://sfu-db.github.io/dataprep/data_connector.html) 28 | * [Documentation: EDA](https://sfu-db.github.io/dataprep/eda/introduction.html) 29 | * [EDA Case Study: Titanic](https://sfu-db.github.io/dataprep/case_study/titanic.html) 30 | * [EDA Case Study: House Price](https://sfu-db.github.io/dataprep/case_study/house_price.html) 31 | 32 | ### EDA 33 | 34 | There are common tasks during the exploratory data analysis stage, 35 | like a quick look at the columnar distribution, or understanding the correlations 36 | between columns. 37 | 38 | The EDA module categorizes these EDA tasks into functions helping you finish EDA 39 | tasks with a single function call. 40 | 41 | * Want to understand the distributions for each DataFrame column? Use `plot`. 42 | 43 |
44 | 45 | * Want to understand the correlation between columns? Use `plot_correlation`. 46 | 47 |
48 | 49 | * Or, if you want to understand the impact of the missing values for each column, use `plot_missing`. 50 | 51 |
52 | 53 | * You can drill down to get more information by giving `plot`, `plot_correlation` and `plot_missing` a column name. E.g. for `plot_missing` (see the sketch below): 54 | 55 |
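For example, a minimal sketch of the calls behind the screenshots above (assuming `df` is a pandas DataFrame you have already loaded; `"age"` is a placeholder column name):

```python
from dataprep.eda import plot, plot_correlation, plot_missing

plot(df)                 # distribution / bar chart for every column
plot_correlation(df)     # correlation matrix between columns
plot_missing(df)         # positions and percentages of missing values
plot_missing(df, "age")  # drill down: impact of the missing values in one column
```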
56 | 57 | Don't forget to check out the [examples] folder for detailed demonstrations! 58 | 59 | ### Data Connector 60 | 61 | You can download Yelp business search results into a pandas DataFrame, 62 | using two lines of code, without taking a deep look into the Yelp documentation! 63 | Moreover, Data Connector will automatically do the pagination for you so that 64 | you can specify the desired count of the returned results without even considering the count-per-request restriction from the API. 65 | 66 |
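A sketch of those two lines, under stated assumptions: the config path, `term`, and `location` mirror the documentation; `access_token` is a Yelp API key you supply; and the `_count` keyword (asking for 120 records) is an assumed name for the pagination parameter, not verified here:

```python
from dataprep.data_connector import Connector

dc = Connector("./DataConnectorConfigs/yelp", auth_params={"access_token": access_token})
df = dc.query("businesses", term="korean", location="seattle", _count=120)  # _count: assumed pagination parameter
```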
67 | 68 | _The code requests 120 records even though Yelp restricts you to fetching only 50 per request._ 69 | 70 | ## Contribute 71 | 72 | There are many ways to contribute to Dataprep. 73 | 74 | * Submit bugs and help us verify fixes as they are checked in. 75 | * Review the source code changes. 76 | * Engage with other Dataprep users and developers on StackOverflow. 77 | * Help each other in the [Dataprep Community Discord](https://discord.gg/FXsK2P) and [Mail list & Forum]. 78 | * [![Twitter]](https://twitter.com/sfu_db) 79 | * Contribute bug fixes. 80 | * Provide use cases and write down your user experience. 81 | 82 | Please take a look at our [wiki] for development documentation! 83 | 84 | 85 | [Build Status]: https://img.shields.io/circleci/build/github/sfu-db/dataprep/master?style=flat-square&token=f68e38757f5c98771f46d1c7e700f285a0b9784d 86 | [Documentation]: https://sfu-db.github.io/dataprep/ 87 | [Mail list & Forum]: https://groups.google.com/forum/#!forum/dataprep 88 | [wiki]: https://github.com/sfu-db/dataprep/wiki 89 | [examples]: https://github.com/sfu-db/dataprep/tree/master/examples 90 | [Chat]: https://img.shields.io/discord/702765817154109472?style=flat-square 91 | [License]: https://img.shields.io/pypi/l/dataprep?style=flat-square 92 | [Downloads]: https://pepy.tech/badge/dataprep 93 | [Python Version]: https://img.shields.io/pypi/pyversions/dataprep?style=flat-square 94 | [Version]: https://img.shields.io/pypi/v/dataprep?style=flat-square 95 | [Codecov]: https://img.shields.io/codecov/c/github/sfu-db/dataprep?style=flat-square 96 | [Twitter]: https://img.shields.io/twitter/follow/sfu_db?style=social 97 | [Doc Badge]: https://img.shields.io/badge/dynamic/json?color=blue&label=docs&prefix=v&query=%24.info.version&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fdataprep%2Fjson&style=flat-square 98 | -------------------------------------------------------------------------------- /docs/source/eda/introduction.rst: -------------------------------------------------------------------------------- 1 | 2 | An introduction to exploratory data analysis with `dataprep.eda` 3 | ================================================================ 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | .. topic:: Section contents 9 | 10 | In this section, we introduce how to do exploratory data analysis with `dataprep.eda` and give several 11 | simple examples. 12 | 13 | Exploratory data analysis: functionality description 14 | ---------------------------------------------------- 15 | 16 | `Exploratory data analysis (EDA) `_ is the procedure of exploring the dataset and summarizing its main characteristics. The goal of the `dataprep.eda` module is to simplify this procedure and allow users to explore as many important characteristics as possible via only a few APIs. Each API allows users to analyze the dataset from high level to low level and from different perspectives. Specifically, we provide the following functionalities: 17 | 18 | * **analyzing basic characteristics via `plot`**: we provide an API `plot` that allows users to analyze the basic characteristics of the dataset. It plots the distribution or bar chart for each column to give users a basic sense of the dataset. If the user is interested in one or two specific columns, it provides more detailed plots for those columns when the column names are passed as parameters. 19 | 20 | * **analyzing correlation between columns via `plot_correlation`**: We provide an API `plot_correlation` to analyze the correlation between columns. It plots the correlation matrix between columns.
If the user is interested in the correlated columns for a specific column, e.g., the most correlated columns to column 'A', the API can provide a more detailed analysis when column names are passed as parameters. 21 | 22 | * **analyzing the impact of missing values via `plot_missing`**: We provide an API `plot_missing` to analyze the pattern and impact of missing values. At first glance, it shows the position of missing values, which allows the user to be aware of the data quality of each column or find any underlying pattern of missing values. To understand the impact of missing values from a specific column, the user can pass the column name as a parameter. It will compare the distribution of each column with and without missing values from the given column, so that the user can understand the impact of the missing values. 23 | 24 | In the following, we briefly introduce `plot`, `plot_correlation` and `plot_missing` and demonstrate their basic functionalities. 25 | 26 | .. _demo: 27 | 28 | Analyzing basic characteristics via `plot` 29 | ------------------------------------------ 30 | 31 | To analyze the basic characteristics of the dataset, such as the distribution of each column, the user can call `eda.plot`. It mainly provides the following functionalities: 32 | 33 | 1. plot(df): plot basic characteristics (the histogram and the bar chart) for all columns. 34 | 2. plot(df, x): zoom into column x and plot more refined characteristics. 35 | 3. plot(df, x, y): zoom into column x and column y, and plot more refined characteristics to explore their relationship. 36 | 37 | In the following, we show an example of `plot(df)`, which plots the histogram for each numerical column and the bar chart for each categorical column:: 38 | 39 | from dataprep.eda import plot 40 | import pandas as pd 41 | df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/phpMawTba", na_values = [' ?']) 42 | plot(df) 43 | 44 | .. raw:: html 45 | 46 | 47 | 48 | 49 | Analyzing correlation via `plot_correlation` 50 | -------------------------------------------- 51 | To analyze the correlation between columns, we provide `plot_correlation`. Its main functionalities can be summarized as follows: 52 | 53 | 1. `plot_correlation(df)`: plot the correlation matrix of all columns. 54 | 2. `plot_correlation(df, x)`: plot the most correlated columns to column x. 55 | 3. `plot_correlation(df, x, y)`: plot the scatter plot between column x and column y, as well as the regression line. Besides, the point that has the most impact on the correlation value can be identified by passing a parameter. 56 | 4. `plot_correlation(df, x, y, k, value_range)`: filter the result by correlation value or by top-k. 57 | 58 | In the following, we show an example of `plot_correlation(df)`, which plots the correlation matrix for `Pearson `_, `Spearman `_ and `Kendall Tau `_ correlation:: 59 | 60 | from dataprep.eda import plot_correlation 61 | import pandas as pd 62 | df = pd.read_csv("https://www.openml.org/data/get_csv/4965268/wine-quality-red.arff") 63 | plot_correlation(df) 64 | 65 | .. raw:: html 66 | 67 | 68 | 69 | 70 | Analyzing missing values via `plot_missing` 71 | ------------------------------------------- 72 | To analyze the pattern and impact of missing values, we provide `plot_missing`. Its main functionalities can be summarized as follows: 73 | 74 | 1. `plot_missing(df)`: plot the position of missing values. 75 | 2.
`plot_missing(df, x)`: plot the impact on basic characteristics (histogram and bar chart) of missing values in column x to all other columns. 76 | 3. `plot_missing(df, x, y)`: zoom into column y, and plot the impact on more characteristics of missing values in column x to column y. 77 | 78 | In the following, we show an example of `plot_missing(df)`, which shows the positions of missing values as well as the percentage of missing value in each column:: 79 | 80 | from dataprep.eda import plot_missing 81 | import pandas as pd 82 | df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl", na_values = ['?']) 83 | plot_missing(df) 84 | 85 | .. raw:: html 86 | 87 | -------------------------------------------------------------------------------- /examples/DataConnector_DBLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Connector for DBLP \n", 8 | "\n", 9 | "In this example, we will be going over how to use Data Connector with DBLP." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Preprocessing\n", 17 | "\n", 18 | "data_connector is a component in the dataprep library that aims to simplify the data access by providing a standard API set. The goal is to help the users skip the complex API configuration. In this tutorial, we demonstrate how to use data_connector library with DBLP.\n", 19 | "\n", 20 | "If you haven't installed dataprep, run command `pip install dataprep` or execute the following cell." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "># Run me if you'd like to install\n", 30 | ">!pip install dataprep" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Download and store the configuration files in dataprep. \n", 38 | "\n", 39 | "The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs) or automatically downloaded at usage. \n", 40 | "\n", 41 | "Store the configuration file in the dataprep folder. " 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Initialize data_connector\n", 49 | "\n", 50 | "To initialize run the following code. Unlike Yelp and Spotify, tokens and client information are not needed." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from dataprep.data_connector import Connector\n", 60 | "dc = Connector(\"./DataConnectorConfigs/DBLP\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Functionalities\n", 68 | "\n", 69 | "Data connector has several functions you can perform to gain insight on the data downloaded from DBLP." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Connector.info\n", 77 | "The info method gives information and guidelines of using the connector. There are 3 sections in the response and they are table, parameters and examples.\n", 78 | ">1. Table - The table(s) being accessed.\n", 79 | ">2. Parameters - Identifies which parameters can be used to call the method. 
For DBLP, there is no required **parameter**. \n", 80 | ">3. Examples - Shows how you can call the methods in the Connector class." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "dc.info()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Connector.show_schema\n", 97 | "The show_schema method returns the schema of the website data to be returned in a Dataframe. There are two columns in the response. The first column is the column name and the second is the datatype.\n", 98 | "\n", 99 | "As an example, let's see what is in the publication table." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "dc.show_schema(\"publication\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Connector.query\n", 116 | "The query method downloads the website data and displays it in a Dataframe. The parameters must meet the requirements as indicated in Connector.info for the operation to run.\n", 117 | "\n", 118 | "When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations.\n", 119 | "\n", 120 | "As an example, let's try to get the data from the \"publication\" table, searching for \"lee\" (this assumes the DBLP configuration names its search parameter `q`)." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df = dc.query(\"publication\", q=\"lee\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "From the query results, you can see how easy it is to download the publication data from DBLP into a pandas Dataframe.\n", 137 | "\n", 138 | "Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code.\n", 139 | "\n", 140 | "\n", 141 | ">1. dc = Connector(...)\n", 142 | ">2. dc.query(...)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# That's all for now. \n", 150 | "If you are interested in writing your own configuration file or modifying an existing one, refer to the [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs)."
151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.7.7" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 4 175 | } 176 | -------------------------------------------------------------------------------- /dataprep/eda/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous functions 2 | """ 3 | import logging 4 | from math import ceil 5 | from typing import Any, Union, Optional 6 | import dask.dataframe as dd 7 | import numpy as np 8 | import pandas as pd 9 | from bokeh.models import Legend 10 | from bokeh.plotting import Figure 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def is_notebook() -> Any: 16 | """ 17 | :return: whether it is running in jupyter notebook 18 | """ 19 | try: 20 | # pytype: disable=import-error 21 | from IPython import get_ipython # pylint: disable=import-outside-toplevel 22 | 23 | # pytype: enable=import-error 24 | 25 | shell = get_ipython().__class__.__name__ 26 | if shell == "ZMQInteractiveShell": 27 | return True 28 | return False 29 | except (NameError, ImportError): 30 | return False 31 | 32 | 33 | def to_dask(df: Union[pd.DataFrame, dd.DataFrame]) -> dd.DataFrame: 34 | """ 35 | Convert a dataframe to a dask dataframe. 36 | """ 37 | if isinstance(df, dd.DataFrame): 38 | return df 39 | 40 | df_size = df.memory_usage(deep=True).sum() 41 | npartitions = ceil(df_size / 128 / 1024 / 1024) 42 | return dd.from_pandas(df, npartitions=npartitions) 43 | 44 | 45 | def sample_n(arr: np.ndarray, n: int) -> np.ndarray: # pylint: disable=C0103 46 | """ 47 | Sample n values uniformly from the range of the `arr`, 48 | not from the distribution of `arr`'s elems. 49 | """ 50 | if len(arr) <= n: 51 | return arr 52 | 53 | subsel = np.linspace(0, len(arr) - 1, n) 54 | subsel = np.floor(subsel).astype(int) 55 | return arr[subsel] 56 | 57 | 58 | def relocate_legend(fig: Figure, loc: str) -> Figure: 59 | """ 60 | Relocate legend(s) from center to `loc` 61 | """ 62 | remains = [] 63 | targets = [] 64 | for layout in fig.center: 65 | if isinstance(layout, Legend): 66 | targets.append(layout) 67 | else: 68 | remains.append(layout) 69 | fig.center = remains 70 | for layout in targets: 71 | fig.add_layout(layout, loc) 72 | 73 | return fig 74 | 75 | 76 | def cut_long_name(name: str, max_len: int = 12) -> str: 77 | """ 78 | If the name is longer than `max_len`, 79 | cut it to `max_len` length and append "..." 80 | """ 81 | # Bug 136 Fixed 82 | name = str(name) 83 | if len(name) <= max_len: 84 | return name 85 | return f"{name[:max_len]}..." 86 | 87 | 88 | def fuse_missing_perc(name: str, perc: float) -> str: 89 | """ 90 | Append (x.y%) to the name if `perc` is not 0 91 | """ 92 | if perc == 0: 93 | return name 94 | 95 | return f"{name} ({perc:.1%})" 96 | 97 | 98 | def nullity_filter( 99 | df: pd.DataFrame, 100 | filter_type: Optional[str] = None, 101 | p_cut_off: int = 0, 102 | n_cut_off: int = 0, 103 | ) -> pd.DataFrame: 104 | """ 105 | This function is designed to filters a DataFrame according to its nullity, 106 | using some combination of 'top' and 'bottom' numerical 107 | and percentage values. 
108 | Percentages and numerical thresholds can be specified simultaneously. 109 | Parameters 110 | ---------- 111 | df 112 | The DataFrame whose columns are being filtered. 113 | filter_type 114 | The orientation of the filter being applied to the DataFrame. 115 | One of "top", "bottom", or None (default). 116 | The filter will simply return the DataFrame if you leave the filter_type 117 | argument unspecified or as None. 118 | p_cut_off 119 | A completeness ratio cut-off. 120 | If non-zero the filter will limit the DataFrame to columns with at least p_cut_off 121 | completeness. Input should be in the range [0, 1]. 122 | n_cut_off 123 | A numerical cut-off. If non-zero no more than this number of columns will be returned. 124 | Returns 125 | The nullity-filtered `DataFrame`. 126 | Examples 127 | ---------- 128 | To get a DataFrame with columns of at least 75% completeness but with no more than 5 columns 129 | >>> nullity_filter(df, filter_type='top', p_cut_off=.75, n_cut_off=5) 130 | """ 131 | 132 | if filter_type == "top": 133 | if p_cut_off: 134 | df = df.iloc[ 135 | :, [c >= p_cut_off for c in df.count(axis="rows").values / len(df)] 136 | ] 137 | if n_cut_off: 138 | df = df.iloc[ 139 | :, np.sort(np.argsort(df.count(axis="rows").values)[-n_cut_off:]) 140 | ] 141 | elif filter_type == "bottom": 142 | if p_cut_off: 143 | df = df.iloc[ 144 | :, [c <= p_cut_off for c in df.count(axis="rows").values / len(df)] 145 | ] 146 | if n_cut_off: 147 | df = df.iloc[ 148 | :, np.sort(np.argsort(df.count(axis="rows").values)[:n_cut_off]) 149 | ] 150 | return df 151 | 152 | 153 | def nullity_sort( 154 | df: pd.DataFrame, sort: Optional[str] = None, axis: str = "columns" 155 | ) -> pd.DataFrame: 156 | """ 157 | This function sorts a DataFrame according to its nullity, 158 | in either ascending or descending order. 159 | Parameters 160 | ---------- 161 | df 162 | the pandas data_frame object being sorted. 163 | sort 164 | the sorting method: either "ascending", "descending", or None (default). 165 | Returns 166 | the nullity-sorted DataFrame. 167 | """ 168 | if sort is None: 169 | return df 170 | 171 | if axis == "columns": 172 | if sort == "ascending": 173 | return df.iloc[np.argsort(df.count(axis="columns").values), :] 174 | elif sort == "descending": 175 | return df.iloc[np.flipud(np.argsort(df.count(axis="columns").values)), :] 176 | else: 177 | raise ValueError( 178 | 'The "sort" parameter must be set to "ascending" or "descending".' 179 | ) 180 | elif axis == "rows": 181 | if sort == "ascending": 182 | return df.iloc[:, np.argsort(df.count(axis="rows").values)] 183 | elif sort == "descending": 184 | return df.iloc[:, np.flipud(np.argsort(df.count(axis="rows").values))] 185 | else: 186 | raise ValueError( 187 | 'The "sort" parameter must be set to "ascending" or "descending".' 188 | ) 189 | else: 190 | raise ValueError('The "axis" parameter must be set to "rows" or "columns".') 191 | -------------------------------------------------------------------------------- /dataprep/data_connector/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines useful types in this library. 3 | """ 4 | from base64 import b64encode 5 | from enum import Enum 6 | from time import time 7 | from typing import Any, Dict, Optional, cast 8 | from sys import stderr 9 | import requests 10 | from jinja2 import Environment, UndefinedError 11 | 12 | from ..errors import UnreachableError 13 | 14 | 15 | class AuthorizationType(Enum): 16 | """Enum class that defines the supported authorization methods in this library.
17 | 18 | Note 19 | ---- 20 | 21 | * Bearer: requires 'access_token' presented in user params 22 | * OAuth2: requires 'client_id' and 'client_secret' in user params for 23 | 'ClientCredentials' grant type 24 | """ 25 | 26 | Bearer = "Bearer" 27 | OAuth2 = "OAuth2" 28 | 29 | 30 | class Authorization: 31 | """Class carries the authorization type and 32 | the corresponding parameter. 33 | """ 34 | 35 | auth_type: AuthorizationType 36 | params: Dict[str, str] 37 | storage: Dict[str, Any] 38 | 39 | def __init__(self, auth_type: AuthorizationType, params: Dict[str, str]) -> None: 40 | self.auth_type = auth_type 41 | self.params = params 42 | self.storage = {} 43 | 44 | def build(self, req_data: Dict[str, Any], params: Dict[str, Any]) -> None: 45 | """Populate some required fields to the request data. 46 | Complex logic may also happens in this function (e.g. start a server to do OAuth). 47 | """ 48 | if self.auth_type == AuthorizationType.Bearer: # pylint: disable=no-member 49 | req_data["headers"]["Authorization"] = f"Bearer {params['access_token']}" 50 | elif ( 51 | self.auth_type == AuthorizationType.OAuth2 52 | and self.params["grantType"] == "ClientCredentials" 53 | ): 54 | # TODO: Move OAuth to a separate authenticator 55 | if ( 56 | "access_token" not in self.storage 57 | or self.storage.get("expires_at", 0) < time() 58 | ): 59 | # Not yet authorized 60 | ckey = params["client_id"] 61 | csecret = params["client_secret"] 62 | b64cred = b64encode(f"{ckey}:{csecret}".encode("ascii")).decode() 63 | resp = requests.post( 64 | self.params["tokenServerUrl"], 65 | headers={"Authorization": f"Basic {b64cred}"}, 66 | data={"grant_type": "client_credentials"}, 67 | ).json() 68 | 69 | assert resp["token_type"].lower() == "bearer" 70 | access_token = resp["access_token"] 71 | self.storage["access_token"] = access_token 72 | if "expires_in" in resp: 73 | self.storage["expires_at"] = ( 74 | time() + resp["expires_in"] - 60 75 | ) # 60 seconds grace period to avoid clock lag 76 | 77 | req_data["headers"][ 78 | "Authorization" 79 | ] = f"Bearer {self.storage['access_token']}" 80 | 81 | # TODO: handle auto refresh 82 | elif ( 83 | self.auth_type == AuthorizationType.OAuth2 84 | and self.params["grantType"] == "AuthorizationCode" 85 | ): 86 | raise NotImplementedError 87 | 88 | 89 | class Fields: 90 | """A data structure that stores the fields information (e.g. headers, cookies, ...). 91 | This class is useful to populate concrete fields data with required variables provided. 92 | """ 93 | 94 | fields: Dict[str, Any] 95 | 96 | def __init__(self, fields_config: Dict[str, Any]) -> None: 97 | self.fields = fields_config 98 | 99 | def populate( # pylint: disable=too-many-branches 100 | self, jenv: Environment, params: Dict[str, Any] 101 | ) -> Dict[str, str]: 102 | """Populate a dict based on the fields definition and provided vars. 
103 | """ 104 | ret: Dict[str, str] = {} 105 | 106 | for key, def_ in self.fields.items(): 107 | from_key, to_key = key, key 108 | 109 | if isinstance(def_, bool): 110 | required = def_ 111 | value = params.get(from_key) 112 | if value is None and required: 113 | raise KeyError(from_key) 114 | remove_if_empty = False 115 | elif isinstance(def_, str): 116 | # is a template 117 | template: Optional[str] = def_ 118 | tmplt = jenv.from_string(cast(str, template)) 119 | value = tmplt.render(**params) 120 | remove_if_empty = False 121 | elif isinstance(def_, dict): 122 | template = def_.get("template") 123 | remove_if_empty = def_["removeIfEmpty"] 124 | to_key = def_.get("toKey") or to_key 125 | from_key = def_.get("fromKey") or from_key 126 | 127 | if template is None: 128 | required = def_["required"] 129 | value = params.get(from_key) 130 | if value is None and required: 131 | raise KeyError(from_key) 132 | else: 133 | tmplt = jenv.from_string(template) 134 | try: 135 | value = tmplt.render(**params) 136 | except UndefinedError: 137 | value = "" # This empty string will be removed if `remove_if_empty` is True 138 | else: 139 | raise UnreachableError() 140 | 141 | if value is not None: 142 | str_value = str(value) 143 | 144 | if not (remove_if_empty and not str_value): 145 | if to_key in ret: 146 | print(f"Param {key} conflicting with {to_key}", file=stderr) 147 | ret[to_key] = str_value 148 | continue 149 | return ret 150 | 151 | 152 | class Orient(Enum): 153 | """Different types of table orientations 154 | ref: (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html). 155 | Currently, DataConnector supports two different types of orientaions: 156 | 157 | 1. Split, which is column store. 158 | 2. Records, which is row store. 159 | 160 | Details can be found in the pandas page. 161 | """ 162 | 163 | Split = "split" 164 | Records = "records" 165 | -------------------------------------------------------------------------------- /docs/source/_static/images/plot_missing/df_x_cat.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Report 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
38 | 39 | 40 | 41 | 42 | 43 | 46 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /dataprep/eda/dtypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | In this module lives the type tree. 3 | """ 4 | 5 | 6 | from typing import Any, Dict, Optional, Union, Type 7 | 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import dask.dataframe as dd 12 | 13 | from ..errors import UnreachableError 14 | 15 | CATEGORICAL_NUMPY_DTYPES = [np.bool, np.object] 16 | CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype] 17 | CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES 18 | 19 | NUMERICAL_NUMPY_DTYPES = [np.number] 20 | NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES 21 | 22 | DATETIME_NUMPY_DTYPES = [np.datetime64] 23 | DATETIME_PANDAS_DTYPES = [pd.DatetimeTZDtype] 24 | DATETIME_DTYPES = DATETIME_NUMPY_DTYPES + DATETIME_PANDAS_DTYPES 25 | 26 | 27 | class DType: 28 | """ 29 | Root of Type Tree 30 | """ 31 | 32 | 33 | ############## Syntactic DTypes ############## 34 | class Categorical(DType): 35 | """ 36 | Type Categorical 37 | """ 38 | 39 | 40 | class Nominal(Categorical): 41 | """ 42 | Type Nominal, Subtype of Categorical 43 | """ 44 | 45 | 46 | class Ordinal(Categorical): 47 | """ 48 | Type Ordinal, Subtype of Categorical 49 | """ 50 | 51 | 52 | class Numerical(DType): 53 | """ 54 | Type Numerical 55 | """ 56 | 57 | 58 | class Continuous(Numerical): 59 | """ 60 | Type Continuous, Subtype of Numerical 61 | """ 62 | 63 | 64 | class Discrete(Numerical): 65 | """ 66 | Type Discrete, Subtype of Numerical 67 | """ 68 | 69 | 70 | ############## Semantic DTypes ############## 71 | 72 | 73 | class DateTime(Numerical): 74 | """ 75 | Type DateTime, Subtype of Numerical 76 | """ 77 | 78 | 79 | class Text(Nominal): 80 | """ 81 | Type Text, Subtype of Nominal 82 | """ 83 | 84 | 85 | ############## End of the Type Tree ############## 86 | 87 | DTypeOrStr = Union[DType, Type[DType], str, None] 88 | DTypeDict = Union[Dict[str, Union[DType, Type[DType], str]], None] 89 | DTypeDef = Union[Dict[str, Union[DType, Type[DType], str]], DType, Type[DType], None] 90 | 91 | 92 | def detect_dtype(col: dd.Series, known_dtype: Optional[DTypeDef] = None,) -> DType: 93 | """ 94 | Given a column, detect its type or transform its type according to users' specification 95 | 96 | Parameters 97 | ---------- 98 | col: dask.datafram.Series 99 | A dataframe column 100 | known_dtype: Optional[Union[Dict[str, Union[DType, str]], DType]], default None 101 | A dictionary or single DType given by users to specify the types for designated columns or 102 | all columns. E.g. known_dtype = {"a": Continuous, "b": "Nominal"} or 103 | known_dtype = {"a": Continuous(), "b": "nominal"} or 104 | known_dtype = Continuous() or known_dtype = "Continuous" or known_dtype = Continuous() 105 | """ 106 | if not known_dtype: 107 | return detect_without_known(col) 108 | 109 | if isinstance(known_dtype, dict): 110 | if col.name in known_dtype: 111 | dtype = normalize_dtype(known_dtype[col.name]) 112 | return map_dtype(dtype) 113 | 114 | elif isinstance(normalize_dtype(known_dtype), DType): 115 | return map_dtype(normalize_dtype(known_dtype)) 116 | 117 | return detect_without_known(col) 118 | 119 | 120 | def map_dtype(dtype: DType) -> DType: 121 | """ 122 | Currently, we want to keep our Type System flattened. 
123 | We will map Categorical() to Nominal() and Numerical() to Continuous() 124 | """ 125 | if ( 126 | isinstance(dtype, Categorical) is True 127 | and isinstance(dtype, Ordinal) is False 128 | and isinstance(dtype, Nominal) is False 129 | ): 130 | return Nominal() 131 | elif ( 132 | isinstance(dtype, Numerical) is True 133 | and isinstance(dtype, Continuous) is False 134 | and isinstance(dtype, Discrete) is False 135 | ): 136 | return Continuous() 137 | else: 138 | return dtype 139 | 140 | 141 | def detect_without_known(col: dd.Series) -> DType: 142 | """ 143 | This function detects dtypes of column when users didn't specify. 144 | """ 145 | if is_nominal(col.dtype): 146 | return Nominal() 147 | 148 | elif is_continuous(col.dtype): 149 | return Continuous() 150 | 151 | elif is_datetime(col.dtype): 152 | return DateTime() 153 | else: 154 | raise UnreachableError 155 | 156 | 157 | def is_dtype(dtype1: DType, dtype2: DType) -> bool: 158 | """ 159 | This function detects if dtype2 is dtype1. 160 | """ 161 | return isinstance(dtype1, dtype2.__class__) 162 | 163 | 164 | def normalize_dtype(dtype_repr: Any) -> DType: 165 | """ 166 | This function normalizes a dtype repr. 167 | """ 168 | normalized: DType 169 | str_dic = { 170 | "Categorical": Categorical, 171 | "Ordinal": Ordinal, 172 | "Nominal": Nominal, 173 | "Numerical": Numerical, 174 | "Continuous": Continuous, 175 | "Discrete": Discrete, 176 | "DateTime": DateTime, 177 | "Text": Text, 178 | } 179 | for str_dtype, dtype in str_dic.items(): 180 | if isinstance(dtype_repr, str): 181 | if dtype_repr.lower() == str_dtype.lower(): 182 | normalized = dtype() 183 | break 184 | 185 | elif isinstance(dtype_repr, dtype): 186 | normalized = dtype_repr 187 | break 188 | 189 | elif dtype_repr == dtype: 190 | normalized = dtype() 191 | break 192 | 193 | return normalized 194 | 195 | 196 | def is_nominal(dtype: Any) -> bool: 197 | """ 198 | Given a type, return if that type is a nominal type 199 | """ 200 | 201 | if is_continuous(dtype) or is_datetime(dtype): 202 | return False 203 | 204 | if isinstance(dtype, np.dtype): 205 | dtype = dtype.type 206 | 207 | return any(issubclass(dtype, c) for c in CATEGORICAL_NUMPY_DTYPES) 208 | else: 209 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 210 | 211 | 212 | def is_continuous(dtype: Any) -> bool: 213 | """ 214 | Given a type, return if that type is a continuous type 215 | """ 216 | dtype = dtype.type 217 | return any(issubclass(dtype, c) for c in NUMERICAL_NUMPY_DTYPES) 218 | 219 | 220 | def is_datetime(dtype: Any) -> bool: 221 | """ 222 | Given a type, return if that type is a datetime type 223 | """ 224 | if isinstance(dtype, np.dtype): 225 | dtype = dtype.type 226 | return any(issubclass(dtype, c) for c in DATETIME_NUMPY_DTYPES) 227 | else: 228 | return any(isinstance(dtype, c) for c in DATETIME_PANDAS_DTYPES) 229 | 230 | 231 | def is_pandas_categorical(dtype: Any) -> bool: 232 | """ 233 | Detect if a dtype is categorical and from pandas. 234 | """ 235 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 236 | -------------------------------------------------------------------------------- /examples/DataConnector_Yelp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Connector for Yelp \n", 8 | "\n", 9 | "In this example, we will be going over how to use Data Connector with Yelp." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Preprocessing\n", 17 | "\n", 18 | "data_connector is a component in the dataprep library that aims to simplify data access by providing a standard set of APIs. The goal is to help users skip complex API configuration. In this tutorial, we demonstrate how to use the data_connector library with Yelp.\n", 19 | "\n", 20 | "If you haven't installed dataprep, run the command `pip install dataprep` or execute the following cell." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# Run me if you'd like to install\n", 30 | "!pip install dataprep" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Obtaining an access token from Yelp\n", 38 | "\n", 39 | "To connect to Yelp, you need to generate a token. This token is a unique identifier of an application requesting access to Yelp's API. Once an application creates the token, it will act as your credential when making an API request. \n", 40 | "\n", 41 | "To receive an access token, the user needs to create a server-side application from Yelp. You can get a token by following the [Yelp documentation](https://www.yelp.com/developers/documentation/v3/authentication).\n", 42 | "Simply create an application and generate a key." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Store the token or API Key in a secure location as it will be used to provide you access to Yelp's restaurant data." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# Download and store the configuration files in dataprep. \n", 57 | "\n", 58 | "The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs) or automatically downloaded at usage. \n", 59 | "\n", 60 | "Store the configuration files in the dataprep folder. " 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Initialize data_connector\n", 68 | "\n", 69 | "To initialize, run the following code. Copy and paste the Yelp API key into the **access_token** variable and ensure the connector path is correct. Once you have that running, you can use the built-in functions available in the connector." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from dataprep.data_connector import Connector\n", 79 | "access_token = \"insert_token_key\"\n", 80 | "dc = Connector(\"./DataConnectorConfigs/yelp\", auth_params={\"access_token\":access_token})" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Functionalities\n", 88 | "\n", 89 | "Data connector has several functions you can perform to gain insight into the data downloaded from Yelp." 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Connector.info\n", 97 | "The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples.\n", 98 | ">1. Table - The table(s) being accessed.\n", 99 | ">2. Parameters - Identifies which parameters can be used to call the method. 
For Yelp, the required parameters are **term** and **location**. \n", 100 | ">3. Examples - Shows how you can call the methods in the Connector class." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "dc.info()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Connector.show_schema\n", 117 | "The show_schema method returns the schema of the website data as a Dataframe. There are two columns in the response: the first column is the column name and the second is the datatype.\n", 118 | "\n", 119 | "As an example, let's see what is in the business table." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "dc.show_schema(\"business\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Connector.query\n", 136 | "The query method downloads the website data and displays it in a Dataframe. The parameters must meet the requirements as indicated in connector.info for the operation to run.\n", 137 | "\n", 138 | "When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations.\n", 139 | "\n", 140 | "As an example, let's try to get the data from the \"business\" table, providing the term \"city\" and location \"seattle\"." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "df = dc.query(\"businesses\", term=\"city\", location=\"seattle\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "From the query results, you can see how easy it is to download the restaurant data from Yelp into a pandas Dataframe. \n", 157 | "\n", 158 | "Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code.\n", 159 | "\n", 160 | ">1. dc = Connector(...)\n", 161 | ">2. dc.query(...)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "# That's all for now. \n", 169 | "If you are interested in writing your own configuration file or modifying an existing one, refer to the [Configuration Files](https://github.com/sfu-db/DataConnectorConfigs)." 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 3", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.7.7" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /dataprep/eda/basic/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the plot(df) function.
3 | """ 4 | 5 | from typing import Optional, Tuple, Union, Dict 6 | 7 | import dask.dataframe as dd 8 | import pandas as pd 9 | from bokeh.io import show 10 | 11 | from .compute import compute 12 | from .render import render 13 | from ..report import Report 14 | from ..dtypes import DTypeDef 15 | 16 | __all__ = ["plot", "compute", "render"] 17 | 18 | 19 | def plot( 20 | df: Union[pd.DataFrame, dd.DataFrame], 21 | x: Optional[str] = None, 22 | y: Optional[str] = None, 23 | z: Optional[str] = None, 24 | *, 25 | bins: int = 10, 26 | ngroups: int = 10, 27 | largest: bool = True, 28 | nsubgroups: int = 5, 29 | timeunit: str = "auto", 30 | agg: str = "mean", 31 | sample_size: int = 1000, 32 | value_range: Optional[Tuple[float, float]] = None, 33 | yscale: str = "linear", 34 | tile_size: Optional[float] = None, 35 | dtype: Optional[DTypeDef] = None, 36 | top_words: Optional[int] = 30, 37 | stopword: Optional[bool] = True, 38 | lemmatize: Optional[bool] = False, 39 | stem: Optional[bool] = False, 40 | ) -> Report: 41 | """Generates plots for exploratory data analysis. 42 | 43 | If no columns are specified, the distribution of 44 | each coloumn is plotted. A histogram is plotted if the 45 | column contains numerical values, a bar chart is plotted 46 | if the column contains categorical values, a line chart is 47 | plotted if the column is of type datetime. 48 | 49 | If one column (x) is specified, the 50 | distribution of x is plotted in various ways. If x 51 | contains categorical values, a bar chart and pie chart are 52 | plotted. If x contains numerical values, a histogram, 53 | kernel density estimate plot, box plot, and qq plot are plotted. 54 | If x contains datetime values, a line chart is plotted. 55 | 56 | If two columns (x and y) are specified, plots depicting 57 | the relationship between the variables will be displayed. If 58 | x and y contain numerical values, a scatter plot, hexbin 59 | plot, and binned box plot are plotted. If one of x and y 60 | contain categorical values and the other contains numerical values, 61 | a box plot and multiline histogram are plotted. If x and y 62 | contain categorical vales, a nested bar chart, stacked bar chart, and 63 | heat map are plotted. If one of x and y contains datetime values 64 | and the other contains numerical values, a line chart and a box plot 65 | are shown. If one of x and y contains datetime values and the other 66 | contains categorical values, a multiline chart and a stacked box plot 67 | are shown. 68 | 69 | If x, y, and z are specified, they must be one each of type datetime, 70 | numerical, and categorical. A multiline chart containing an aggregate 71 | on the numerical column grouped by the categorical column over time is 72 | plotted. 73 | 74 | 75 | Parameters 76 | ---------- 77 | df 78 | Dataframe from which plots are to be generated 79 | x: Optional[str], default None 80 | A valid column name from the dataframe 81 | y: Optional[str], default None 82 | A valid column name from the dataframe 83 | z: Optional[str], default None 84 | A valid column name from the dataframe 85 | bins: int, default 10 86 | For a histogram or box plot with numerical x axis, it defines 87 | the number of equal-width bins to use when grouping. 88 | ngroups: int, default 10 89 | When grouping over a categorical column, it defines the 90 | number of groups to show in the plot. Ie, the number of 91 | bars to show in a bar chart. 
92 | largest: bool, default True 93 | If true, when grouping over a categorical column, the groups 94 | with the largest count will be output. If false, the groups 95 | with the smallest count will be output. 96 | nsubgroups: int, default 5 97 | If x and y are categorical columns, ngroups refers to 98 | how many groups to show from column x, and nsubgroups refers to 99 | how many subgroups to show from column y in each group in column x. 100 | timeunit: str, default "auto" 101 | Defines the time unit to group values over for a datetime column. 102 | It can be "year", "quarter", "month", "week", "day", "hour", 103 | "minute", or "second". With default value "auto", it will use the 104 | time unit such that the resulting number of groups is closest to 15. 105 | agg: str, default "mean" 106 | Specify the aggregate to use when aggregating over a numerical 107 | column 108 | sample_size: int, default 1000 109 | Sample size for the scatter plot 110 | value_range: Optional[Tuple[float, float]], default None 111 | The lower and upper bounds on the range of a numerical column. 112 | Applies when column x is specified and column y is unspecified. 113 | yscale 114 | The scale to show on the y axis. Can be "linear" or "log". 115 | tile_size: Optional[float], default None 116 | Size of the tile for the hexbin plot. Measured from the middle 117 | of a hexagon to its left or right corner. 118 | dtype: str or DType or dict of str or dict of DType, default None 119 | Specify Data Types for designated column or all columns. 120 | E.g. dtype = {"a": Continuous, "b": "Nominal"} or 121 | dtype = {"a": Continuous(), "b": "nominal"} 122 | or dtype = Continuous() or dtype = "Continuous" 123 | top_words: int, default 30 124 | Specify the number of words to show in the wordcloud and 125 | word frequency bar chart 126 | stopword: bool, default True 127 | Eliminate the stopwords in the text data for plotting wordcloud and 128 | word frequency bar chart 129 | lemmatize: bool, default False 130 | Lemmatize the words in the text data for plotting wordcloud and 131 | word frequency bar chart 132 | stem: bool, default False 133 | Apply Porter stemming to the text data for plotting wordcloud and 134 | word frequency bar chart 135 | Examples 136 | -------- 137 | >>> import pandas as pd 138 | >>> from dataprep.eda import * 139 | >>> iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 140 | >>> plot(iris) 141 | >>> plot(iris, "petal_length", bins=20, value_range=(1,5)) 142 | >>> plot(iris, "petal_width", "species") 143 | """ 144 | # pylint: disable=too-many-locals,line-too-long 145 | 146 | intermediate = compute( 147 | df, 148 | x=x, 149 | y=y, 150 | z=z, 151 | bins=bins, 152 | ngroups=ngroups, 153 | largest=largest, 154 | nsubgroups=nsubgroups, 155 | timeunit=timeunit.lower(), 156 | agg=agg, 157 | sample_size=sample_size, 158 | value_range=value_range, 159 | dtype=dtype, 160 | top_words=top_words, 161 | stopword=stopword, 162 | lemmatize=lemmatize, 163 | stem=stem, 164 | ) 165 | figure = render(intermediate, yscale=yscale, tile_size=tile_size) 166 | 167 | return Report(figure) 168 | -------------------------------------------------------------------------------- /examples/EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "If you haven't installed dataprep, run the command `pip install dataprep` or execute the following cell" 8 | ] 9
| }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Run me if you'd like to install\n", 17 | "!pip install dataprep" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from dataprep.eda import plot, plot_correlation, plot_missing" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Load data" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv(\"https://s3-us-west-2.amazonaws.com/dataprep.dsl/datasets/suicide-rate.csv\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Plot the distribution of each column in the dataframe. \n", 51 | "For a numeric column, show a histogram. For a categorical column, show a bar chart." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df[\"year\"] = df[\"year\"].astype(\"category\")\n", 61 | "plot(df)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# Show the plots of the given column. If the column is numeric, show a kernel density plot, box plot and qqnorm plot.\n", 69 | "If the column is categorical, show a bar plot and pie plot." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "plot(df, \"sex\")\n", 79 | "plot(df, \"gdp_per_capita\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Show the plots of the relationship of the given two columns. \n", 87 | "* For numeric-categorical, show the box plot for each category.\n", 88 | "* For numeric-numeric, show the heatmap\n", 89 | "* For categorical-categorical, show the bar chart of col_x for each category of col_y" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "plot(df, \"suicides\", \"sex\")\n", 99 | "plot(df, \"population\", \"suicides\")\n", 100 | "plot(df, \"country\", \"generation\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Show correlation matrix plots using each method (pearson, kendall, spearman)\n", 108 | "If k is specified, in each matrix plot, only show the top-k positive cells and set the color of other cells to white. (Do you want to know the top-k negative cells?)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df_without_missing = df.dropna(axis='columns')\n", 118 | "plot_correlation(df_without_missing)\n", 119 | "plot_correlation(df_without_missing, k=1)\n", 120 | "plot_correlation(df_without_missing, value_range=(0,1))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Show the 3 cols that correspond to x in the correlation matrix (pearson, kendall, spearman)\n", 128 | "if k is specified, sort the result based on correlation and 
show the 3 cols that correspond to the top-k correlation values" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "plot_correlation(df_without_missing, \"suicides\")\n", 138 | "plot_correlation(df_without_missing, \"suicides\", k=2)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# if value_range is specified, show the correlation values in value_range." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "plot_correlation(df_without_missing, \"suicides\", value_range=[-1, 0.3])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "# if no correlation is in the range, show a blank figure." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "plot_correlation(df_without_missing, \"suicides\", value_range=[-1, -0.8])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "plot_correlation(df_without_missing, x=\"population\", y=\"suicides_no\")\n", 180 | "plot_correlation(df_without_missing, x=\"population\", y=\"suicides\", k=5)\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Show the location/position and percentage of missing data" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "plot_missing(df, num_bins=100)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "## If one wants to remove the rows whose x is missing, \n", 204 | "show the impact of the removed rows on the other columns. " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "plot_missing(df, 'HDI_for_year')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## If one wants to remove the rows whose x is missing, show the impact of the removed rows on column y. 
" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "plot_missing(df, 'HDI_for_year', 'population')\n", 230 | "plot_missing(df, 'HDI_for_year', 'sex')\n", 231 | "plot_missing(df, 'HDI_for_year', \"country\")" 232 | ] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.7.5" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 4 256 | } 257 | -------------------------------------------------------------------------------- /docs/source/DC_DBLP_tut.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================================== 3 | Tutorial - Data Connector for DBLP 4 | ================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | data_connector is a component in the dataprep library that aims to simplify the data access by providing a standard API set. 13 | The goal is to help the users skip the complex API configuration. In this tutorial, we demonstrate how to use data_connector library with DBLP. 14 | 15 | Preprocessing 16 | ================ 17 | If you haven't installed dataprep, run command pip install dataprep or execute the following cell. 18 | 19 | :: 20 | 21 | !pip install dataprep 22 | 23 | 24 | Download and store the configuration files in dataprep 25 | ================================================================ 26 | The configuration files are used to construct the parameters and initial setup for the API. The available configuration files can be manually downloaded here: `Configuration Files 27 | `_ or automatically downloaded at usage. 28 | 29 | 30 | 31 | To automatically download at usage, click on the clipboard button, unsure you are cloning with HTTPS. Go into your terminal, and find an appropriate locate to store the configuration files. 32 | When you decided on a location, enter the command ``git clone https://github.com/sfu-db/DataConnectorConfigs.git``. This will clone the git repository to the desired location; as a suggestion store it with the dataprep folder. 33 | 34 | 35 | From here you can proceed with the next steps. 36 | 37 | .. image:: _static/images/tutorial/dc_git.png 38 | :align: center 39 | :width: 1000 40 | :height: 500 41 | 42 | 43 | .. image:: _static/images/tutorial/dc_git_clone.png 44 | :align: center 45 | :width: 725 46 | :height: 125 47 | 48 | 49 | Below the configuration file are stored with dataprep. 50 | 51 | .. image:: _static/images/tutorial/Config_destination.png 52 | :align: center 53 | :width: 586 54 | :height: 132 55 | 56 | 57 | 58 | Initialize data_connector 59 | ============================= 60 | To initialize, run the following code. 61 | 62 | :: 63 | 64 | from dataprep.data_connector import Connector 65 | dc = Connector("./DataConnectorConfigs/DBLP") 66 | 67 | Functionalities 68 | =================== 69 | Data connector has several functions you can perform to gain insight on the data downloaded from DBLP. 
70 | 71 | Connector.info 72 | ------------------ 73 | | The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples. 74 | | 75 | | a. Table - The table(s) being accessed. 76 | | b. Parameters - Identifies which parameters can be used to call the method. For DBLP, there is no required **parameter**. 77 | | c. Examples - Shows how you can call the methods in the Connector class. 78 | 79 | 80 | :: 81 | 82 | dc.info() 83 | 84 | .. image:: _static/images/tutorial/dc_dblp_info.png 85 | :align: center 86 | :width: 300 87 | :height: 200 88 | 89 | Parameters 90 | ********************** 91 | | A parameter is a piece of information you supply to a query right as you run it. The parameters for DBLP are **q**, **h**, and **f**, and they are described below. 92 | | 93 | | a. **q** - Optional - The query string used to search for author profiles, conferences, journals, or individual publications in the database. 94 | | b. **h** - Optional - Maximum number of search results (hits) to return. 95 | | c. **f** - Optional - The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, this parameter can be used for pagination of search results. 96 | 97 | There are additional parameters to query with DBLP. If you are interested in reading up on the other available parameters and setting up your own config files, please read this `DBLP link 98 | `_ and this `Configuration Files link 99 | `_. 100 | 101 | 102 | Connector.show_schema 103 | -------------------------- 104 | The show_schema method returns the schema of the website data in a Dataframe format. There are two columns in the response: the first is the column name and the second is the datatype. 105 | As an example, let's see what is in the publication table. 106 | 107 | :: 108 | 109 | dc.show_schema("publication") 110 | 111 | .. image:: _static/images/tutorial/dc_dblp_show_schema.png 112 | :align: center 113 | :width: 212 114 | :height: 295 115 | 116 | Connector.query 117 | ------------------ 118 | The query method downloads the website data. The parameters must meet the requirements as indicated in connector.info for the operation to run. 119 | 120 | When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations. 121 | 122 | As an example, let's try to get the data from the "publication" table, providing the search query "lee". 123 | 124 | :: 125 | 126 | dc.query("publication", q="lee") 127 | 128 | .. image:: _static/images/tutorial/dc_dblp_query.png 129 | :align: center 130 | :width: 1000 131 | :height: 500 132 | 133 | From the query results, you can see how easy it is to download the publication data from DBLP into a pandas Dataframe. 134 | Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code. 135 | 136 | :: 137 | 138 | dc = Connector(...) 139 | dc.query(...) 140 | 141 | Pagination 142 | =================== 143 | | Another feature available in the config files is pagination. Pagination is the process of dividing results into discrete pages and allowing visitors to switch between them. It lets you control the total number of records a query returns. 144 | | 145 | | To use pagination, you need to include **_count** in your query. 
The **_count** parameter represents the number of records a user would like to return, which can be larger than the maximum number of records the API itself returns per call. Users can still fetch multiple pages of records by using parameters like limit and offset; however, this requires users to understand how pagination works across different websites' APIs. 146 | | 147 | 148 | :: 149 | 150 | dc.query("publication", q = "lee", _count = 200) 151 | 152 | .. image:: _static/images/tutorial/dc_dblp_pagination.png 153 | :align: center 154 | :width: 1000 155 | :height: 500 156 | 157 | Pagination does not work concurrently with the **h** parameter in a query; you need to select either **h** or **_count**. 158 | 159 | All publications of one specific author 160 | ========================================================= 161 | | In the query, **q** is a generic search parameter that finds author profiles, conferences, journals, or individual publications in the database. As a parameter, **q** is not great when trying to find specific authors and their work. To solve this issue, you can query by the author's first and last name. 162 | | 163 | | To fetch all publications of one specific author, you need to include **first_name="______"**, **last_name="______"** in your query. 164 | 165 | :: 166 | 167 | dc.query("publication", first_name = "Jeff", last_name = "Hawkins") 168 | 169 | .. image:: _static/images/tutorial/dc_dblp_author.png 170 | :align: center 171 | :width: 1000 172 | :height: 500 173 | 174 | That's all for now. 175 | =================== 176 | Please visit the other tutorials that are available if you are interested in setting up a different data connector. 177 | If you are interested in writing your own configuration file or modifying an existing one, refer to the `Configuration Files 178 | `_. -------------------------------------------------------------------------------- /docs/source/eda/plot_correlation.rst: -------------------------------------------------------------------------------- 1 | ============================================================= 2 | `plot_correlation`: analyzing the correlation between columns 3 | ============================================================= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 9 | Overview 10 | ======== 11 | 12 | The goal of `plot_correlation` is to analyze the correlation between columns. It provides the following functionalities: 13 | 14 | 1. `plot_correlation(df)`: plot the correlation matrix of all columns. 15 | 2. `plot_correlation(df, x)`: plot the most correlated columns to column x. 16 | 3. `plot_correlation(df, x, y)`: plot the scatter plot between column x and column y, as well as the regression line. In addition, the points that have the most impact on the correlation value can be identified by passing a parameter. 17 | 4. `plot_correlation(df, x, y, k, value_range)`: filter the result by correlation value or by top-k. 18 | 19 | .. 20 | The following table summarizes the output plots for different settings of x and y. 
21 | 22 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 23 | | | **plot_correlation(df, x, y)** | | 24 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 25 | | **x** | **y** | **output plots** | 26 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 27 | | None | None | n*n correlation matrix for Pearson, Spearman and KendallTau correlation, where n is min(50, len(df.columns)) | 28 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 29 | | Numerical | None | n*1 correlation matrix for Pearson, Spearman and KendallTau correlation | 30 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 31 | | Categorical | None | TODO | 32 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 33 | | Numerical | Numerical | `scatter plot `_ with regression line | 34 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 35 | | Numerical | Categorical | TODO | 36 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 37 | | Categorical | Numerical | TODO | 38 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 39 | | Categorical | Categorical | TODO | 40 | +-------------+-------------------------------+--------------------------------------------------------------------------------------------------------+ 41 | 42 | In the following, we use several examples to demonstrate the functionalities. 43 | 44 | 45 | Loading dataset 46 | =============== 47 | We support two types of dataframe: pandas dataframe and dask dataframe. Here we load the well-known `wine quality` dataset into a pandas dataframe and use it to demonstrate our functionality:: 48 | 49 | import pandas as pd 50 | df = pd.read_csv("https://www.openml.org/data/get_csv/4965268/wine-quality-red.arff") 51 | 52 | 53 | Plotting correlation matrix via `plot_correlation(df)` 54 | ====================================================== 55 | 56 | After getting a dataset, we can plot the correlation matrix of all columns by calling `plot_correlation(df)`. We will compute three types of correlations (`Pearson `_, `Spearman `_ and `KendallTau `_) and generate a correlation matrix for each of them. In the matrix, each cell represents the correlation value of two columns. The following shows an example:: 57 | 58 | from dataprep.eda import plot_correlation 59 | plot_correlation(df) 60 | 61 | 62 | .. raw:: html 63 | 64 | 65 | 66 | 67 | Finding the most correlated columns via `plot_correlation(df, x)` 68 | ================================================================= 69 | 70 | After getting the correlation matrix, the user may zoom into a column and explore how other columns correlate with it. To achieve this goal, we provide `plot_correlation(df, x)`. 
It computes the correlations (`Pearson `_, `Spearman `_ and `KendallTau `_) of the column of interest x with all other columns and sorts them based on the correlation values. This way, the user can see which columns are most correlated or uncorrelated with column x. The following shows an example:: 71 | 72 | plot_correlation(df, "alcohol") 73 | 74 | .. raw:: html 75 | 76 | 77 | 78 | 79 | Exploring the correlation between two columns via `plot_correlation(df, x, y)` 80 | =============================================================================== 81 | 82 | Furthermore, we provide `plot_correlation(df, x, y)` to allow the user to analyze the correlation between two columns. It plots a scatter plot of columns x and y, along with a regression line. The following shows an example:: 83 | 84 | plot_correlation(df, "alcohol", "pH") 85 | 86 | .. raw:: html 87 | 88 | 89 | 90 | 91 | In addition, when the user passes the parameter k, it identifies the k points that have the largest impact on the correlation value. Impact means that after removing the k points, the correlation value will increase the most (positive influence) or decrease the most (negative influence). The following shows an example:: 92 | 93 | plot_correlation(df, "alcohol", "pH", k = 2) 94 | 95 | .. raw:: html 96 | 97 | 98 | 99 | 100 | Filtering the result by top-k and value range filter 101 | ==================================================== 102 | 103 | We provide two types of filters to filter the result: top-k and value range. They can be applied to `plot_correlation(df)` and `plot_correlation(df, x)` by passing the parameters `k` and `value_range`. After applying the top-k filter, only the top-k correlation values will be shown. With the value range filter, only the correlation values in a given range will be shown. 104 | 105 | The following shows an example of applying the top-k filter in `plot_correlation(df)`:: 106 | 107 | plot_correlation(df, k = 3) 108 | 109 | .. raw:: html 110 | 111 | 112 | 113 | 114 | The following shows an example of applying the value range filter in `plot_correlation(df, x)`:: 115 | 116 | plot_correlation(df, "alcohol", value_range=[0.1, 1]) 117 | 118 | .. raw:: html 119 | 120 | 121 | -------------------------------------------------------------------------------- /docs/source/DC_Yelp_tut.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================================== 3 | Tutorial - Data Connector for Yelp 4 | ================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | data_connector is a component in the dataprep library that aims to simplify data access by providing a standard set of APIs. 13 | The goal is to help users skip complex API configuration. In this tutorial, we demonstrate how to use the 14 | data_connector library with Yelp. 15 | 16 | 17 | Preprocessing 18 | ================ 19 | If you haven't installed dataprep, run the command ``pip install dataprep`` or execute the following cell. 20 | 21 | :: 22 | 23 | !pip install dataprep 24 | 25 | Obtaining an access token from Yelp 26 | ============================================= 27 | To connect to Yelp, you need to generate a token. This token is a unique identifier of an application requesting access to 28 | Yelp's API. Once an application creates the token, it will act as your credential when making an API request.
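Because the token acts as your credential, it is best kept out of your source code. Below is a minimal sketch of one way to do this; the environment variable name ``YELP_API_KEY`` is just an example for this sketch, not part of dataprep.

::

    import os

    # Read the API key from an environment variable instead of hard-coding it.
    access_token = os.environ["YELP_API_KEY"]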
29 | 30 | To receive an access token, the user needs to create a server-side application from Yelp. This can be done by 31 | visiting the `Yelp API documentation 32 | `_, entering some information about its use, and generating a key. 33 | 34 | .. image:: _static/images/tutorial/Yelp_authentication.png 35 | :align: center 36 | :width: 700 37 | :height: 500 38 | 39 | Store the token or API Key in a secure location as it will be used to provide you access to Yelp's restaurant data. 40 | 41 | .. image:: _static/images/tutorial/Yelp_API_Key.png 42 | :align: center 43 | :width: 700 44 | :height: 400 45 | 46 | Download and store the configuration files in dataprep 47 | ================================================================ 48 | The configuration files are used to configure the parameters and initial setup for the API. The available configuration files can be manually downloaded here: `Configuration Files 49 | `_ or automatically downloaded at usage. 50 | 51 | 52 | 53 | To download the configuration files manually, click on the clipboard button, ensuring you are cloning with HTTPS. Go into your terminal and find an appropriate location to store the configuration files. 54 | Once you have decided on a location, enter the command ``git clone https://github.com/sfu-db/DataConnectorConfigs.git``. This will clone the git repository to the desired location; as a suggestion, store it with the dataprep folder. 55 | 56 | 57 | From here you can proceed with the next steps. 58 | 59 | .. image:: _static/images/tutorial/dc_git.png 60 | :align: center 61 | :width: 1000 62 | :height: 500 63 | 64 | 65 | .. image:: _static/images/tutorial/dc_git_clone.png 66 | :align: center 67 | :width: 725 68 | :height: 125 69 | 70 | 71 | Below, the configuration files are stored with dataprep. 72 | 73 | .. image:: _static/images/tutorial/Config_destination.png 74 | :align: center 75 | :width: 586 76 | :height: 132 77 | 78 | 79 | 80 | Initialize data_connector 81 | ============================= 82 | To initialize, run the following code. Copy and paste the Yelp API key into the **access_token** variable and ensure the connector path is correct. Once you have that running, you can use the built-in functions available in the connector. 83 | 84 | :: 85 | 86 | from dataprep.data_connector import Connector 87 | access_token = "insert_token_key" 88 | dc = Connector("./DataConnectorConfigs/yelp", _auth={"access_token":access_token}) 89 | 90 | Functionalities 91 | =================== 92 | Data connector has several functions you can perform to gain insight into the data downloaded from Yelp. 93 | 94 | Connector.info 95 | ------------------ 96 | | The info method gives information and guidelines on using the connector. There are three sections in the response: table, parameters, and examples. 97 | | 98 | | a. Table - The table(s) being accessed. 99 | | b. Parameters - Identifies which parameters can be used to call the method. 100 | | c. Examples - Shows how you can call the methods in the Connector class. 101 | 102 | :: 103 | 104 | dc.info() 105 | 106 | .. image:: _static/images/tutorial/dc_show.png 107 | :align: center 108 | :width: 400 109 | :height: 165 110 | 111 | Parameters 112 | ********************** 113 | | A parameter is a piece of information you supply to a query right as you run it. The parameters for Yelp's business query can either be required or optional. The required parameters are **term** and **location**, while the optional parameters are **latitude**, **longitude** and **limit**. The parameters are described below. 
114 | | 115 | | a. **term** - Required - Search term, for example "food" or "restaurants". The term may also be business names, such as "Starbucks". 116 | | b. **location** - Required - The geographic area to search for businesses, for example "Seattle". 117 | | c. **latitude** - Optional - Latitude of the location you want to search nearby. 118 | | d. **longitude** - Optional - Longitude of the location you want to search nearby. 119 | | e. **limit** - Optional - Number of business results to return. By default, it will return 20. Maximum is 50. 120 | 121 | There are additional parameters to query with Yelp. If you are interested in reading up on the other available parameters and setting up your own config files, please read this `Yelp link 122 | `_ and this `Configuration Files link 123 | `_. 124 | 125 | Connector.show_schema 126 | ----------------------------- 127 | The show_schema method returns the schema of the website data in a Dataframe format. There are two columns in the response: the first is the column name and the second is the datatype. 128 | As an example, let's see what is in the business table. 129 | 130 | :: 131 | 132 | dc.show_schema("business") 133 | 134 | .. image:: _static/images/tutorial/dc_schema.png 135 | :align: center 136 | :width: 202 137 | :height: 404 138 | 139 | Connector.query 140 | ------------------ 141 | The query method downloads the website data. The parameters must meet the requirements as indicated in connector.info for the operation to run. 142 | 143 | When the data is received from the server, it will either be in a JSON or XML format. The data_connector reformats the data into a pandas Dataframe for the convenience of downstream operations. 144 | 145 | As an example, let's try to get the data from the "business" table, providing the term "city" and location "seattle". 146 | 147 | :: 148 | 149 | dc.query("businesses", term = "city", location = "seattle", limit = 10) 150 | 151 | .. image:: _static/images/tutorial/dc_yelp_query.png 152 | :align: center 153 | :width: 1000 154 | :height: 460 155 | 156 | From the query results, you can see how easy it is to download the restaurant data from Yelp into a pandas Dataframe. 157 | Now that you have an understanding of how data connector operates, you can easily accomplish the task with two lines of code. 158 | 159 | :: 160 | 161 | dc = Connector(...) 162 | dc.query(...) 163 | 164 | Pagination 165 | =================== 166 | | Another feature available in the config files is pagination. Pagination is the process of dividing results into discrete pages and allowing visitors to switch between them. It lets you control the total number of records a query returns. 167 | | 168 | | To use pagination, you need to include **_count** in your query. The **_count** parameter represents the number of records a user would like to return, which can be larger than the maximum number of records the API itself returns per call. Users can still fetch multiple pages of records by using parameters like limit and offset; however, this requires users to understand how pagination works across different websites' APIs. 169 | | 170 | 171 | :: 172 | 173 | dc.query("business", term = "city", location = "seattle", _count = 200) 174 | 175 | .. image:: _static/images/tutorial/dc_yelp_query_pag.png 176 | :align: center 177 | :width: 1000 178 | :height: 500 179 | 180 | Pagination does not work concurrently with the **limit** parameter in a query; you need to select either **limit** or **_count**. 181 | 182 | 183 | That's all for now. 
184 | =================== 185 | Please visit the other tutorials that are available if you are interested in setting up a different data connector. 186 | If you are interested in writing your own configuration file or modifying an existing one, refer to the `Configuration Files 187 | `_. 188 | 189 | -------------------------------------------------------------------------------- /dataprep/tests/eda/test_plot_correlation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for testing the plot_correlation(df, x, y) function. 3 | """ 4 | import random 5 | from time import time 6 | 7 | import dask.array as da 8 | import dask.dataframe as dd 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | 13 | from ...eda.correlation import compute_correlation, plot_correlation 14 | from ...eda.correlation.compute import ( 15 | kendall_tau_1xn, 16 | kendall_tau_nxn, 17 | pearson_1xn, 18 | pearson_nxn, 19 | spearman_1xn, 20 | spearman_nxn, 21 | ) 22 | from ...eda.utils import to_dask 23 | 24 | 25 | @pytest.fixture(scope="module") # type: ignore 26 | def simpledf() -> dd.DataFrame: 27 | df = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"]) 28 | df = pd.concat([df, pd.Series(["a"] * 100)], axis=1) 29 | df.columns = ["a", "b", "c", "d"] 30 | df = to_dask(df) 31 | 32 | return df 33 | 34 | 35 | def test_sanity_compute_1(simpledf: dd.DataFrame) -> None: 36 | compute_correlation(simpledf) 37 | plot_correlation(simpledf) 38 | 39 | 40 | def test_sanity_compute_2(simpledf: dd.DataFrame) -> None: 41 | compute_correlation(simpledf, k=1) 42 | plot_correlation(simpledf, k=1) 43 | 44 | 45 | def test_sanity_compute_3(simpledf: dd.DataFrame) -> None: 46 | compute_correlation(simpledf, x="a") 47 | plot_correlation(simpledf, x="a") 48 | 49 | 50 | def test_sanity_compute_4(simpledf: dd.DataFrame) -> None: 51 | compute_correlation(simpledf, x="a", value_range=(0.5, 0.8)) 52 | plot_correlation(simpledf, x="a", value_range=(0.5, 0.8)) 53 | 54 | 55 | def test_sanity_compute_5(simpledf: dd.DataFrame) -> None: 56 | compute_correlation(simpledf, x="a", k=1) 57 | plot_correlation(simpledf, x="a", k=1) 58 | 59 | 60 | def test_sanity_compute_6(simpledf: dd.DataFrame) -> None: 61 | compute_correlation(simpledf, x="a", k=0) 62 | plot_correlation(simpledf, x="a", k=0) 63 | 64 | 65 | def test_sanity_compute_7(simpledf: dd.DataFrame) -> None: 66 | compute_correlation(simpledf, x="b", y="a") 67 | plot_correlation(simpledf, x="b", y="a") 68 | 69 | 70 | def test_sanity_compute_8(simpledf: dd.DataFrame) -> None: 71 | compute_correlation(simpledf, x="b", y="a", k=1) 72 | plot_correlation(simpledf, x="b", y="a", k=1) 73 | 74 | 75 | def test_sanity_compute_9(simpledf: dd.DataFrame) -> None: 76 | compute_correlation(simpledf, value_range=(0.3, 0.7)) 77 | plot_correlation(simpledf, value_range=(0.3, 0.7)) 78 | 79 | 80 | @pytest.mark.xfail # type: ignore 81 | def test_sanity_compute_fail_2(simpledf: dd.DataFrame) -> None: 82 | compute_correlation(simpledf, k=3, value_range=(0.3, 0.7)) 83 | plot_correlation(simpledf, k=3, value_range=(0.3, 0.7)) 84 | 85 | 86 | @pytest.mark.xfail # type: ignore 87 | def test_sanity_compute_fail_3(simpledf: dd.DataFrame) -> None: 88 | compute_correlation(simpledf, x="a", value_range=(0.5, 0.8), k=3) 89 | plot_correlation(simpledf, x="a", value_range=(0.5, 0.8), k=3) 90 | 91 | 92 | @pytest.mark.xfail # type: ignore 93 | def test_sanity_compute_fail_4(simpledf: dd.DataFrame) -> None: 94 | compute_correlation(simpledf, y="a") 95 | 
plot_correlation(simpledf, y="a") 96 | 97 | 98 | @pytest.mark.xfail # type: ignore 99 | def test_sanity_compute_fail_5(simpledf: dd.DataFrame) -> None: 100 | compute_correlation(simpledf, x="d") 101 | plot_correlation(simpledf, x="d") 102 | 103 | 104 | @pytest.mark.xfail # type: ignore 105 | def test_sanity_compute_fail_6(simpledf: dd.DataFrame) -> None: 106 | compute_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8)) 107 | plot_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8)) 108 | 109 | 110 | @pytest.mark.xfail # type: ignore 111 | def test_sanity_compute_fail_7(simpledf: dd.DataFrame) -> None: 112 | compute_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8), k=3) 113 | plot_correlation(simpledf, x="b", y="a", value_range=(0.5, 0.8), k=3) 114 | 115 | 116 | def test_compute_pearson() -> None: 117 | array = np.random.rand(100, 10) 118 | darray = da.from_array(array) 119 | a = pearson_nxn(darray).compute() 120 | b = pd.DataFrame(data=array).corr("pearson").values 121 | assert np.isclose(a, b).all() 122 | 123 | for i in range(array.shape[1]): 124 | _, a = pearson_1xn(darray[:, i], darray) 125 | assert np.isclose(a, np.sort(b[:, i])).all() 126 | 127 | 128 | def test_compute_spearman() -> None: 129 | array = np.random.rand(100, 10) 130 | darray = da.from_array(array) 131 | a = spearman_nxn(darray).compute() 132 | b = pd.DataFrame(data=array).corr("spearman").values 133 | assert np.isclose(a, b).all() 134 | 135 | for i in range(array.shape[1]): 136 | _, a = spearman_1xn(darray[:, i], darray) 137 | assert np.isclose(a, np.sort(b[:, i])).all() 138 | 139 | 140 | def test_compute_kendall() -> None: 141 | array = np.random.rand(100, 10) 142 | darray = da.from_array(array) 143 | a = kendall_tau_nxn(darray).compute() 144 | b = pd.DataFrame(data=array).corr("kendall").values 145 | assert np.isclose(a, b).all() 146 | 147 | for i in range(array.shape[1]): 148 | _, a = kendall_tau_1xn(darray[:, i], darray) 149 | assert np.isclose(a, np.sort(b[:, i])).all() 150 | 151 | 152 | # def test_plot_corr_df() -> None: # pylint: disable=too-many-locals 153 | # """ 154 | # :return: 155 | # """ 156 | # data = np.random.rand(100, 20) 157 | # df_data = pd.DataFrame(data) 158 | 159 | # start_p_pd = time() 160 | # res = df_data.corr(method="pearson") 161 | # end_p_pd = time() 162 | # print("pd pearson time: ", str(end_p_pd - start_p_pd) + " s") 163 | 164 | # start_p = time() 165 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 166 | # end_p = time() 167 | # print("our pearson time: ", str(end_p - start_p) + " s") 168 | # assert np.isclose(res, intermediate.result["corr_p"]).all() 169 | 170 | # start_s_pd = time() 171 | # res = df_data.corr(method="spearman") 172 | # end_s_pd = time() 173 | # print("pd spearman time: ", str(end_s_pd - start_s_pd) + " s") 174 | 175 | # start_s = time() 176 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 177 | # end_s = time() 178 | # print("our spearman time: ", str(end_s - start_s) + " s") 179 | # assert np.isclose(res, intermediate.result["corr_s"]).all() 180 | 181 | # start_k_pd = time() 182 | # res = df_data.corr(method="kendall") 183 | # end_k_pd = time() 184 | # print("pd kendall time: ", str(end_k_pd - start_k_pd) + " s") 185 | 186 | # start_k = time() 187 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True) 188 | # end_k = time() 189 | # print("our kendall time: ", str(end_k - start_k) + " s") 190 | # assert np.isclose(res, intermediate.result["corr_k"]).all() 191 | 192
| 193 | # def test_plot_corr_df_k() -> None: 194 | # """ 195 | # :return: 196 | # """ 197 | # data = np.random.rand(100, 20) 198 | # df_data = pd.DataFrame(data) 199 | # k = 5 200 | # res = df_data.corr(method="pearson") 201 | # row, _ = np.shape(res) 202 | # res_re = np.reshape(np.triu(res, 1), (row * row,)) 203 | # idx = np.argsort(np.absolute(res_re)) 204 | # mask = np.zeros(shape=(row * row,)) 205 | # for i in range(k): 206 | # mask[idx[-i - 1]] = 1 207 | # res = np.multiply(res_re, mask) 208 | # res = np.reshape(res, (row, row)) 209 | # res = res.T 210 | # _, intermediate = plot_correlation(df=df_data, return_intermediate=True, k=k) 211 | # assert np.isclose(intermediate.result["corr_p"], res).all() 212 | # assert np.isclose(intermediate.result["mask_p"], mask).all() 213 | 214 | 215 | # def test_plot_corr_df_x_k() -> None: 216 | # """ 217 | # :return: 218 | # """ 219 | # df_data = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 220 | # df_data["b"] = df_data["a"] + np.random.normal(0, 10, 100) 221 | # df_data["c"] = df_data["a"] + np.random.normal(0, 10, 100) 222 | # df_data["d"] = df_data["a"] + np.random.normal(0, 10, 100) 223 | # x_name = "b" 224 | # k = 3 225 | # name_list = list(df_data.columns.values) 226 | # idx_name = name_list.index(x_name) 227 | # res_p = df_data.corr(method="pearson").values 228 | # res_p[idx_name][idx_name] = -1 229 | # res_s = df_data.corr(method="spearman").values 230 | # res_s[idx_name][idx_name] = -1 231 | # res_k = df_data.corr(method="kendall").values 232 | # res_k[idx_name][idx_name] = -1 233 | # _, _ = plot_correlation(df=df_data, x=x_name, return_intermediate=True, k=k) 234 | 235 | 236 | # def test_plot_corr_df_x_y_k() -> None: 237 | # """ 238 | # :return: 239 | # """ 240 | # df_data = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 241 | # df_data["b"] = df_data["a"] + np.random.normal(0, 10, 100) 242 | # df_data["c"] = df_data["a"] + np.random.normal(0, 10, 100) 243 | # df_data["d"] = df_data["a"] + np.random.normal(0, 10, 100) 244 | # x_name = "b" 245 | # y_name = "c" 246 | # k = 3 247 | # _ = plot_correlation( 248 | # df=df_data, x=x_name, y=y_name, return_intermediate=False, k=k, 249 | # ) 250 | 251 | # letters = ["a", "b", "c"] 252 | # df_data_cat = pd.DataFrame({"a": np.random.normal(0, 10, 100)}) 253 | # df_data_cat["b"] = pd.Categorical([random.choice(letters) for _ in range(100)]) 254 | # df_data_cat["c"] = pd.Categorical([random.choice(letters) for _ in range(100)]) 255 | # _, intermediate = plot_correlation( 256 | # df=df_data_cat, x="b", y="c", return_intermediate=True 257 | # ) 258 | # assert np.isclose( 259 | # pd.crosstab(df_data_cat["b"], df_data_cat["c"]).values, 260 | # intermediate.result["cross_table"], 261 | # ).all() 262 | -------------------------------------------------------------------------------- /dataprep/eda/correlation/render.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the visualization for the 3 | plot_correlation(df) function 4 | """ 5 | import math 6 | from typing import List, Optional, Sequence, Tuple 7 | 8 | import numpy as np 9 | from bokeh.models import ( 10 | BasicTicker, 11 | CategoricalColorMapper, 12 | ColorBar, 13 | FactorRange, 14 | HoverTool, 15 | Legend, 16 | LegendItem, 17 | LinearColorMapper, 18 | PrintfTickFormatter, 19 | ) 20 | from bokeh.models.annotations import Title 21 | from bokeh.models.widgets import Panel, Tabs 22 | from bokeh.plotting import Figure, figure 23 | 24 | from ..intermediate import Intermediate 25
| from ..palette import BIPALETTE, BRG 26 | 27 | __all__ = ["render_correlation"] 28 | 29 | 30 | def render_correlation( 31 | itmdt: Intermediate, 32 | plot_width: int = 500, 33 | plot_height: int = 500, 34 | palette: Optional[Sequence[str]] = None, 35 | ) -> Figure: 36 | """ 37 | Render a correlation plot 38 | 39 | Parameters 40 | ---------- 41 | itmdt 42 | plot_width 43 | The width of the plot 44 | plot_height 45 | The height of the plot 46 | palette 47 | The palette to use. By default (None), 48 | the palette will be automatically chosen based on different visualization types. 49 | 50 | Returns 51 | ------- 52 | Figure 53 | The bokeh Figure instance. 54 | """ 55 | if itmdt.visual_type is None: 56 | visual_elem = Figure() 57 | elif itmdt.visual_type == "correlation_heatmaps": 58 | visual_elem = render_correlation_heatmaps( 59 | itmdt, plot_width, plot_height, palette or BIPALETTE 60 | ) 61 | elif itmdt.visual_type == "correlation_single_heatmaps": 62 | visual_elem = render_correlation_single_heatmaps( 63 | itmdt, plot_width, plot_height, palette or BIPALETTE 64 | ) 65 | elif itmdt.visual_type == "correlation_scatter": 66 | visual_elem = render_scatter(itmdt, plot_width, plot_height, palette or BRG) 67 | else: 68 | raise NotImplementedError(f"Unknown visual type {itmdt.visual_type}") 69 | 70 | return visual_elem 71 | 72 | 73 | # def _vis_cross_table(intermediate: Intermediate, params: Dict[str, Any]) -> Figure: 74 | # """ 75 | # :param intermediate: An object to encapsulate the 76 | # intermediate results. 77 | # :return: A figure object 78 | # """ 79 | # result = intermediate.result 80 | # hv.extension("bokeh", logo=False) 81 | # cross_matrix = result["cross_table"] 82 | # x_cat_list = result["x_cat_list"] 83 | # y_cat_list = result["y_cat_list"] 84 | # data = [] 85 | # for i, _ in enumerate(x_cat_list): 86 | # for j, _ in enumerate(y_cat_list): 87 | # data.append((x_cat_list[i], y_cat_list[j], cross_matrix[i, j])) 88 | # tooltips = [("z", "@z")] 89 | # hover = HoverTool(tooltips=tooltips) 90 | # heatmap = hv.HeatMap(data) 91 | # heatmap.opts( 92 | # tools=[hover], 93 | # colorbar=True, 94 | # width=params["width"], 95 | # toolbar="above", 96 | # title="cross_table", 97 | # ) 98 | # fig = hv.render(heatmap, backend="bokeh") 99 | # _discard_unused_visual_elems(fig) 100 | # return fig 101 | 102 | ########## HeatMaps ########## 103 | def tweak_figure(fig: Figure) -> None: 104 | """ 105 | Set some common attributes for a figure 106 | """ 107 | fig.grid.grid_line_color = None 108 | fig.axis.axis_line_color = None 109 | fig.axis.major_tick_line_color = None 110 | fig.axis.major_label_text_font_size = "9pt" 111 | fig.axis.major_label_standoff = 0 112 | fig.xaxis.major_label_orientation = math.pi / 3 113 | 114 | 115 | def render_correlation_heatmaps( 116 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 117 | ) -> Tabs: 118 | """ 119 | Render correlation heatmaps into tabs 120 | """ 121 | tabs: List[Panel] = [] 122 | tooltips = [("x", "@x"), ("y", "@y"), ("correlation", "@correlation{1.11}")] 123 | axis_range = itmdt["axis_range"] 124 | 125 | for method, df in itmdt["data"].items(): 126 | # in case of numerical column names 127 | df = df.copy() 128 | df["x"] = df["x"].apply(str) 129 | df["y"] = df["y"].apply(str) 130 | 131 | mapper, color_bar = create_color_mapper(palette) 132 | x_range = FactorRange(*axis_range) 133 | y_range = FactorRange(*reversed(axis_range)) 134 | fig = Figure( 135 | x_range=x_range, 136 | y_range=y_range, 137 | plot_width=plot_width, 138
plot_height=plot_height, 139 | x_axis_location="below", 140 | tools="hover", 141 | toolbar_location=None, 142 | tooltips=tooltips, 143 | background_fill_color="#fafafa", 144 | ) 145 | 146 | tweak_figure(fig) 147 | 148 | fig.rect( 149 | x="x", 150 | y="y", 151 | width=1, 152 | height=1, 153 | source=df, 154 | fill_color={"field": "correlation", "transform": mapper}, 155 | line_color=None, 156 | ) 157 | 158 | fig.add_layout(color_bar, "right") 159 | 160 | tab = Panel(child=fig, title=method) 161 | tabs.append(tab) 162 | 163 | tabs = Tabs(tabs=tabs) 164 | return tabs 165 | 166 | 167 | def render_correlation_single_heatmaps( 168 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 169 | ) -> Tabs: 170 | """ 171 | Render correlation heatmaps, but with a single column 172 | """ 173 | tabs: List[Panel] = [] 174 | tooltips = [("y", "@y"), ("correlation", "@correlation{1.11}")] 175 | 176 | for method, df in itmdt["data"].items(): 177 | mapper, color_bar = create_color_mapper(palette) 178 | 179 | x_range = FactorRange(*df["x"].unique()) 180 | y_range = FactorRange(*df["y"].unique()) 181 | fig = figure( 182 | x_range=x_range, 183 | y_range=y_range, 184 | plot_width=plot_width, 185 | plot_height=plot_height, 186 | x_axis_location="below", 187 | tools="hover", 188 | toolbar_location=None, 189 | tooltips=tooltips, 190 | ) 191 | 192 | tweak_figure(fig) 193 | 194 | fig.rect( 195 | x="x", 196 | y="y", 197 | width=1, 198 | height=1, 199 | source=df, 200 | fill_color={"field": "correlation", "transform": mapper}, 201 | line_color=None, 202 | ) 203 | 204 | fig.add_layout(color_bar, "right") 205 | 206 | tab = Panel(child=fig, title=method) 207 | tabs.append(tab) 208 | 209 | tabs = Tabs(tabs=tabs) 210 | return tabs 211 | 212 | 213 | def create_color_mapper(palette: Sequence[str]) -> Tuple[LinearColorMapper, ColorBar]: 214 | """ 215 | Create a color mapper and a colorbar for heatmap 216 | """ 217 | mapper = LinearColorMapper(palette=palette, low=-1, high=1) 218 | colorbar = ColorBar( 219 | color_mapper=mapper, 220 | major_label_text_font_size="8pt", 221 | ticker=BasicTicker(), 222 | formatter=PrintfTickFormatter(format="%.2f"), 223 | label_standoff=6, 224 | border_line_color=None, 225 | location=(0, 0), 226 | ) 227 | return mapper, colorbar 228 | 229 | 230 | ######### Scatter ######### 231 | def render_scatter( 232 | itmdt: Intermediate, plot_width: int, plot_height: int, palette: Sequence[str] 233 | ) -> Figure: 234 | """ 235 | Render a scatter plot with a regression line and, possibly, the most influential points 236 | """ 237 | 238 | # pylint: disable=too-many-locals 239 | 240 | df = itmdt["data"] 241 | xcol, ycol, *maybe_label = df.columns 242 | 243 | tooltips = [(xcol, f"@{{{xcol}}}"), (ycol, f"@{{{ycol}}}")] 244 | 245 | fig = Figure( 246 | plot_width=plot_width, 247 | plot_height=plot_height, 248 | toolbar_location=None, 249 | title=Title(text="Scatter Plot & Regression", align="center"), 250 | tools=[], 251 | x_axis_label=xcol, 252 | y_axis_label=ycol, 253 | ) 254 | 255 | # Scatter 256 | scatter = fig.scatter(x=df.columns[0], y=df.columns[1], source=df) 257 | if maybe_label: 258 | assert len(maybe_label) == 1 259 | mapper = CategoricalColorMapper(factors=["=", "+", "-"], palette=palette) 260 | scatter.glyph.fill_color = {"field": maybe_label[0], "transform": mapper} 261 | scatter.glyph.line_color = {"field": maybe_label[0], "transform": mapper} 262 | 263 | # Regression line 264 | coeff_a, coeff_b = itmdt["coeffs"] 265 | line_x = np.asarray([df.iloc[:, 0].min(), df.iloc[:, 0].max()]) 266
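    # Evaluate the fitted line y = coeff_a * x + coeff_b at the two x extremes; two points are enough to draw the straight regression segment.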
| line_y = coeff_a * line_x + coeff_b 267 | fig.line(x=line_x, y=line_y, line_width=3) 268 | 269 | # The tooltips were not passed to the Figure above because we only want them on the scatter renderer 270 | hover = HoverTool(tooltips=tooltips, renderers=[scatter]) 271 | fig.add_tools(hover) 272 | 273 | # Add legends 274 | if maybe_label: 275 | nidx = df.index[df[maybe_label[0]] == "-"][0] 276 | pidx = df.index[df[maybe_label[0]] == "+"][0] 277 | 278 | legend = Legend( 279 | items=[ 280 | LegendItem( 281 | label="Most Influential (-)", renderers=[scatter], index=nidx 282 | ), 283 | LegendItem( 284 | label="Most Influential (+)", renderers=[scatter], index=pidx 285 | ), 286 | ], 287 | margin=0, 288 | padding=0, 289 | ) 290 | 291 | fig.add_layout(legend, place="right") 292 | return fig 293 | -------------------------------------------------------------------------------- /dataprep/data_connector/implicit_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module defines ImplicitDatabase and ImplicitTable, 3 | where ImplicitDatabase is a conceptual model that describes 4 | a website and ImplicitTable describes an API endpoint. 5 | """ 6 | from io import StringIO 7 | from json import load as jload 8 | from json import loads as jloads 9 | from pathlib import Path 10 | from typing import Any, Dict, List, NamedTuple, Optional, Union 11 | 12 | import jsonschema 13 | import pandas as pd 14 | from jsonpath2 import Path as JPath 15 | from lxml import etree # pytype: disable=import-error 16 | from requests import Response 17 | 18 | from ..errors import UnreachableError 19 | from .schema import CONFIG_SCHEMA 20 | from .types import Authorization, AuthorizationType, Fields, Orient 21 | 22 | _TYPE_MAPPING = { 23 | "int": int, 24 | "string": str, 25 | "float": float, 26 | "boolean": bool, 27 | } 28 | 29 | 30 | class SchemaField(NamedTuple): 31 | """ 32 | Schema of one table field 33 | """ 34 | 35 | target: str 36 | type: str 37 | description: Optional[str] 38 | 39 | 40 | class Pagination: 41 | """ 42 | Schema of Pagination field 43 | """ 44 | 45 | type: str 46 | count_key: str 47 | max_count: int 48 | anchor_key: Optional[str] 49 | cursor_id: Optional[str] 50 | cursor_key: Optional[str] 51 | 52 | def __init__(self, pdef: Dict[str, Any]) -> None: 53 | 54 | self.type = pdef["type"] 55 | self.max_count = pdef["max_count"] 56 | self.count_key = pdef["count_key"] 57 | self.anchor_key = pdef.get("anchor_key") 58 | self.cursor_id = pdef.get("cursor_id") 59 | self.cursor_key = pdef.get("cursor_key") 60 | 61 | 62 | class ImplicitTable: # pylint: disable=too-many-instance-attributes 63 | """ 64 | The ImplicitTable class abstracts the request to and the response from a RESTful API, 65 | so that the remote API can be treated as a database table.
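    A minimal usage sketch (the table name and the config dict here are hypothetical; a real config must validate against the data_connector schema.json)::

        table = ImplicitTable("search", config)  # config: Dict[str, Any] loaded from a table config file
        df = table.from_response(resp)  # resp: a requests.Response from the remote endpoint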
66 | """ 67 | 68 | name: str 69 | config: Dict[str, Any] 70 | # Request related 71 | method: str 72 | url: str 73 | authorization: Optional[Authorization] = None 74 | headers: Optional[Fields] = None 75 | params: Optional[Fields] = None 76 | body_ctype: str 77 | body: Optional[Fields] = None 78 | cookies: Optional[Fields] = None 79 | pag_params: Optional[Pagination] = None 80 | 81 | # Response related 82 | ctype: str 83 | table_path: str 84 | schema: Dict[str, SchemaField] 85 | orient: Orient 86 | 87 | def __init__(self, name: str, config: Dict[str, Any]) -> None: 88 | jsonschema.validate( 89 | config, CONFIG_SCHEMA 90 | ) # This will throw errors if validate failed 91 | self.name = name 92 | self.config = config 93 | 94 | request_def = config["request"] 95 | 96 | self.method = request_def["method"] 97 | self.url = request_def["url"] 98 | 99 | if "authorization" in request_def: 100 | auth_def = request_def["authorization"] 101 | if isinstance(auth_def, str): 102 | auth_type = AuthorizationType[auth_def] 103 | auth_params: Dict[str, str] = {} 104 | elif isinstance(auth_def, dict): 105 | auth_type = AuthorizationType[auth_def.pop("type")] 106 | auth_params = {**auth_def} 107 | else: 108 | raise NotImplementedError 109 | self.authorization = Authorization(auth_type=auth_type, params=auth_params) 110 | 111 | if "pagination" in request_def: 112 | self.pag_params = Pagination(request_def["pagination"]) 113 | 114 | for key in ["headers", "params", "cookies"]: 115 | if key in request_def: 116 | setattr(self, key, Fields(request_def[key])) 117 | 118 | if "body" in request_def: 119 | body_def = request_def["body"] 120 | self.body_ctype = body_def["ctype"] 121 | self.body = Fields(body_def["content"]) 122 | 123 | response_def = config["response"] 124 | self.ctype = response_def["ctype"] 125 | self.table_path = response_def["tablePath"] 126 | self.schema = { 127 | name: SchemaField(def_["target"], def_["type"], def_.get("description")) 128 | for name, def_ in response_def["schema"].items() 129 | } 130 | self.orient = Orient(response_def["orient"]) 131 | 132 | def from_response(self, resp: Response) -> pd.DataFrame: 133 | """ 134 | Create a dataframe from a http response. 135 | """ 136 | if self.ctype == "application/json": 137 | rows = self.from_json(resp.text) 138 | elif self.ctype == "application/xml": 139 | rows = self.from_xml(resp.text) 140 | else: 141 | raise UnreachableError 142 | 143 | return pd.DataFrame(rows) 144 | 145 | def from_json(self, data: str) -> Dict[str, List[Any]]: 146 | """ 147 | Create rows from json string. 148 | """ 149 | data = jloads(data) 150 | table_data = {} 151 | root = self.table_path 152 | 153 | if self.orient == Orient.Records: 154 | data_rows = [ 155 | row_node.current_value for row_node in JPath.parse_str(root).match(data) 156 | ] 157 | 158 | for column_name, column_def in self.schema.items(): 159 | column_target = column_def.target 160 | column_type = column_def.type 161 | 162 | target_matcher = JPath.parse_str(column_target) 163 | 164 | col: List[Any] = [] 165 | for data_row in data_rows: 166 | maybe_cell_value = [ 167 | m.current_value for m in target_matcher.match(data_row) 168 | ] 169 | 170 | if not maybe_cell_value: # If no match 171 | col.append(None) 172 | elif len(maybe_cell_value) == 1 and column_type != "object": 173 | (cell_value,) = maybe_cell_value 174 | if cell_value is not None: 175 | # Even we have value matched, 176 | # the value might be None so we don't do type conversion. 
| 177 | cell_value = _TYPE_MAPPING[column_type](cell_value) 178 | col.append(cell_value) 179 | else: 180 | assert ( 181 | column_type == "object" 182 | ), f"{column_name}: {maybe_cell_value} is not {column_type}" 183 | col.append(maybe_cell_value) 184 | 185 | table_data[column_name] = col 186 | else: 187 | # TODO: split orient 188 | raise NotImplementedError 189 | 190 | return table_data 191 | 192 | def from_xml(self, data: str) -> Dict[str, List[Any]]: 193 | """ 194 | Create rows from an XML string. 195 | """ 196 | table_data = {} 197 | 198 | data = data.replace('<?xml version="1.0" encoding="UTF-8"?>', "") # strip the XML declaration: lxml cannot parse str input that carries an encoding declaration 199 | 200 | root = etree.parse(StringIO(data)) 201 | data_rows = root.xpath(self.table_path) 202 | 203 | if self.orient.value == Orient.Records.value: 204 | for column_name, column_def in self.schema.items(): 205 | column_target = column_def.target 206 | column_type = column_def.type 207 | 208 | col: List[Any] = [] 209 | for data_row in data_rows: 210 | maybe_cell_value = data_row.xpath(column_target) 211 | 212 | if not maybe_cell_value: 213 | col.append(None) 214 | elif len(maybe_cell_value) == 1 and column_type != "object": 215 | (cell_value,) = maybe_cell_value 216 | if cell_value is not None: 217 | # Even when a value is matched, 218 | # it might be None, in which case we skip the type conversion. 219 | cell_value = _TYPE_MAPPING[column_type](cell_value) 220 | col.append(cell_value) 221 | else: 222 | assert ( 223 | column_type == "object" 224 | ), f"{column_name}: {maybe_cell_value} is not {column_type}" 225 | col.append(maybe_cell_value) 226 | 227 | table_data[column_name] = col 228 | else: 229 | # TODO: split orient 230 | raise NotImplementedError 231 | 232 | return table_data 233 | 234 | 235 | class ImplicitDatabase: 236 | """ 237 | A website that provides data can be treated as a database, represented 238 | as ImplicitDatabase in DataConnector. 239 | """ 240 | 241 | name: str 242 | tables: Dict[str, ImplicitTable] 243 | 244 | def __init__(self, config_path: Union[str, Path]) -> None: 245 | path = Path(config_path) 246 | 247 | self.name = path.name 248 | self.tables = {} 249 | 250 | for table_config_path in path.iterdir(): 251 | if not table_config_path.is_file(): 252 | # ignore configs that are not files 253 | continue 254 | if table_config_path.name == "_meta.json": 255 | # ignore the meta file 256 | continue 257 | if table_config_path.suffix != ".json": 258 | # ignore non-json files 259 | continue 260 | 261 | with open(table_config_path) as f: 262 | table_config = jload(f) 263 | 264 | table = ImplicitTable(table_config_path.stem, table_config) 265 | if table.name in self.tables: 266 | raise RuntimeError(f"Duplicated table name {table.name}") 267 | self.tables[table.name] = table 268 | -------------------------------------------------------------------------------- /docs/source/eda/plot.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================================================== 3 | `plot`: analyzing basic characteristics of a dataset 4 | ==================================================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Overview 10 | ======== 11 | 12 | The goal of `plot` is to explore the basic characteristics of a dataset. It generates different plots to reveal the characteristics of the columns of interest. It mainly provides the following functionalities: 13 | 14 | 1. plot(df): plot basic characteristics (the histogram and the bar chart) for all columns. 15 | 2. plot(df, x): zoom into column x and plot more refined characteristics. 16 | 3. 
plot(df, x, y): zoom into column x and column y, and plot more refined characteristics to explore their relationship. 17 | 18 | 19 | The plots generated by the `plot` function differ for numerical and categorical columns. The following table summarizes the output plots for different settings of x and y. 20 | 21 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 22 | | | **plot(df,x,y)** | | 23 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 24 | | **x** | **y** | **output plots** | 25 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 26 | | None | None | `histogram `_ or `bar chart `_ for each column | 27 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 28 | | Numerical | None | `histogram `_, `kde plot `_, `box plot `_, `qq-norm plot `_ | 29 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 30 | | Categorical | None | `bar chart `_, `pie chart `_ | 31 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 32 | | Numerical | Numerical | `scatter plot `_, `hexbin plot `_, `box plot `_ | 33 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 34 | | Numerical | Categorical | `box plot `_, `line plot `_ | 35 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 36 | | Categorical | Numerical | `box plot `_, `line plot `_ | 37 | +-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 38 | | Categorical | Categorical | `nested bar chart `_, `stacked bar chart `_, `heat map `_ | 39 | 
+-------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 40 | 41 | Next, we use several examples to demonstrate these functionalities. 42 | 43 | 44 | Loading dataset 45 | =============== 46 | We support two types of dataframes: pandas and dask. Here we load the well-known `adult` dataset into a pandas dataframe and use it to demonstrate our functionality:: 47 | 48 | import pandas as pd 49 | df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/phpMawTba", na_values = [' ?']) 50 | 51 | Basic exploration for all columns via `plot(df)` 52 | ================================================ 53 | 54 | After loading a dataset, we can do a rough exploration by calling `plot(df)`. It plots a histogram for each numerical column and a bar chart for each categorical column. The number of bins shown (for histograms) and the number of categories shown (for bar charts) are both customizable. Additionally, if a column contains missing values, they are ignored when generating the plot, but the percentage of missing values is shown in the title. The following shows an example of `plot(df)`:: 55 | 56 | from dataprep.eda import plot 57 | plot(df) 58 | 59 | 60 | .. raw:: html 61 | 62 | 63 | 64 | 65 | Zooming into a column via `plot(df, x)` 66 | ======================================= 67 | 68 | After getting the basic information of the dataset, we can zoom into a column of interest and explore it further by calling `plot(df, x)`, where x is that column. The output of `plot(df, x)` differs for numerical and categorical columns. 69 | 70 | When x is a numerical column, it plots a histogram, kde plot, box plot and qq-norm plot. The following shows an example:: 71 | 72 | plot(df, "age") 73 | 74 | .. raw:: html 75 | 76 | 77 | 78 | 79 | When x is a categorical column, it plots a bar chart and a pie chart. The following shows an example:: 80 | 81 | plot(df, "education") 82 | 83 | .. raw:: html 84 | 85 | 86 | 87 | 88 | Zooming into two columns via `plot(df, x, y)` 89 | ============================================= 90 | 91 | Furthermore, we provide `plot(df, x, y)` to explore the relationship between two columns of interest, x and y. The output depends on the column types of x and y. 92 | 93 | When x and y are both numerical columns, it plots a `scatter plot `_, `hexbin plot `_ and `box plot `_. The following shows an example:: 94 | 95 | plot(df, "age", "hours-per-week") 96 | 97 | .. raw:: html 98 | 99 | 100 | 101 | 102 | When x and y are both categorical columns, it plots a `nested bar chart `_, `stacked bar chart `_ and `heat map `_. The following shows an example:: 103 | 104 | plot(df, "education", "marital-status") 105 | 106 | .. raw:: html 107 | 108 | 109 | 110 | 111 | When one of x and y is a numerical column and the other is a categorical column, it plots a `box plot `_ and a `line plot `_. The following shows an example:: 112 | 113 | plot(df, "age", "education") 114 | # or plot(df, "education", "age") 115 | 116 | ..
raw:: html 117 | 118 | -------------------------------------------------------------------------------- /dataprep/data_connector/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "$id": "http://example.com/root.json", 4 | "type": "object", 5 | "title": "The config for a data connector", 6 | "required": [ 7 | "version", 8 | "request", 9 | "response" 10 | ], 11 | "additionalProperties": false, 12 | "properties": { 13 | "version": { 14 | "$id": "#/properties/version", 15 | "type": "number", 16 | "title": "The Version Schema", 17 | "description": "The version number of the schema", 18 | "default": 1, 19 | "minimum": 1 20 | }, 21 | "request": { 22 | "$id": "#/properties/request", 23 | "type": "object", 24 | "title": "The Request Schema", 25 | "description": "", 26 | "required": [ 27 | "url", 28 | "method" 29 | ], 30 | "properties": { 31 | "url": { 32 | "$id": "#/properties/request/properties/url", 33 | "type": "string", 34 | "title": "The Url Schema", 35 | "description": "The Url of the API endpoint. This can also be a Jinja template", 36 | "default": "", 37 | "examples": [ 38 | "http://example.com/api" 39 | ], 40 | "format": "uri" 41 | }, 42 | "method": { 43 | "$id": "#/properties/request/properties/method", 44 | "type": "string", 45 | "title": "The Method Schema", 46 | "examples": [ 47 | "GET" 48 | ], 49 | "enum": [ 50 | "GET", 51 | "POST", 52 | "PUT" 53 | ] 54 | }, 55 | "authorization": { 56 | "$ref": "#/definitions/authorization" 57 | }, 58 | "headers": { 59 | "$ref": "#/definitions/fields" 60 | }, 61 | "params": { 62 | "$ref": "#/definitions/fields" 63 | }, 64 | "pagination": { 65 | "$id": "#/properties/request/properties/pagination", 66 | "type": "object", 67 | "properties": { 68 | "type": { 69 | "type": "string" 70 | }, 71 | "max_count": { 72 | "type": "integer" 73 | }, 74 | "anchor_key": { 75 | "type": "string", 76 | "optional": true 77 | }, 78 | "count_key": { 79 | "type": "string" 80 | }, 81 | "cursor_id": { 82 | "type": "string", 83 | "optional": true 84 | }, 85 | "cursor_key": { 86 | "type": "string", 87 | "optional": true 88 | } 89 | }, 90 | "required": [ 91 | "count_key", 92 | "type", 93 | "max_count" 94 | ], 95 | "additionalProperties": false 96 | }, 97 | "body": { 98 | "$id": "#/properties/request/properties/body", 99 | "type": "object", 100 | "title": "The Body Schema", 101 | "properties": { 102 | "ctype": { 103 | "$id": "#/properties/request/properties/body/properties/ctype", 104 | "type": "string", 105 | "title": "The content type schema", 106 | "default": "application/json", 107 | "enum": [ 108 | "application/x-www-form-urlencoded", 109 | "application/json" 110 | ] 111 | }, 112 | "content": { 113 | "$ref": "#/definitions/fields" 114 | } 115 | } 116 | }, 117 | "cookies": { 118 | "$ref": "#/definitions/fields" 119 | } 120 | }, 121 | "additionalProperties": false 122 | }, 123 | "response": { 124 | "$id": "#/properties/response", 125 | "type": "object", 126 | "title": "The Response Schema", 127 | "required": [ 128 | "ctype", 129 | "tablePath", 130 | "schema" 131 | ], 132 | "properties": { 133 | "ctype": { 134 | "$id": "#/properties/response/properties/ctype", 135 | "type": "string", 136 | "title": "The Response Content Type Schema", 137 | "default": "application/json", 138 | "enum": [ 139 | "application/x-www-form-urlencoded", 140 | "application/json", 141 | "application/xml" 142 | ] 143 | }, 144 | "tablePath": { 145 | "$id": 
"#/properties/response/properties/tablePath", 146 | "type": "string", 147 | "title": "The Path to the Table Object", 148 | "default": "" 149 | }, 150 | "schema": { 151 | "$ref": "#/definitions/schema" 152 | }, 153 | "orient": { 154 | "$id": "#/properties/response/properties/orient", 155 | "type": "string", 156 | "title": "The Orient for the Table", 157 | "default": "records", 158 | "enum": [ 159 | "split", 160 | "records" 161 | ] 162 | } 163 | }, 164 | "additionalProperties": false 165 | }, 166 | "additionalProperties": false 167 | }, 168 | "definitions": { 169 | "fields": { 170 | "$id": "#/definitions/fields", 171 | "type": "object", 172 | "title": "Spec for Fields Definition", 173 | "additionalProperties": { 174 | "oneOf": [ 175 | { 176 | "type": "string" 177 | }, 178 | { 179 | "type": "boolean" 180 | }, 181 | { 182 | "type": "object", 183 | "required": [ 184 | "required", 185 | "removeIfEmpty" 186 | ], 187 | "properties": { 188 | "required": { 189 | "type": "boolean", 190 | "default": false 191 | }, 192 | "fromKey": { 193 | "type": "string" 194 | }, 195 | "toKey": { 196 | "type": "string" 197 | }, 198 | "template": { 199 | "type": "string" 200 | }, 201 | "removeIfEmpty": { 202 | "type": "boolean", 203 | "default": false 204 | }, 205 | "additionalProperties": false 206 | } 207 | } 208 | ] 209 | } 210 | }, 211 | "authorization": { 212 | "$id": "#/definitions/authorization", 213 | "oneOf": [ 214 | { 215 | "type": "object", 216 | "required": [ 217 | "type", 218 | "grantType", 219 | "tokenServerUrl" 220 | ], 221 | "properties": { 222 | "type": { 223 | "type": "string", 224 | "enum": [ 225 | "OAuth2" 226 | ] 227 | }, 228 | "grantType": { 229 | "type": "string", 230 | "enum": [ 231 | "ClientCredentials", 232 | "AuthorizationCode" 233 | ] 234 | }, 235 | "tokenServerUrl": { 236 | "type": "string" 237 | } 238 | }, 239 | "additionalProperties": false 240 | }, 241 | { 242 | "type": "string", 243 | "enum": [ 244 | "Bearer" 245 | ] 246 | } 247 | ] 248 | }, 249 | "schema": { 250 | "$id": "#/definitions/schema", 251 | "type": "object", 252 | "title": "Spec for table definition", 253 | "additionalProperties": { 254 | "type": "object", 255 | "title": "Spec for schema item", 256 | "properties": { 257 | "target": { 258 | "type": "string" 259 | }, 260 | "type": { 261 | "type": "string", 262 | "enum": [ 263 | "string", 264 | "int", 265 | "float", 266 | "boolean", 267 | "object" 268 | ] 269 | }, 270 | "description": { 271 | "type": "string" 272 | } 273 | }, 274 | "required": [ 275 | "target", 276 | "type" 277 | ], 278 | "additionalProperties": false 279 | } 280 | } 281 | } 282 | } --------------------------------------------------------------------------------