├── .gitignore
├── MANIFEST.in
├── dataclean
│   ├── __init__.py
│   ├── static
│   │   ├── main.css
│   │   ├── iosbadge.js
│   │   ├── main.js
│   │   └── jquery.tablesorter.min.js
│   ├── codegen.py
│   ├── pipeline.py
│   ├── manager.py
│   ├── cleaning.py
│   └── widget.py
├── setup.py
├── README.rst
└── LICENSE.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | __pycache__/
3 | dist/
4 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include dataclean/static *.js *.css
2 |
--------------------------------------------------------------------------------
/dataclean/__init__.py:
--------------------------------------------------------------------------------
1 | def _jupyter_nbextension_paths():
2 | return [
3 | {
4 | "section": "notebook",
5 | "src": "static",
6 | "dest": "sherlockml-dataclean",
7 | "require": "sherlockml-dataclean/main",
8 | }
9 | ]
10 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup
3 |
# Front-end assets passed to setup() as data files. Despite the name, the
# list also contains the stylesheet (main.css), not only JavaScript.
STATIC_JS_FILES = [
    "dataclean/static/main.js",
    "dataclean/static/jquery.tablesorter.min.js",
    "dataclean/static/iosbadge.js",
    "dataclean/static/main.css",
]
10 |
11 |
def read_long_description():
    """Return the text of the README.rst file located next to this script.

    The file is decoded explicitly as UTF-8 so that the result does not
    depend on the locale of the machine running the build.

    Returns
    -------
    str
        The full contents of README.rst.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    readme_path = os.path.join(os.path.dirname(__file__), "README.rst")
    with io.open(readme_path, encoding="utf-8") as fp:
        return fp.read()
15 |
16 |
# Package metadata and install configuration for the ipydataclean
# distribution.
setup(
    name="ipydataclean",
    version="0.2.2",
    url="https://github.com/facultyai/ipydataclean",
    author="Faculty",
    author_email="opensource@faculty.ai",
    description="Interactive cleaning for pandas DataFrames",
    license="Apache 2.0",
    long_description=read_long_description(),
    # Install the front-end assets under
    # share/jupyter/nbextensions/ipydataclean.
    data_files=[("share/jupyter/nbextensions/ipydataclean", STATIC_JS_FILES)],
    packages=["dataclean"],
    install_requires=[
        "future",
        "ipython",
        "ipywidgets>=7.0.0",
        "matplotlib",
        "numpy",
        "pandas",
        "scikit-learn",
        "scipy",
        "boltzmannclean",
        # Environment marker: funcsigs is only required on Python 2.
        'funcsigs;python_version<"3.0"',
    ],
)
41 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ipydataclean
2 | ============
3 |
4 | Jupyter notebook extension and python library for interactive cleaning of
5 | pandas DataFrames with a selection of techniques, from simple replacements of
6 | missing values to imputation with a Restricted Boltzmann Machine.
7 |
8 | Installation
9 | ------------
10 |
11 | .. code-block:: bash
12 |
13 | pip install ipydataclean
14 | jupyter nbextension enable dataclean --py --sys-prefix
15 |
16 | Usage
17 | -----
18 |
19 | Use your Jupyter notebook as normal. When a pandas DataFrame is present in your
20 | python kernel you should see a new notification on the Data Cleaner icon in
21 | your toolbar. DataFrames with names beginning with an underscore will be
22 | ignored.
23 |
24 | .. figure:: https://user-images.githubusercontent.com/29061040/37827637-30cf156a-2e90-11e8-9b84-81a41cf94898.png
25 | :width: 25 %
26 | :alt: Data Cleaner toolbar icon.
27 |
28 | Data Cleaner toolbar icon.
29 |
30 | Clicking on the icon will open a floating window containing a summary of the
31 | DataFrames in your kernel. Clicking on the name of one of these DataFrames will
32 | show some of the Data Cleaner controls and some summary statistics on the
33 | DataFrame columns.
34 |
35 | .. figure:: https://user-images.githubusercontent.com/29061040/37827939-520b095e-2e91-11e8-8a85-a4d8cb0dfed1.png
36 | :width: 25 %
37 | :alt: Data Cleaner window.
38 |
39 | Data Cleaner window.
40 |
41 | Clicking on the name of one of these columns will show data cleaning tools
42 | specific to that column, with a histogram or bar chart showing the distribution
43 | of these values. As you create a step the effect that this will have on the
44 | data distribution is shown as a preview.
45 |
46 | .. figure:: https://user-images.githubusercontent.com/29061040/37828167-169edb9c-2e92-11e8-88cd-f918d2c498df.png
47 | :width: 50 %
48 | :alt: Creating a data cleaning step on a column.
49 |
50 | Creating a data cleaning step on a column.
51 |
52 | You can also choose to fill in missing and mistyped values in your DataFrame
53 | with a Restricted Boltzmann Machine. This uses the boltzmannclean package.
54 |
55 | .. figure:: https://user-images.githubusercontent.com/29061040/37828870-d096628e-2e94-11e8-9291-511fab3bdf7a.png
56 | :width: 40 %
57 | :alt: Creating a Restricted Boltzmann Machine cleaning step.
58 |
59 | Creating a Restricted Boltzmann Machine cleaning step.
60 |
61 | Once you create your steps they are added to a processing pipeline which can be
62 | viewed in the "Pipeline" widget.
63 |
64 | .. figure:: https://user-images.githubusercontent.com/29061040/37829003-4488afda-2e95-11e8-9995-9ebc1348d2bf.png
65 | :width: 40 %
66 | :alt: A data cleaning pipeline.
67 |
68 | A data cleaning pipeline.
69 |
70 | These steps can be modified or deleted using the pipeline controls. When ready,
71 | the pipeline can be executed on the DataFrame or exported as code. Executing
72 | your pipeline will create a new DataFrame with the suffix "_cleaned" in your
73 | kernel, while exporting will create a new code cell in your notebook defining a
74 | python function which will carry out the pipeline cleaning steps.
75 |
76 | .. figure:: https://user-images.githubusercontent.com/29061040/37829131-bf920dd4-2e95-11e8-9e77-aaa3533c2095.png
77 | :width: 40 %
78 | :alt: An exported pipeline.
79 |
80 | An exported pipeline.
81 |
82 | Caveats
83 | -------
84 |
85 | Duplicate or non-string column names are not supported.
86 |
87 | For DataFrames over 1000 rows, a sample of 1000 rows will be used for
88 | previewing and creating your processing pipeline, with the whole DataFrame only
89 | operated on when the pipeline is executed.
90 |
--------------------------------------------------------------------------------
/dataclean/static/main.css:
--------------------------------------------------------------------------------
1 |
/* Container with a bounded height that scrolls on both axes and never
   wraps its text. */
.datacleaner {
    display: inline-block;
    min-height: 100px;
    max-height: 500px;
    padding: 0;
    overflow-x: auto;
    overflow-y: auto;
    color: #333333;
    font-size: 80%;
    font-weight: normal;
    white-space: nowrap;
}
14 |
/* Slightly translucent panel pinned to the top-right of the viewport. */
.datacleaner-float-wrapper {
    position: fixed !important;
    top: 120px;
    right: 20px;
    z-index: 100;
    width: 700px;
    padding: 10px;
    overflow: auto;
    border: thin solid rgba(0, 0, 0, 0.38);
    border-radius: 5px;
    background-color: #fff;
    opacity: 0.95;
}
28 |
/* The hide, reload and kill buttons all float to the right edge. */
.hide-btn,
.reload-btn,
.kill-btn {
    float: right;
}
40 |
/* NOTE(review): this selector is not scoped to the data cleaner, so it
   applies to every .col-md-9 element on the page (presumably a Bootstrap
   grid class; confirm the global override is intended). */
.col-md-9 {
    overflow:hidden;
    margin-left: 14%;
    width: 80%}
45 |
/* Collapsed state: shrink to the content, but never below 250px. */
#datacleaner-wrapper.closed {
    width: auto;
    min-width: 250px;
    transition: width;
}

/* Become fully opaque while the pointer is over the panel. */
#datacleaner-wrapper:hover {
    opacity: 1;
}

#datacleaner-wrapper .header {
    font-weight: bold;
    font-size: 16px;
}

/* Header buttons share the same monospace face and size. */
#datacleaner-wrapper .hide-btn,
#datacleaner-wrapper .reload-btn,
#datacleaner-wrapper .kill-btn {
    font-family: monospace;
    font-size: 14px;
}
72 |
73 |
74 |
/* Keep nested items compact: first-level items are indented by 20px and
   each further nesting level by only 10px, to save horizontal space. */
#datacleaner-wrapper .toc-item{
    padding-left: 20px;
}

#datacleaner-wrapper .toc-item .toc-item{
    padding-left: 10px;
}
83 |
84 |
/* Borderless tables. NOTE(review): these selectors are not scoped to the
   data cleaner, so they affect every table.table on the page. */
table.table, table.table tr, table.table td, table.table th {
    border: 0;
}
/* Let a table size itself to its content instead of a forced width. */
table.table-nonfluid {
    width: auto !important;
}
table.table {
    margin-left: 0;
    margin-right: 0;
}
/* tablesorter: sortable column headers show an inline GIF on the right
   edge; the image is swapped for the ascending and descending states. */
.tablesorter-default .header,
.tablesorter-default .tablesorter-header {
    background-image: url(data:image/gif;base64,R0lGODlhFQAJAIAAACMtMP///yH5BAEAAAEALAAAAAAVAAkAAAIXjI+AywnaYnhUMoqt3gZXPmVg94yJVQAAOw==);
    background-position: right center;
    background-repeat: no-repeat;
    cursor: pointer;
    padding-right: 20px;
}
/* Ascending sort state. */
.tablesorter-default thead .headerSortUp,
.tablesorter-default thead .tablesorter-headerSortUp,
.tablesorter-default thead .tablesorter-headerAsc {
    background-image: url(data:image/gif;base64,R0lGODlhFQAEAIAAACMtMP///yH5BAEAAAEALAAAAAAVAAQAAAINjI8Bya2wnINUMopZAQA7);
}
/* Descending sort state. */
.tablesorter-default thead .headerSortDown,
.tablesorter-default thead .tablesorter-headerSortDown,
.tablesorter-default thead .tablesorter-headerDesc {
    background-image: url(data:image/gif;base64,R0lGODlhFQAEAIAAACMtMP///yH5BAEAAAEALAAAAAAVAAQAAAINjB+gC+jP2ptn0WskLQA7);
}
/* Columns with sorting disabled: no indicator and a normal cursor. */
.tablesorter-default thead .sorter-false {
    background-image: none;
    cursor: default;
    padding-right: 5px;
}
119 |
/* Small triangles drawn with borders on a zero-sized pseudo-element. */
.arrow-down:before,
.arrow-right:before {
    content: " ";
    display: inline-block;
    vertical-align: middle;
    width: 0;
    height: 0;
    margin-right: 4px;
}

/* Pointing down: only the top border is coloured. */
.arrow-down:before {
    border-left: 4px solid transparent;
    border-right: 4px solid transparent;
    border-top: 4px solid #888;
}

/* Pointing right: only the left border is coloured. */
.arrow-right:before {
    border-top: 4px solid transparent;
    border-left: 4px solid #888;
    border-bottom: 4px solid transparent;
}
/* notification badge: an absolutely positioned badge with colour themes
   (grey, ios) and one size (20) that can sit on any corner. */
.iosb {
    position: absolute;
    z-index: 20;
    background: #fff; }

.iosb-content {
    text-align: center;
    font-weight: 700;
    font-family: monospace, sans-serif; }

/* BEGIN themes */
/* Each theme sets a solid fallback colour followed by vendor-prefixed and
   standard gradient declarations. */
.iosb-grey {
    background-color: #3a3a3a;
    background-image: -webkit-gradient(linear, left top, left bottom, from(#868686), to(#3a3a3a));
    background-image: -webkit-linear-gradient(#868686, #3a3a3a);
    background-image: -moz-linear-gradient(#868686, #3a3a3a);
    background-image: -o-linear-gradient(#868686, #3a3a3a);
    background-image: linear-gradient(#868686, #3a3a3a); }
    .iosb-grey .iosb-content {
        color: #fff;
        text-shadow: 1px -1px 1px #474747; }

.iosb-ios {
    background-color: #4a6c9b;
    background-image: -webkit-gradient(linear, left top, left bottom, from(#849cbb), to(#4a6c9b));
    background-image: -webkit-linear-gradient(#849cbb, #4a6c9b);
    background-image: -moz-linear-gradient(#849cbb, #4a6c9b);
    background-image: -o-linear-gradient(#849cbb, #4a6c9b);
    background-image: linear-gradient(#849cbb, #4a6c9b); }
    .iosb-ios .iosb-content {
        color: #fff;
        text-shadow: 1px -1px 1px #626a76; }

/* END themes */
/* BEGIN sizes */
.iosb-20 {
    -webkit-box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset;
    -moz-box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset;
    box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset;
    min-width: 15px;
    height: 15px; }
    .iosb-20, .iosb-20 .iosb-inner {
        -moz-border-radius: 7px;
        border-radius: 7px; }
    .iosb-20 .iosb-inner {
        margin: 1px;
        min-width: 13px;
        height: 13px; }
    .iosb-20 .iosb-content {
        padding: 0 5px;
        line-height: 13px;
        height: 13px; }
    /* Corner placement: the badge overhangs its container by 5px. */
    .iosb-20.iosb-top-left {
        top: -5px;
        left: -5px; }
    .iosb-20.iosb-top-right {
        top: -5px;
        right: -5px; }
    .iosb-20.iosb-bottom-left {
        bottom: -5px;
        left: -5px; }
    .iosb-20.iosb-bottom-right {
        bottom: -5px;
        right: -5px; }
    /* Text content is set one pixel smaller than numeric content. */
    .iosb-20 .iosb-string {
        font-size: 8px; }
    .iosb-20 .iosb-number {
        font-size: 9px; }
211 |
212 | /* END sizes */
--------------------------------------------------------------------------------
/dataclean/codegen.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import builtins
4 | import re
5 | from collections import namedtuple
6 |
7 | try:
8 | from inspect import signature
9 | except ImportError: # Python 2
10 | from funcsigs import signature
11 |
12 | from inspect import getsourcelines, ismethod, isclass, isfunction, ismodule
13 | from textwrap import dedent
14 |
15 |
def indent(text, prefix):
    """Return 'text' with 'prefix' prepended to every non-blank line.

    Lines that contain only whitespace (including bare newlines) are left
    untouched, and the original line endings are preserved.
    """
    pieces = []
    for line in text.splitlines(True):
        if line.strip():
            pieces.append(prefix + line)
        else:
            pieces.append(line)
    return "".join(pieces)
24 |
25 |
ClosureVars = namedtuple("ClosureVars", "nonlocals globals builtins unbound")


def getclosurevars(func):
    """
    Resolve the free names used by a function to their current values.

    Parameters
    ----------
    func : function or bound method
        The callable to inspect; a method is unwrapped to its function.

    Returns
    -------
    ClosureVars
        Named tuple of three dicts (nonlocals, globals, builtins) mapping
        each referenced name to its value, followed by a set of the names
        that could not be resolved.

    Raises
    ------
    TypeError
        If 'func' is not a Python function (after unwrapping a method).
    """
    # Vendored from the Python 3 inspect module for Python 2 compatibility.

    target = func.__func__ if ismethod(func) else func
    if not isfunction(target):
        raise TypeError("'{!r}' is not a Python function".format(target))

    code = target.__code__

    # Free variables are listed in co_freevars and line up positionally
    # with the cells stored in __closure__.
    cells = target.__closure__
    if cells is None:
        nonlocal_vars = {}
    else:
        nonlocal_vars = {
            var: cell.cell_contents
            for var, cell in zip(code.co_freevars, cells)
        }

    # Every other referenced name is listed in co_names and is looked up
    # first in the function's globals, then in its builtins.
    global_ns = target.__globals__
    builtin_ns = global_ns.get("__builtins__", builtins.__dict__)
    if ismodule(builtin_ns):
        builtin_ns = builtin_ns.__dict__

    global_vars, builtin_vars, unbound_names = {}, {}, set()
    # These used to be builtins rather than keywords, so they may still
    # appear as name references; they are skipped.
    keyword_like = ("None", "True", "False")

    for name in code.co_names:
        if name in keyword_like:
            continue
        if name in global_ns:
            global_vars[name] = global_ns[name]
        elif name in builtin_ns:
            builtin_vars[name] = builtin_ns[name]
        else:
            unbound_names.add(name)

    return ClosureVars(nonlocal_vars, global_vars, builtin_vars, unbound_names)
79 |
80 |
# Prefix added to the generated lines so that they sit inside the body of
# the exported function.
CODE_INDENT = " "

# Signature line of the function produced when a pipeline is exported.
EXPORT_FUNCTION_SIGNATURE = "def exported_pipeline(df):\n"

# Emitted before the rendered steps: the steps operate on a copy of the
# input, bound to the name `dataframe`.
STEP_CODE_PREFIX = indent("\ndataframe = df.copy()\n\n", CODE_INDENT)

# Emitted after the rendered steps: hand back the cleaned copy.
STEP_CODE_SUFFIX = indent("return dataframe", CODE_INDENT)
88 |
89 |
def replace(string, substitutions):
    """
    Apply every substitution to 'string' in a single pass.

    Because all keys are matched by one regular expression, the output of
    one replacement is never rewritten by another, so substitutions cannot
    conflict with each other.

    Parameters
    ----------
    string : str
        Text to perform the replacements on.
    substitutions : dict of str to str
        Maps each literal substring to its replacement text.

    Returns
    -------
    str
        'string' with every occurrence of a key replaced by its value.
        With no substitutions the input is returned unchanged.
    """

    if not substitutions:
        # An empty alternation compiles to a pattern that matches the empty
        # string at every position; looking "" up below would then raise
        # KeyError.
        return string

    # Longest keys first so that a key is preferred over any shorter key
    # that is a prefix of it.
    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile("|".join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)
96 |
97 |
def render_code(function, **params):
    """
    Render the body of a function as text with its arguments filled in.

    The source of 'function' is read, its first two lines and its last line
    are dropped, and each parameter name that also appears in 'params' is
    textually replaced by the corresponding value.

    Parameters
    ----------
    function : function
        Python function to render.

        Only lines [2:-1] of the source as written are output. For the
        intended usage this means the signature, a one line docstring and
        the return statement are omitted. The body should not contain text
        that could clash with the substitutions made here, since they are
        purely textual.
    **params
        Values to substitute, keyed by parameter name. Types are rendered
        by their __name__ and everything else by its repr. The optional
        "code_comment" entry is emitted as '# ' comment lines above the
        code.

    Returns
    -------
    str
        The text of the input function with arguments replaced, indented
        once.
    """

    # One '# ' comment line per line of the optional description.
    comment_lines = []
    if "code_comment" in params:
        for line in params["code_comment"].split("\n"):
            comment_lines.append("# " + line + "\n")
    comment = "".join(comment_lines)

    # [2:-1] slice removes signature, docstring and return statement
    source_lines, _ = getsourcelines(function)
    body = dedent("".join(source_lines[2:-1]))

    substitutions = {}
    for arg_name in signature(function).parameters:
        if arg_name not in params:
            continue
        value = params[arg_name]
        if isinstance(value, type):
            # repr of a type, e.g. repr(int), doesn't produce valid python
            substitutions[arg_name] = value.__name__
        else:
            substitutions[arg_name] = repr(value)

    if substitutions:
        body = replace(body, substitutions)

    return indent(comment + body, CODE_INDENT)
146 |
147 |
def get_module_dependencies(function):
    """
    Generate the import statements required for a function

    Parameters
    ----------
    function : function
        Python function for which to generate import statements.

    Returns
    -------
    import list: list of str
        The import statements required for a function, indented once.

        One statement is produced per global name the function refers to:
        "from <module> import <name>" for objects that carry both
        __module__ and __name__, "import <module>" for modules, each with
        an "as <alias>" suffix when the global is bound under a different
        name.

        For any closure variables not themselves a module or imported from
        one, the generated statement will attempt to bind the repr() of the
        variable to the variable name.
    """

    import_list = []

    for name, imported in getclosurevars(function).globals.items():

        # Instances of Python-defined classes expose __module__ through
        # their class but have no __name__, so both attributes are required
        # before a "from ... import ..." statement can be written.
        if hasattr(imported, "__module__") and hasattr(imported, "__name__"):
            import_statement = "from {0} import {1}".format(
                imported.__module__, imported.__name__
            )

            if imported.__name__ != name:
                import_statement += " as {0}".format(name)

        elif ismodule(imported):
            import_statement = "import {0}".format(imported.__name__)

            if imported.__name__ != name:
                import_statement += " as {0}".format(name)

        else:
            # Plain value: bind its repr to the name instead.
            import_statement = "{0} = {1}".format(name, repr(imported))

        import_list.append(indent(import_statement + "\n", CODE_INDENT))

    return import_list
197 |
--------------------------------------------------------------------------------
/dataclean/pipeline.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractproperty
2 |
3 | import boltzmannclean
4 |
5 | import dataclean.codegen as codegen
6 | from dataclean.cleaning import (
7 | OUTLIER_REMOVAL_METHODS,
8 | NULL_REMOVAL_METHODS,
9 | TYPE_CONVERT_METHODS,
10 | )
11 |
12 |
class DataCleanStepBase(object):
    """Abstract base for a single cleaning step applied to a dataframe.

    Keyword arguments given at construction are kept in ``params`` and are
    forwarded both to the cleaning function when the step is executed and
    to the code generator when the step is rendered.
    """

    # Python 2 metaclass declaration; Python 3 ignores this attribute.
    __metaclass__ = ABCMeta

    def __init__(self, **params):
        self.params = params

    @abstractproperty
    def cleaning_function(self):
        """Return the callable that carries out this step."""
        pass

    def execute(self, dataframe, preview=True):
        """Run the cleaning function on 'dataframe' and return its result.

        When 'preview' is true the function receives a copy of the
        dataframe; otherwise it receives the dataframe itself.
        """
        clean = self.cleaning_function
        if preview:
            target = dataframe.copy()
        else:
            target = dataframe
        return clean(target, **self.params)

    @abstractproperty
    def description(self):
        """Return a human readable brief description of the step"""
        pass

    def render_code(self):
        """Return this step as source text, with the description as comment."""
        function = self.cleaning_function
        comment = self.description
        return codegen.render_code(
            function=function, code_comment=comment, **self.params
        )

    def required_import_statements(self):
        """Return the import statements the cleaning function depends on."""
        return codegen.get_module_dependencies(self.cleaning_function)
44 |
45 |
class OutlierRemovalStep(DataCleanStepBase):
    """Cleaning step that deals with outliers in one dataframe column.

    Expects the keyword arguments "colname", "low_cut", "high_cut" and
    "replacement_method".
    """

    def __init__(self, **params):
        super(OutlierRemovalStep, self).__init__(**params)
        settings = self.params
        self.colname = settings["colname"]
        self.low_cut = settings["low_cut"]
        self.high_cut = settings["high_cut"]
        # Popped so that it selects the cleaning function but is not
        # forwarded to it as an argument.
        self.replacement_method = settings.pop("replacement_method")

    @property
    def cleaning_function(self):
        """Look up the function registered for the replacement method."""
        return OUTLIER_REMOVAL_METHODS[self.replacement_method]

    @property
    def description(self):
        """Return a one-line summary of the column, cut-offs and method."""
        template = (
            "On {colname}, "
            "for values outside {low_cut} to {high_cut}, {replacement_method}"
        )
        fields = {
            "colname": self.colname,
            "low_cut": self.low_cut,
            "high_cut": self.high_cut,
            "replacement_method": self.replacement_method.value,
        }
        return template.format(**fields)
73 |
74 |
class NullRemovalStep(DataCleanStepBase):
    """Cleaning step that deals with null values in one dataframe column.

    Expects the keyword arguments "colname" and "replacement_method".
    """

    def __init__(self, **params):
        super(NullRemovalStep, self).__init__(**params)
        settings = self.params
        self.colname = settings["colname"]
        # Popped so that it selects the cleaning function but is not
        # forwarded to it as an argument.
        self.replacement_method = settings.pop("replacement_method")

    @property
    def cleaning_function(self):
        """Look up the function registered for the replacement method."""
        return NULL_REMOVAL_METHODS[self.replacement_method]

    @property
    def description(self):
        """Return a one-line summary of the column and replacement method."""
        template = "On {colname}, " "for missing values, {replacement_method}"
        fields = {
            "colname": self.colname,
            "replacement_method": self.replacement_method.value,
        }
        return template.format(**fields)
97 |
98 |
class TypeConversionStep(DataCleanStepBase):
    """Cleaning step that deals with mistyped values in one column.

    Expects the keyword arguments "colname", "data_type" and
    "replacement_method".
    """

    def __init__(self, **params):
        super(TypeConversionStep, self).__init__(**params)
        settings = self.params
        self.colname = settings["colname"]
        self.data_type = settings["data_type"]
        # Popped so that it selects the cleaning function but is not
        # forwarded to it as an argument.
        self.replacement_method = settings.pop("replacement_method")

    @property
    def cleaning_function(self):
        """Look up the function registered for the replacement method."""
        return TYPE_CONVERT_METHODS[self.replacement_method]

    @property
    def description(self):
        """Return a one-line summary of the column, type and method."""
        template = (
            "On {colname}, " "for non {data_type} types, {replacement_method}"
        )
        fields = {
            "colname": self.colname,
            "replacement_method": self.replacement_method.value,
            "data_type": self.data_type.__name__,
        }
        return template.format(**fields)
124 |
125 |
class RbmStep(DataCleanStepBase):
    """Cleaning step that imputes values with a Restricted Boltzmann Machine.

    Expects the keyword arguments "numerical_columns" and
    "categorical_columns".
    """

    def __init__(self, **params):
        super(RbmStep, self).__init__(**params)
        settings = self.params
        self.numerical_columns = settings["numerical_columns"]
        self.categorical_columns = settings["categorical_columns"]

    @property
    def cleaning_function(self):
        """Return boltzmannclean.clean, the function that does the work."""
        return boltzmannclean.clean

    def execute(self, dataframe, preview=True):
        """Run the RBM cleaning on 'dataframe' and return its result.

        A preview works on a copy of the dataframe and passes
        tune_rbm=False; a full run receives the dataframe itself and passes
        tune_rbm=True.
        """
        clean = self.cleaning_function
        if preview:
            target = dataframe.copy()
        else:
            target = dataframe
        return clean(target, tune_rbm=not preview, **self.params)

    @property
    def description(self):
        """Return a one-line summary stating how many columns are imputed."""
        template = "On {num_cols} columns, " "impute values, with an RBM"
        all_columns = self.numerical_columns + self.categorical_columns
        return template.format(num_cols=len(all_columns))

    def render_code(self):
        """Return this step as source text, always rendered with tune_rbm=True."""
        function = self.cleaning_function
        comment = self.description
        return codegen.render_code(
            function=function,
            tune_rbm=True,
            code_comment=comment,
            **self.params
        )
162 |
163 |
class Pipeline(object):
    """Ordered collection of the cleaning steps the user wishes to apply."""

    def __init__(self):
        self.steps = []

    def append(self, step):
        """Add 'step' to the end of the pipeline."""
        self.steps.append(step)

    def remove(self, step):
        """Remove 'step' from the pipeline."""
        self.steps.remove(step)

    def replace(self, old_step, new_step):
        """Put 'new_step' in the position held by 'old_step'.

        Nothing happens when 'old_step' is not part of the pipeline.
        """
        try:
            position = self.steps.index(old_step)
        except ValueError:
            return
        self.steps[position] = new_step

    def execute(self, dataframe, up_to_step=None, preview=True):
        """Executes the current pipeline up to up_to_step on dataframe

        Steps run in order, each receiving the result of the previous one.
        Execution stops before 'up_to_step' (compared by identity); with
        the default of None every step is run. The last result is returned,
        or 'dataframe' itself when no step was run.
        """

        current = dataframe

        for step in self.steps:
            if step is up_to_step:
                return current
            current = step.execute(current, preview)
            # avoids the unnecessary pandas SettingWithCopy warning
            # NOTE(review): confirm `is_copy` still has this effect with
            # the pandas versions in use.
            current.is_copy = False

        return current

    def export(self):
        """Returns the python code making up the pipeline

        The result is the exported function signature, the de-duplicated
        and sorted import statements of all steps, the code prefix, the
        rendered code of every step in order, and the code suffix.
        """

        rendered = []
        imports = []

        for step in self.steps:
            rendered.append(step.render_code())
            imports.extend(step.required_import_statements())

        parts = [codegen.EXPORT_FUNCTION_SIGNATURE]
        parts.extend(sorted(set(imports)))
        parts.append(codegen.STEP_CODE_PREFIX)
        parts.extend(rendered)
        parts.append(codegen.STEP_CODE_SUFFIX)

        return "".join(parts)
216 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2018 ASI Data Science. All rights reserved.
2 |
3 | Apache License
4 | Version 2.0, January 2004
5 | http://www.apache.org/licenses/
6 |
7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8 |
9 | 1. Definitions.
10 |
11 | "License" shall mean the terms and conditions for use, reproduction,
12 | and distribution as defined by Sections 1 through 9 of this document.
13 |
14 | "Licensor" shall mean the copyright owner or entity authorized by
15 | the copyright owner that is granting the License.
16 |
17 | "Legal Entity" shall mean the union of the acting entity and all
18 | other entities that control, are controlled by, or are under common
19 | control with that entity. For the purposes of this definition,
20 | "control" means (i) the power, direct or indirect, to cause the
21 | direction or management of such entity, whether by contract or
22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
23 | outstanding shares, or (iii) beneficial ownership of such entity.
24 |
25 | "You" (or "Your") shall mean an individual or Legal Entity
26 | exercising permissions granted by this License.
27 |
28 | "Source" form shall mean the preferred form for making modifications,
29 | including but not limited to software source code, documentation
30 | source, and configuration files.
31 |
32 | "Object" form shall mean any form resulting from mechanical
33 | transformation or translation of a Source form, including but
34 | not limited to compiled object code, generated documentation,
35 | and conversions to other media types.
36 |
37 | "Work" shall mean the work of authorship, whether in Source or
38 | Object form, made available under the License, as indicated by a
39 | copyright notice that is included in or attached to the work
40 | (an example is provided in the Appendix below).
41 |
42 | "Derivative Works" shall mean any work, whether in Source or Object
43 | form, that is based on (or derived from) the Work and for which the
44 | editorial revisions, annotations, elaborations, or other modifications
45 | represent, as a whole, an original work of authorship. For the purposes
46 | of this License, Derivative Works shall not include works that remain
47 | separable from, or merely link (or bind by name) to the interfaces of,
48 | the Work and Derivative Works thereof.
49 |
50 | "Contribution" shall mean any work of authorship, including
51 | the original version of the Work and any modifications or additions
52 | to that Work or Derivative Works thereof, that is intentionally
53 | submitted to Licensor for inclusion in the Work by the copyright owner
54 | or by an individual or Legal Entity authorized to submit on behalf of
55 | the copyright owner. For the purposes of this definition, "submitted"
56 | means any form of electronic, verbal, or written communication sent
57 | to the Licensor or its representatives, including but not limited to
58 | communication on electronic mailing lists, source code control systems,
59 | and issue tracking systems that are managed by, or on behalf of, the
60 | Licensor for the purpose of discussing and improving the Work, but
61 | excluding communication that is conspicuously marked or otherwise
62 | designated in writing by the copyright owner as "Not a Contribution."
63 |
64 | "Contributor" shall mean Licensor and any individual or Legal Entity
65 | on behalf of whom a Contribution has been received by Licensor and
66 | subsequently incorporated within the Work.
67 |
68 | 2. Grant of Copyright License. Subject to the terms and conditions of
69 | this License, each Contributor hereby grants to You a perpetual,
70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71 | copyright license to reproduce, prepare Derivative Works of,
72 | publicly display, publicly perform, sublicense, and distribute the
73 | Work and such Derivative Works in Source or Object form.
74 |
75 | 3. Grant of Patent License. Subject to the terms and conditions of
76 | this License, each Contributor hereby grants to You a perpetual,
77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78 | (except as stated in this section) patent license to make, have made,
79 | use, offer to sell, sell, import, and otherwise transfer the Work,
80 | where such license applies only to those patent claims licensable
81 | by such Contributor that are necessarily infringed by their
82 | Contribution(s) alone or by combination of their Contribution(s)
83 | with the Work to which such Contribution(s) was submitted. If You
84 | institute patent litigation against any entity (including a
85 | cross-claim or counterclaim in a lawsuit) alleging that the Work
86 | or a Contribution incorporated within the Work constitutes direct
87 | or contributory patent infringement, then any patent licenses
88 | granted to You under this License for that Work shall terminate
89 | as of the date such litigation is filed.
90 |
91 | 4. Redistribution. You may reproduce and distribute copies of the
92 | Work or Derivative Works thereof in any medium, with or without
93 | modifications, and in Source or Object form, provided that You
94 | meet the following conditions:
95 |
96 | (a) You must give any other recipients of the Work or
97 | Derivative Works a copy of this License; and
98 |
99 | (b) You must cause any modified files to carry prominent notices
100 | stating that You changed the files; and
101 |
102 | (c) You must retain, in the Source form of any Derivative Works
103 | that You distribute, all copyright, patent, trademark, and
104 | attribution notices from the Source form of the Work,
105 | excluding those notices that do not pertain to any part of
106 | the Derivative Works; and
107 |
108 | (d) If the Work includes a "NOTICE" text file as part of its
109 | distribution, then any Derivative Works that You distribute must
110 | include a readable copy of the attribution notices contained
111 | within such NOTICE file, excluding those notices that do not
112 | pertain to any part of the Derivative Works, in at least one
113 | of the following places: within a NOTICE text file distributed
114 | as part of the Derivative Works; within the Source form or
115 | documentation, if provided along with the Derivative Works; or,
116 | within a display generated by the Derivative Works, if and
117 | wherever such third-party notices normally appear. The contents
118 | of the NOTICE file are for informational purposes only and
119 | do not modify the License. You may add Your own attribution
120 | notices within Derivative Works that You distribute, alongside
121 | or as an addendum to the NOTICE text from the Work, provided
122 | that such additional attribution notices cannot be construed
123 | as modifying the License.
124 |
125 | You may add Your own copyright statement to Your modifications and
126 | may provide additional or different license terms and conditions
127 | for use, reproduction, or distribution of Your modifications, or
128 | for any such Derivative Works as a whole, provided Your use,
129 | reproduction, and distribution of the Work otherwise complies with
130 | the conditions stated in this License.
131 |
132 | 5. Submission of Contributions. Unless You explicitly state otherwise,
133 | any Contribution intentionally submitted for inclusion in the Work
134 | by You to the Licensor shall be under the terms and conditions of
135 | this License, without any additional terms or conditions.
136 | Notwithstanding the above, nothing herein shall supersede or modify
137 | the terms of any separate license agreement you may have executed
138 | with Licensor regarding such Contributions.
139 |
140 | 6. Trademarks. This License does not grant permission to use the trade
141 | names, trademarks, service marks, or product names of the Licensor,
142 | except as required for reasonable and customary use in describing the
143 | origin of the Work and reproducing the content of the NOTICE file.
144 |
145 | 7. Disclaimer of Warranty. Unless required by applicable law or
146 | agreed to in writing, Licensor provides the Work (and each
147 | Contributor provides its Contributions) on an "AS IS" BASIS,
148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 | implied, including, without limitation, any warranties or conditions
150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 | PARTICULAR PURPOSE. You are solely responsible for determining the
152 | appropriateness of using or redistributing the Work and assume any
153 | risks associated with Your exercise of permissions under this License.
154 |
155 | 8. Limitation of Liability. In no event and under no legal theory,
156 | whether in tort (including negligence), contract, or otherwise,
157 | unless required by applicable law (such as deliberate and grossly
158 | negligent acts) or agreed to in writing, shall any Contributor be
159 | liable to You for damages, including any direct, indirect, special,
160 | incidental, or consequential damages of any character arising as a
161 | result of this License or out of the use or inability to use the
162 | Work (including but not limited to damages for loss of goodwill,
163 | work stoppage, computer failure or malfunction, or any and all
164 | other commercial damages or losses), even if such Contributor
165 | has been advised of the possibility of such damages.
166 |
167 | 9. Accepting Warranty or Additional Liability. While redistributing
168 | the Work or Derivative Works thereof, You may choose to offer,
169 | and charge a fee for, acceptance of support, warranty, indemnity,
170 | or other liability obligations and/or rights consistent with this
171 | License. However, in accepting such obligations, You may act only
172 | on Your own behalf and on Your sole responsibility, not on behalf
173 | of any other Contributor, and only if You agree to indemnify,
174 | defend, and hold each Contributor harmless for any liability
175 | incurred by, or claims asserted against, such Contributor by reason
176 | of your accepting any such warranty or additional liability.
177 |
178 | END OF TERMS AND CONDITIONS
179 |
--------------------------------------------------------------------------------
/dataclean/manager.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from base64 import b64encode
4 |
5 | import ipywidgets
6 | from IPython.display import Javascript, display
7 | from IPython.utils.py3compat import str_to_bytes, bytes_to_str
8 | from pandas import DataFrame
9 |
10 | from dataclean.pipeline import Pipeline
11 | from dataclean.widget import (
12 | CallbackManager,
13 | ColumnWidgetController,
14 | DataFrameWidgetController,
15 | PipelineWidgetController,
16 | )
17 |
18 |
def create_new_code_cell(code):
    """Insert a new code cell below the current one and fill it with *code*.

    The source text is base64-encoded on the Python side and decoded with
    ``atob`` in the browser, so quotes and newlines in *code* never have to
    be escaped inside the generated Javascript snippet.
    """
    payload = bytes_to_str(b64encode(str_to_bytes(code)))
    script = """
        var code_cell = IPython.notebook.insert_cell_below('code');
        code_cell.set_text(atob("{0}"));
    """.format(
        payload
    )
    display(Javascript(script))
32 |
33 |
def display_colwidget(col_id):
    """Expand the column widget for *col_id* if it is currently collapsed.

    Emits Javascript that clicks the element whose DOM id is ``col_id``
    when the matching ``<col_id>_row`` element carries the ``hidden`` class.
    """
    element_id = str(col_id)
    script = """
        if ($('#{0}_row').hasClass('hidden')){{$('#{0}').click()}}
    """.format(
        element_id
    )
    display(Javascript(script))
45 |
46 |
class DataCleaner(object):
    """Keeps track of DataFrames in the user's kernel.

    The ``__main__`` module namespace is scanned for public (no leading
    underscore) variables holding a pandas ``DataFrame``; each distinct
    frame gets one ``DataframeManager``, stored under ``id(dataframe)``.
    """

    def __init__(self):
        self.dataframe_managers = {}
        self._main = sys.modules["__main__"]
        self.refresh()

    def refresh(self):
        """Rescan ``__main__`` and rebuild the id -> manager mapping.

        Managers of frames that are still present are reused (and renamed
        if the variable name changed); frames that disappeared are dropped.
        """
        dataframe_managers_new = {}
        for var_name, var in vars(self._main).items():
            if isinstance(var, DataFrame) and not var_name.startswith("_"):
                manager = self._manager_for_dataframe(var, var_name)
                dataframe_managers_new[id(var)] = manager

        self.dataframe_managers = dataframe_managers_new

    def dataframe_metadata(self):
        """Return a JSON string with the metadata of every tracked frame."""
        self.refresh()
        metadata = [
            manager.metadata() for manager in self.dataframe_managers.values()
        ]
        return json.dumps(metadata)

    def manager_for_id(self, dataframe_id):
        """Return the manager stored under *dataframe_id*.

        Raises ``KeyError`` when no tracked frame has that id.
        """
        return self.dataframe_managers[dataframe_id]

    def _manager_for_dataframe(self, dataframe, name):
        """Return the manager for *dataframe*, creating it when needed.

        An existing manager is matched by object identity and only has its
        name refreshed. The export callbacks are attached exactly once, at
        creation time: ``refresh`` runs on every metadata request, and
        attaching new closures to a reused manager on each pass would make
        a single "execute" or "export" fire once per earlier refresh.
        """
        for manager in self.dataframe_managers.values():
            if manager.full_dataframe is dataframe:
                manager.name = name
                return manager

        manager = DataframeManager(dataframe, name)

        def export_cleaned_dataframe(new_dataframe, dataframe_name):
            new_df_name = dataframe_name + "_cleaned"
            suffix = 0

            # ensures we have a unique name
            while getattr(self._main, new_df_name, None) is not None:
                suffix += 1
                new_df_name = dataframe_name + "_cleaned_" + str(suffix)

            setattr(self._main, new_df_name, new_dataframe)

        def export_to_code(code):
            create_new_code_cell(code)

        manager.execute_callback.register_callback(export_cleaned_dataframe)
        manager.export_callback.register_callback(export_to_code)

        return manager
102 |
103 |
class DataframeManager(object):
    """Manages the widget controller classes for a single DataFrame

    Holds the full frame, the (possibly sampled) working frame shown in the
    widgets, the cleaning ``Pipeline`` and the lazily created widget
    controllers, and wires the controllers' callbacks back to the pipeline.
    """

    # Frames with more rows than this are represented by a random sample of
    # this many rows in ``self.dataframe``.
    MAX_ROWS = 1000

    def __init__(self, dataframe, name):
        self.name = name
        # Column widget controllers created so far, keyed by column id.
        self.column_widget_controller_by_id = {}
        # Both controllers are built on first access of the matching
        # property below.
        self._pipeline_widget_controller = None
        self._dataframe_widget_controller = None

        # Outward-facing hooks; DataCleaner registers its export handlers
        # on these.
        self.execute_callback = CallbackManager()
        self.export_callback = CallbackManager()

        self.full_dataframe = dataframe

        if dataframe.shape[0] > self.MAX_ROWS:
            self.dataframe = dataframe.sample(n=self.MAX_ROWS)
            self.is_sample = True
        else:
            self.dataframe = dataframe
            self.is_sample = False

        # Frames with duplicate column labels or index values are replaced
        # by this placeholder frame; ``dataframe_widget`` recognises it and
        # shows an "unsupported" label instead of a widget.
        if not (dataframe.columns.is_unique and dataframe.index.is_unique):
            self.dataframe = DataFrame({"_": []})

        self.pipeline = Pipeline()
        # Step currently being edited, or None when adding a new step.
        self.active_step = None

        # Keyed by id() of the Series yielded by ``items()``.
        # NOTE(review): ``metadata`` reports id(self.full_dataframe[colname])
        # as the column id — confirm both ids agree for the pandas versions
        # in use, in particular when ``self.dataframe`` is a sample.
        self.column_by_id = {}
        for colname, column in self.dataframe.items():
            self.column_by_id[id(column)] = self.dataframe[colname]

    def metadata(self):
        """Return a dict describing the full DataFrame and its columns.

        Contains the variable name, ``id`` and shape of the full frame, the
        sorted stringified column names, and per column its name, id, dtype,
        percentage of nulls (formatted without decimals) and the number of
        distinct non-null values.
        """
        metadata = {
            "dfName": self.name,
            "dfId": id(self.full_dataframe),
            "dfShape": self.full_dataframe.shape,
            "dfColnames": sorted(
                self.full_dataframe.columns.to_series().apply(str)
            ),
            "dfCols": [
                {
                    "colname": colname,
                    "colId": id(self.full_dataframe[colname]),
                    "description": {
                        "dtype": str(column.dtype),
                        # The conditional guards the division for
                        # zero-length columns.
                        "null_percentage": "{0:.0f}%".format(
                            100 * column.isnull().sum() / float(len(column))
                            if len(column) > 0
                            else 0
                        ),
                        "distinct": len(column.value_counts()),
                    },
                }
                for colname, column in self.full_dataframe.items()
            ],
        }
        return metadata

    @property
    def dataframe_widget(self):
        """Widget for the whole DataFrame, creating its controller on demand.

        Returns a plain label for unsupported (placeholder) or empty frames,
        otherwise the widget rendered by the DataFrame controller.
        """
        if self._dataframe_widget_controller is None:
            # Accessing ``self.pipeline_widget`` also creates the pipeline
            # controller if it does not exist yet. The second argument is
            # MAX_ROWS for sampled frames and 0 otherwise.
            self._dataframe_widget_controller = DataFrameWidgetController(
                self.pipeline_widget, self.MAX_ROWS if self.is_sample else 0
            )

            def resample():
                # Draw a fresh random sample and redraw everything from it.
                self.dataframe = self.full_dataframe.sample(n=self.MAX_ROWS)
                self._refresh_colwidgets()

            self._dataframe_widget_controller.resample_callback.register_callback(
                resample
            )
            self._dataframe_widget_controller.new_step_callback.register_callback(
                self._new_step
            )
            self._dataframe_widget_controller.modify_step_callback.register_callback(
                self._replace_active_step
            )

        # Matches the placeholder installed by __init__ for frames with
        # non-unique columns or index.
        if self.dataframe.equals(DataFrame({"_": []})):
            widget = ipywidgets.Label(
                value=(
                    "DataFrames with non-unique column names or index are "
                    "unsupported."
                ),
                layout=ipywidgets.Layout(width="600px"),
            )
        elif self.dataframe.empty:
            widget = ipywidgets.Label(value=("DataFrame is empty."))
        else:
            widget = self._dataframe_widget_controller.render_widget(
                self.dataframe, self.active_step
            )

        return widget

    @property
    def pipeline_widget(self):
        """Widget for the pipeline, creating its controller on demand."""
        if self._pipeline_widget_controller is None:
            self._pipeline_widget_controller = PipelineWidgetController(
                self.pipeline, self.name
            )

            def enter_edit_mode(active_step):
                # Redraw the widgets for the pipeline state up to the step
                # being edited, then remember that step as active.
                self._refresh_colwidgets(step=active_step)
                self.active_step = active_step
                if hasattr(active_step, "colname"):
                    display_colwidget(
                        id(self.full_dataframe[active_step.colname])
                    )

            def enter_add_mode():
                self._refresh_colwidgets()
                self.active_step = None

            def execute_pipeline():
                # Runs on a copy of the full frame (not the sample) and
                # hands the result to the registered execute callbacks.
                new_dataframe = self.pipeline.execute(
                    self.full_dataframe.copy(), preview=False
                )
                self.execute_callback.send_callbacks(new_dataframe, self.name)

            def export_pipeline():
                code = self.pipeline.export()
                self.export_callback.send_callbacks(code)

            self._pipeline_widget_controller.add_mode_callback.register_callback(
                enter_add_mode
            )
            self._pipeline_widget_controller.edit_mode_callback.register_callback(
                enter_edit_mode
            )

            self._pipeline_widget_controller.execute_callback.register_callback(
                execute_pipeline
            )
            self._pipeline_widget_controller.export_callback.register_callback(
                export_pipeline
            )

            self._pipeline_widget_controller.delete_step_callback.register_callback(
                self._delete_step
            )

        return self._pipeline_widget_controller.render_widget(self.active_step)

    def column_widget(self, col_id):
        """Return the widget for the column registered under *col_id*.

        An empty label is returned when the working frame is empty. The
        controller is created, loaded and wired on first use and reused
        afterwards. An unknown *col_id* raises ``KeyError`` from the
        ``column_by_id`` lookup.
        """
        if self.dataframe.empty:
            widget = ipywidgets.Label(value="")
        else:
            if col_id in self.column_widget_controller_by_id:
                col_widget_controller = self.column_widget_controller_by_id[
                    col_id
                ]
            else:
                column = self.column_by_id[col_id]

                col_widget_controller = ColumnWidgetController()
                col_widget_controller.load_data(
                    column, self.dataframe, self.active_step
                )

                self.column_widget_controller_by_id[
                    col_id
                ] = col_widget_controller

                col_widget_controller.new_step_callback.register_callback(
                    self._new_step
                )
                col_widget_controller.modify_step_callback.register_callback(
                    self._replace_active_step
                )

            widget = col_widget_controller.render_widget()

        return widget

    def _refresh_colwidgets(self, step=None):
        """Re-run the pipeline on the working frame and redraw the widgets.

        *step* is forwarded to ``Pipeline.execute`` as ``up_to_step`` and to
        the controllers as the step being edited.
        """
        new_dataframe = self.pipeline.execute(self.dataframe, up_to_step=step)
        for (
            col_id,
            col_widget_controller,
        ) in self.column_widget_controller_by_id.items():
            # Columns are looked up in the processed frame by name.
            col_widget_controller.load_data(
                new_dataframe[self.column_by_id[col_id].name],
                new_dataframe,
                step,
            )
            col_widget_controller.render_widget()
        # NOTE(review): dereferenced without a None check — assumes the
        # DataFrame widget was created before any refresh; confirm.
        self._dataframe_widget_controller.render_widget(new_dataframe, step)

    def _new_step(self, new_step):
        """Append *new_step* to the pipeline and redraw."""
        self.pipeline.append(new_step)
        if self._pipeline_widget_controller:
            self._pipeline_widget_controller.render_widget()
            self._dataframe_widget_controller.display_pipeline()
        self._refresh_colwidgets()

    def _replace_active_step(self, modified_step):
        """Swap the step being edited for *modified_step* and leave edit mode."""
        self.pipeline.replace(self.active_step, modified_step)
        self.active_step = None
        if self._pipeline_widget_controller:
            self._pipeline_widget_controller.render_widget()
            self._dataframe_widget_controller.display_pipeline()
        self._refresh_colwidgets()

    def _delete_step(self, step):
        """Remove *step* from the pipeline, leave edit mode and redraw."""
        self.pipeline.remove(step)
        self._pipeline_widget_controller.render_widget()
        self.active_step = None
        self._refresh_colwidgets()
316 |
--------------------------------------------------------------------------------
/dataclean/cleaning.py:
--------------------------------------------------------------------------------
1 | from builtins import int
2 | from enum import Enum
3 |
4 | from sklearn.neighbors import KernelDensity
5 |
6 |
def outlier_removal_mean(dataframe, colname, low_cut, high_cut):
    """Overwrite out-of-range numbers in dataframe[colname] with a mean.

    The mean is taken over the int/float entries lying inside
    ``[low_cut, high_cut]``. Entries that are not int/float are left
    untouched. The frame is modified in place and returned.
    """

    def is_number(value):
        return isinstance(value, (int, float))

    series = dataframe[colname]

    inside = series.apply(
        lambda value: is_number(value) and low_cut <= value <= high_cut
    )
    outside = series.apply(
        lambda value: is_number(value)
        and (value < low_cut or value > high_cut)
    )

    replacement = series.loc[inside].mean()
    dataframe.loc[outside, colname] = replacement

    return dataframe
28 |
29 |
def outlier_removal_null(dataframe, colname, low_cut, high_cut):
    """Blank out out-of-range numbers in dataframe[colname].

    Every int/float entry below ``low_cut`` or above ``high_cut`` is
    assigned ``None``. The frame is modified in place and returned.
    """
    series = dataframe[colname]

    def is_outlier(value):
        if not isinstance(value, (int, float)):
            return False
        return value < low_cut or value > high_cut

    outlier_mask = series.apply(is_outlier)
    dataframe.loc[outlier_mask, colname] = None

    return dataframe
44 |
45 |
def outlier_removal_median(dataframe, colname, low_cut, high_cut):
    """Overwrite out-of-range numbers in dataframe[colname] with a median.

    The median is taken over the int/float entries lying inside
    ``[low_cut, high_cut]``. Entries that are not int/float are left
    untouched. The frame is modified in place and returned.
    """

    def is_number(value):
        return isinstance(value, (int, float))

    series = dataframe[colname]

    inside = series.apply(
        lambda value: is_number(value) and low_cut <= value <= high_cut
    )
    outside = series.apply(
        lambda value: is_number(value)
        and (value < low_cut or value > high_cut)
    )

    replacement = series.loc[inside].median()
    dataframe.loc[outside, colname] = replacement

    return dataframe
67 |
68 |
def outlier_removal_mode_numeric(dataframe, colname, low_cut, high_cut):
    """Overwrite out-of-range numbers in dataframe[colname] with the mode.

    The mode is the first modal value of the int/float entries lying
    inside ``[low_cut, high_cut]``; when there is none, ``None`` is
    assigned instead. The frame is modified in place and returned.
    """

    def is_number(value):
        return isinstance(value, (int, float))

    series = dataframe[colname]

    inside = series.apply(
        lambda value: is_number(value) and low_cut <= value <= high_cut
    )
    outside = series.apply(
        lambda value: is_number(value)
        and (value < low_cut or value > high_cut)
    )

    modal_values = series.loc[inside].mode()
    dataframe.loc[outside, colname] = modal_values.get(0, None)

    return dataframe
90 |
91 |
def outlier_removal_nearest_cut(dataframe, colname, low_cut, high_cut):
    """Clip the numbers in dataframe[colname] to ``[low_cut, high_cut]``.

    int/float entries below ``low_cut`` become ``low_cut``, then int/float
    entries above ``high_cut`` become ``high_cut``. The frame is modified
    in place and returned.
    """
    series = dataframe[colname]

    too_low = series.apply(
        lambda value: isinstance(value, (int, float)) and value < low_cut
    )
    dataframe.loc[too_low, colname] = low_cut

    # Evaluated after the first assignment, exactly as the two-step
    # original did.
    too_high = series.apply(
        lambda value: isinstance(value, (int, float)) and value > high_cut
    )
    dataframe.loc[too_high, colname] = high_cut

    return dataframe
108 |
109 |
def outlier_removal_drop(dataframe, colname, low_cut, high_cut):
    """Return the rows of *dataframe* whose dataframe[colname] is no outlier.

    A row is kept when its value is null, is not an int/float, or lies
    inside ``[low_cut, high_cut]``. The result is a selection of the input
    frame; the input itself is not reassigned.
    """
    series = dataframe[colname]

    def acceptable(value):
        if not isinstance(value, (int, float)):
            return True
        return low_cut <= value <= high_cut

    keep_rows = series.isnull() | series.apply(acceptable)

    return dataframe.loc[keep_rows, :]
125 |
126 |
def outlier_removal_sample(dataframe, colname, low_cut, high_cut):
    """Overwrite out-of-range numbers in dataframe[colname] with KDE draws.

    A ``KernelDensity`` with default settings is fitted on the int/float
    entries lying inside ``[low_cut, high_cut]`` (or on the two cut values
    themselves when there are none) and one sample is drawn per outlier.
    The frame is modified in place and returned.
    """

    def is_number(value):
        return isinstance(value, (int, float))

    series = dataframe[colname]

    inliers = series.loc[
        series.apply(
            lambda value: is_number(value) and low_cut <= value <= high_cut
        )
    ]
    if inliers.empty:
        # Nothing to learn from: fall back to the cut points as data.
        inliers[0] = low_cut
        inliers[1] = high_cut

    density = KernelDensity()
    density.fit(inliers.values.reshape(-1, 1))

    outlier_mask = series.apply(
        lambda value: is_number(value)
        and (value < low_cut or value > high_cut)
    )

    draws = density.sample(n_samples=outlier_mask.sum())
    dataframe.loc[outlier_mask, colname] = draws.flatten()

    return dataframe
154 |
155 |
def null_removal_mean(dataframe, colname):
    """Fill the nulls of dataframe[colname] with the mean of its numbers.

    The mean is computed over the int/float entries only. The column is
    replaced on the given frame, which is also returned.
    """
    series = dataframe[colname]

    numeric_mask = series.apply(lambda value: isinstance(value, (int, float)))
    fill_value = series.loc[numeric_mask].mean()

    dataframe[colname] = series.fillna(fill_value)

    return dataframe
165 |
166 |
def null_removal_sample(dataframe, colname):
    """Fill the nulls of dataframe[colname] with draws from a KDE.

    A ``KernelDensity`` with default settings is fitted on the non-null
    int/float entries (or on a single 0 when there are none) and one
    sample is drawn per null. The frame is modified in place and returned.
    """
    series = dataframe[colname]

    is_numeric = series.apply(lambda value: isinstance(value, (int, float)))
    observed = series.loc[series.notnull() & is_numeric]
    if observed.empty:
        # Nothing to learn from: fit on a lone zero instead.
        observed[0] = 0

    density = KernelDensity()
    density.fit(observed.values.reshape(-1, 1))

    draws = density.sample(n_samples=series.isnull().sum())
    dataframe.loc[series.isnull(), colname] = draws.flatten()

    return dataframe
186 |
187 |
def null_removal_median(dataframe, colname):
    """Fill the nulls of dataframe[colname] with the median of its numbers.

    The median is computed over the int/float entries only. The column is
    replaced on the given frame, which is also returned.
    """
    series = dataframe[colname]

    numeric_mask = series.apply(lambda value: isinstance(value, (int, float)))
    fill_value = series.loc[numeric_mask].median()

    dataframe[colname] = series.fillna(fill_value)

    return dataframe
197 |
198 |
def null_removal_mode(dataframe, colname):
    """Replace nulls with the mode on dataframe[colname]

    The first modal value of the column is used as the fill value. When
    the column has no mode (for example it is entirely null) there is
    nothing to fill with and the frame is returned unchanged; passing
    ``None`` to ``fillna`` would raise ``ValueError``.
    """

    col = dataframe[colname]
    modes = col.mode()

    if modes.empty:
        return dataframe

    dataframe[colname] = col.fillna(modes.iloc[0])

    return dataframe
207 |
208 |
def null_removal_mode_numeric(dataframe, colname):
    """Replace nulls with the modal numeric value on dataframe[colname]

    Only int/float entries take part in the mode. When no such mode exists
    (no numeric values at all) there is nothing to fill with and the frame
    is returned unchanged; passing ``None`` to ``fillna`` would raise
    ``ValueError``.
    """

    col = dataframe[colname]
    col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))]
    modes = col_numerics.mode()

    if modes.empty:
        return dataframe

    dataframe[colname] = col.fillna(modes.iloc[0])

    return dataframe
218 |
219 |
def null_removal_drop(dataframe, colname):
    """Return *dataframe* without the rows that are null in dataframe[colname].

    The input frame is not modified; a new frame is returned.
    """
    required_columns = [colname]
    return dataframe.dropna(subset=required_columns)
226 |
227 |
def type_convert_mean(dataframe, colname, data_type):
    """Overwrite mistyped values in dataframe[colname] with the numeric mean.

    A value is mistyped when it is non-null and not an instance of
    *data_type*. The mean is computed over the int/float entries of the
    column. The frame is modified in place and returned.
    """
    series = dataframe[colname]

    numeric_mask = series.apply(lambda value: isinstance(value, (int, float)))
    replacement = series.loc[numeric_mask].mean()

    wrong_type = series.apply(lambda value: not isinstance(value, data_type))
    dataframe.loc[series.notnull() & wrong_type, colname] = replacement

    return dataframe
240 |
241 |
def type_convert_median(dataframe, colname, data_type):
    """Overwrite mistyped values in dataframe[colname] with the numeric median.

    A value is mistyped when it is non-null and not an instance of
    *data_type*. The median is computed over the int/float entries of the
    column. The frame is modified in place and returned.
    """
    series = dataframe[colname]

    numeric_mask = series.apply(lambda value: isinstance(value, (int, float)))
    replacement = series.loc[numeric_mask].median()

    wrong_type = series.apply(lambda value: not isinstance(value, data_type))
    dataframe.loc[series.notnull() & wrong_type, colname] = replacement

    return dataframe
254 |
255 |
def type_convert_mode(dataframe, colname, data_type):
    """Overwrite mistyped values in dataframe[colname] with the modal value.

    A value is mistyped when it is non-null and not an instance of
    *data_type*. The replacement is the first mode of the correctly typed
    entries, or ``None`` when there is no such mode. The frame is modified
    in place and returned.
    """
    series = dataframe[colname]

    right_type = series.apply(lambda value: isinstance(value, data_type))
    modal_values = series.loc[right_type].mode()

    wrong_type = series.apply(lambda value: not isinstance(value, data_type))
    dataframe.loc[series.notnull() & wrong_type, colname] = modal_values.get(
        0, None
    )

    return dataframe
268 |
269 |
def type_convert_cast(dataframe, colname, data_type):
    """Tries to cast mistyped values on dataframe[colname]

    Every value is passed through ``data_type(value)``; values that cannot
    be converted are kept as they are. The column is replaced on the given
    frame, which is also returned.
    """

    def try_cast(x):
        try:
            return data_type(x)
        except (TypeError, ValueError, OverflowError):
            # TypeError: e.g. int(None); ValueError: e.g. int("abc") or
            # int(nan); OverflowError: e.g. int(inf). "Try to cast" is
            # best-effort, so each of these leaves the value untouched
            # instead of aborting the whole step.
            return x

    dataframe[colname] = dataframe[colname].apply(try_cast)

    return dataframe
282 |
283 |
def type_convert_drop(dataframe, colname, data_type):
    """Return the rows of *dataframe* whose dataframe[colname] is well typed.

    A row is kept when its value is null or an instance of *data_type*.
    The result is a selection of the input frame.
    """
    series = dataframe[colname]

    right_type = series.apply(lambda value: isinstance(value, data_type))
    keep_rows = series.isnull() | right_type

    return dataframe.loc[keep_rows, :]
294 |
295 |
def type_convert_sample(dataframe, colname, data_type):
    """Replace mistyped values with samples from a KDE on dataframe[colname]

    A ``KernelDensity`` with default settings is fitted on the non-null
    int/float entries (or on a single 0 when there are none) and one
    sample is drawn per mistyped value. As in the other ``type_convert_*``
    functions, a value is mistyped only when it is non-null and not an
    instance of *data_type*; nulls are left for the null-removal step.
    The frame is modified in place and returned.
    """

    col = dataframe[colname]

    col_numerics = col.loc[
        col.notnull() & col.apply(lambda x: isinstance(x, (int, float)))
    ]
    if col_numerics.empty:
        col_numerics[0] = 0

    kde = KernelDensity()
    kde.fit(col_numerics.values.reshape(-1, 1))

    # The notnull() guard keeps this consistent with type_convert_mean /
    # _median / _mode / _drop, none of which touch null entries.
    is_wrong_type = col.notnull() & col.apply(
        lambda x: not isinstance(x, data_type)
    )

    samples = kde.sample(n_samples=is_wrong_type.sum())

    dataframe.loc[is_wrong_type, colname] = samples.flatten()

    return dataframe
317 |
318 |
class OutlierRemovalMethod(Enum):
    """Available strategies for handling outliers in a column.

    Each value is the human-readable label of the strategy; the functions
    implementing them are listed in ``OUTLIER_REMOVAL_METHODS``.
    """

    NONE = "Do Nothing"
    MEAN = "Replace with Mean (excluding outliers)"
    MEDIAN = "Replace with Median (excluding outliers)"
    NEAREST_CUT = "Replace with Nearest Cut (Clip)"
    MODE_NUMERIC = "Replace with Mode"
    SAMPLE = "Sample from Column Distribution"
    NULL = "Replace with Null"
    DROP = "Drop Rows"
328 |
329 |
class NullRemovalMethod(Enum):
    """Available strategies for handling null values in a column.

    Each value is the human-readable label of the strategy; the functions
    implementing them are listed in ``NULL_REMOVAL_METHODS``.
    """

    NONE = "Do Nothing"
    MEAN = "Replace with Mean"
    MEDIAN = "Replace with Median"
    MODE = "Replace with Most Common Value"
    MODE_NUMERIC = "Replace with Mode"
    SAMPLE = "Sample from Column Distribution"
    DROP = "Drop Rows"
338 |
339 |
class TypeConvertMethod(Enum):
    """Available strategies for handling values of the wrong type.

    Each value is the human-readable label of the strategy; the functions
    implementing them are listed in ``TYPE_CONVERT_METHODS``.
    """

    NONE = "Do Nothing"
    CAST = "Try to Cast"
    MEAN = "Replace with Mean"
    MEDIAN = "Replace with Median"
    MODE = "Replace with Most Common Value"
    SAMPLE = "Sample from Column Distribution"
    DROP = "Drop Rows"
348 |
349 |
# Numeric values are always treated as continuous
class CategoricalTypes(Enum):
    """Kinds of column distinguished when choosing transformations.

    Used as the keys of ``ALLOWED_TRANSFORMATIONS``; each value is a
    human-readable label.
    """

    CONTINUOUS = "Numeric"
    CATEGORICAL = "Categorical"
354 |
355 |
# Dispatch tables mapping each enum member to the function implementing it.
# Every function takes the DataFrame first and returns a DataFrame; the NONE
# entries accept and ignore any further arguments and return the frame as is.

# Outlier functions are called as f(dataframe, colname, low_cut, high_cut).
OUTLIER_REMOVAL_METHODS = {
    OutlierRemovalMethod.MEAN: outlier_removal_mean,
    OutlierRemovalMethod.MEDIAN: outlier_removal_median,
    OutlierRemovalMethod.NEAREST_CUT: outlier_removal_nearest_cut,
    OutlierRemovalMethod.DROP: outlier_removal_drop,
    OutlierRemovalMethod.MODE_NUMERIC: outlier_removal_mode_numeric,
    OutlierRemovalMethod.SAMPLE: outlier_removal_sample,
    OutlierRemovalMethod.NULL: outlier_removal_null,
    OutlierRemovalMethod.NONE: lambda df, *_, **__: df,
}

# Null functions are called as f(dataframe, colname).
NULL_REMOVAL_METHODS = {
    NullRemovalMethod.MEAN: null_removal_mean,
    NullRemovalMethod.MEDIAN: null_removal_median,
    NullRemovalMethod.MODE: null_removal_mode,
    NullRemovalMethod.MODE_NUMERIC: null_removal_mode_numeric,
    NullRemovalMethod.DROP: null_removal_drop,
    NullRemovalMethod.SAMPLE: null_removal_sample,
    NullRemovalMethod.NONE: lambda df, *_, **__: df,
}

# Type functions are called as f(dataframe, colname, data_type).
TYPE_CONVERT_METHODS = {
    TypeConvertMethod.MEAN: type_convert_mean,
    TypeConvertMethod.MEDIAN: type_convert_median,
    TypeConvertMethod.MODE: type_convert_mode,
    TypeConvertMethod.DROP: type_convert_drop,
    TypeConvertMethod.CAST: type_convert_cast,
    TypeConvertMethod.SAMPLE: type_convert_sample,
    TypeConvertMethod.NONE: lambda df, *_, **__: df,
}
386 |
387 |
# Encodes which transformations are allowed for which data types
# Keys are CategoricalTypes members; each value lists the outlier, null and
# type-conversion methods that may be applied to a column of that kind.
ALLOWED_TRANSFORMATIONS = {
    # Numeric columns: every outlier method, every null method except the
    # generic MODE, and every type-conversion method.
    CategoricalTypes.CONTINUOUS: [
        OutlierRemovalMethod.MEAN,
        OutlierRemovalMethod.MEDIAN,
        OutlierRemovalMethod.NEAREST_CUT,
        OutlierRemovalMethod.DROP,
        OutlierRemovalMethod.MODE_NUMERIC,
        OutlierRemovalMethod.SAMPLE,
        OutlierRemovalMethod.NULL,
        OutlierRemovalMethod.NONE,
        NullRemovalMethod.MEAN,
        NullRemovalMethod.MEDIAN,
        NullRemovalMethod.MODE_NUMERIC,
        NullRemovalMethod.DROP,
        NullRemovalMethod.SAMPLE,
        NullRemovalMethod.NONE,
        TypeConvertMethod.MEAN,
        TypeConvertMethod.MEDIAN,
        TypeConvertMethod.MODE,
        TypeConvertMethod.DROP,
        TypeConvertMethod.CAST,
        TypeConvertMethod.SAMPLE,
        TypeConvertMethod.NONE,
    ],
    # Categorical columns: no outlier handling and only the methods that do
    # not rely on numeric statistics.
    CategoricalTypes.CATEGORICAL: [
        NullRemovalMethod.MODE,
        NullRemovalMethod.DROP,
        NullRemovalMethod.NONE,
        TypeConvertMethod.DROP,
        TypeConvertMethod.CAST,
        TypeConvertMethod.MODE,
        TypeConvertMethod.NONE,
    ],
}
423 |
--------------------------------------------------------------------------------
/dataclean/static/iosbadge.js:
--------------------------------------------------------------------------------
1 | /*! iOSBadge - v0.2.0
2 | * http://kristerkari.github.com/iOSBadge/
3 | * Copyright (c) 2016 Krister Kari; Licensed MIT
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*/
9 |
10 | (function($, window, document) {
11 | 'use strict';
12 |
13 | /**
14 | Check if the content is a number
15 | @param content {String|Number} The content
16 | @return {Boolean} true or false depending on if the content is a number
17 | @private
18 | */
var isNumber;
isNumber = function(content) {
  // Only strings and numbers can qualify; everything else is rejected.
  var kind = typeof content;
  if (kind !== 'string' && kind !== 'number') {
    return false;
  }
  // parseInt yields NaN when no leading integer can be read.
  return !isNaN(parseInt(content, 10));
};
31 |
32 | /**
33 | Constructor and plugin settings
34 |
35 | Make sure that the plugin works even without the `new` keyword.
36 |
37 | Check for any user defined settings and initialize the plugin.
38 | @class IOSBadge
39 | @constructor
40 | @example
41 | var badge = new IOSBadge();
42 | */
43 | window.IOSBadge = (function() {
function IOSBadge(element, settings) {
  var options;
  // Allow calling without `new`.
  if (!(this instanceof IOSBadge)) {
    return new IOSBadge(element, settings);
  }
  // Only element nodes (nodeType 1) are accepted.
  if (!element || element.nodeType !== 1) {
    throw new Error('You need to pass an element as the first argument to iOSBadge');
  }
  this.element = element;
  this.settings = settings;
  if (element.length && element.length > 1) {
    this.element = element[0];
  }
  // Anything that is not a settings object falls back to the defaults.
  options = (settings && typeof settings === 'object') ? settings : {};
  this.content = options.content || 1;
  this.size = options.size || 20;
  this.position = options.position || 'top-right';
  this.namespace = options.namespace || 'iosb';
  this.theme = options.theme || 'red';
  this._generate();
}
70 |
71 |
72 | /**
73 | Generate elements used by the plugin.
74 | @method _generate
75 | @private
76 | */
77 |
IOSBadge.prototype._generate = function() {
  // Classify the content before the sign is stripped below.
  this.type = isNumber(this.content) ? 'number' : 'string';
  // RegExp#test coerces its argument to a string, so numeric content such
  // as -5 matches as well. Numbers have no slice(), so coerce to a string
  // before dropping the leading sign.
  if (/^(-|\+)\d+/.test(this.content)) {
    this.content = String(this.content).slice(1);
  }
  // Build the three nested elements: wrapper > inner > content.
  this.badgeElem = document.createElement('div');
  this.badgeInner = document.createElement('div');
  this.badgeContent = document.createElement('div');
  this._setContent(this.content);
  this._setClasses(this.position, this.size, this.type, this.theme);
  this.badgeInner.appendChild(this.badgeContent);
  this.badgeElem.appendChild(this.badgeInner);
  this.element.appendChild(this.badgeElem);
};
92 |
93 |
94 | /**
95 | Set jQuery/Zepto options from the user.
96 | @method _setOptions
97 | @param options {Object|String} Plugin options given with jQuery or Zepto.
98 | @private
99 | */
100 |
IOSBadge.prototype._setOptions = function(options) {
  var command;
  // String form: a show/hide command, or else new badge content.
  if (typeof options === 'string') {
    command = options.toLowerCase();
    if (command === 'showbadge') {
      this.show();
    } else if (command === 'hidebadge') {
      this.hide();
    } else {
      this.setContent(options);
    }
    return;
  }
  if (!options || typeof options !== 'object') {
    return;
  }
  // Object form: apply each option that is present, in a fixed order.
  if (options.content != null) {
    this.setContent(options.content);
  }
  if (options.position != null) {
    this.setPosition(options.position);
  }
  if (options.theme != null) {
    this.setTheme(options.theme);
  }
  if (options.size != null) {
    this.setSize(options.size);
  }
  // `hide` wins over `show` when both are set to true.
  if (options.hide === true) {
    this.hide();
  } else if (options.show === true) {
    this.show();
  }
};
130 |
131 |
132 | /**
133 | Set the content of badge element.
134 | @method _setContent
135 | @param content {Number|String} content for the badge element.
136 | @private
137 | */
138 |
IOSBadge.prototype._setContent = function(content) {
  // Write to the DOM and keep the instance's copy in step with it.
  this.badgeContent.innerHTML = content;
  this.content = content;
};
143 |
144 |
145 | /**
146 | Set the classnames used by the plugin.
147 | @method _setClasses
148 | @param position {String} Badge position.
149 | @param size {String} Badge size.
150 | @param type {String} Badge type (number or string).
151 | @param theme {String} Badge theme.
152 | @private
153 | */
154 |
IOSBadge.prototype._setClasses = function(position, size, type, theme) {
  // Every class name is the namespace, optionally followed by "-<suffix>".
  var ns = this.namespace;
  var prefix = ns + '-';
  this.badgeElem.className = ns + ' ' + prefix + size + ' ' + prefix + position;
  this.badgeInner.className = prefix + 'inner ' + prefix + theme;
  this.badgeContent.className = prefix + 'content ' + prefix + type;
};
162 |
163 |
164 | /**
165 | Returns the current content set for badge. Not chainable.
166 | @method getContent
167 | @return {Number|String} Badge content.
168 | @example
169 | badge.getContent();
170 | */
171 |
IOSBadge.prototype.getContent = function() {
  // Return an integer when the markup starts with one, else the raw markup.
  var raw = this.badgeContent.innerHTML;
  var parsed = parseInt(raw, 10);
  return isNaN(parsed) ? raw : parsed;
};
182 |
183 |
/**
Set the content of your badge. Content can be a number or a string;
when omitted it defaults to `1`.
A string prefixed with `'+'` or `'-'` increases or decreases the current
badge number instead, e.g. `.setContent('+7')`. An empty string clears and
hides the badge. Objects and functions are ignored.
@method setContent
@param content {Number|String} Badge content.
@chainable
@example
badge.setContent(6);
*/

IOSBadge.prototype.setContent = function(content) {
  var kind, sign, delta;
  if (content == null) {
    content = 1;
  }
  kind = typeof content;
  if (kind === 'object' || kind === 'function') {
    return this;
  }
  // setting content on a hidden badge makes it visible again
  if (this.badgeElem.style.display === 'none') {
    this.show();
  }
  if (kind !== 'string') {
    kind = 'number';
  } else {
    if (content === '') {
      this._setContent('');
      this.hide();
      return this;
    }
    sign = content.charAt(0);
    // a non-numeric remainder counts as a step of 0
    delta = +content.substring(1) || 0;
    if (sign === '+') {
      this.increaseBy(delta);
      return this;
    }
    if (sign === '-') {
      this.decreaseBy(delta);
      return this;
    }
    kind = isNumber(content) ? 'number' : 'string';
  }
  this.type = kind;
  this._setClasses(this.position, this.size, kind, this.theme);
  this._setContent(content);
  return this;
};
232 |
233 |
/**
Set the position of your badge.
Positions are: `'top-left'`, `'top-right'`, `'bottom-left'` or `'bottom-right'`.
Non-string values are ignored.
@method setPosition
@param position {String} Badge position.
@chainable
@example
badge.setPosition('bottom-left');
*/

IOSBadge.prototype.setPosition = function(position) {
  if (typeof position !== 'string') {
    return this;
  }
  this.position = position;
  this._setClasses(position, this.size, this.type, this.theme);
  return this;
};
251 |
252 |
/**
Set the theme of your badge.
Available default themes are: `'red'`, `'blue'`, `'green'`, `'grey'` and `'ios'`.
Themes can be configured in the `iosbadge.scss` file.
Non-string values are ignored.
@method setTheme
@param theme {String} Badge theme.
@chainable
@example
badge.setTheme('ios');
*/

IOSBadge.prototype.setTheme = function(theme) {
  if (typeof theme !== 'string') {
    return this;
  }
  this.theme = theme;
  this._setClasses(this.position, this.size, this.type, theme);
  return this;
};
271 |
272 |
/**
Set the size of your badge.
Available default sizes are: `20`, `22`, `24`, `26`, `28`, `30`, `32`, `34` and `36`.
Sizes can be configured in the `iosbadge.scss` file.
Values rejected by `isNumber` are ignored; accepted values are stored as
base-10 integers.
@method setSize
@param size {Number|String} Badge size.
@chainable
@example
badge.setSize(30);
*/

IOSBadge.prototype.setSize = function(size) {
  if (!isNumber(size)) {
    return this;
  }
  this.size = parseInt(size, 10);
  this._setClasses(this.position, this.size, this.type, this.theme);
  return this;
};
291 |
292 |
/**
Decrease the current number in your badge. The badge type is switched to
`'number'`; stored content that does not parse as an integer counts as 0.
@method decreaseBy
@param amount {Number} The amount to decrease by.
@chainable
@example
badge.decreaseBy(2);
*/

IOSBadge.prototype.decreaseBy = function(amount) {
  var current, step;
  if (!isNumber(amount)) {
    return this;
  }
  this.type = 'number';
  this._setClasses(this.position, this.size, this.type, this.theme);
  current = parseInt(this.content, 10) || 0;
  step = parseInt(amount, 10);
  this._setContent(current - step);
  return this;
};
310 |
311 |
/**
Increase the current number in your badge. The badge type is switched to
`'number'`; stored content that does not parse as an integer counts as 0.
@method increaseBy
@param amount {Number} The amount to increase by.
@chainable
@example
badge.increaseBy(2);
*/

IOSBadge.prototype.increaseBy = function(amount) {
  var current, step;
  if (!isNumber(amount)) {
    return this;
  }
  this.type = 'number';
  this._setClasses(this.position, this.size, this.type, this.theme);
  current = parseInt(this.content, 10) || 0;
  step = parseInt(amount, 10);
  this._setContent(current + step);
  return this;
};
329 |
330 |
/**
Hide your badge element by setting its inline `display` to `'none'`.
@method hide
@chainable
@example
badge.hide();
*/

IOSBadge.prototype.hide = function() {
  var badgeStyle = this.badgeElem.style;
  badgeStyle.display = 'none';
  return this;
};
343 |
344 |
/**
Show your badge element by setting its inline `display` to `'block'`.
@method show
@chainable
@example
badge.show();
*/

IOSBadge.prototype.show = function() {
  var badgeStyle = this.badgeElem.style;
  badgeStyle.display = 'block';
  return this;
};
357 |
358 | return IOSBadge;
359 |
360 | })();
// Register the jQuery/Zepto plugin when such a library was passed in.
if (typeof $ === 'function') {
  // $(el).iosbadge(options):
  //   - 'getContent' on an element that already has a badge returns the badge content;
  //   - otherwise every matched element gets a badge created (first call) or has
  //     the options applied to its existing badge, and the collection is returned.
  $.fn.iosbadge = function(options) {
    var dataKey = 'iosbadge';
    var wantsContent = typeof options === 'string' && options.toLowerCase() === 'getcontent';
    if (wantsContent && this.data(dataKey)) {
      return this.data(dataKey).getContent();
    }
    return this.each(function() {
      var $elem = $(this);
      var instance = $elem.data(dataKey);
      if (instance) {
        instance._setOptions(options);
      } else {
        // the instance is kept in the element's data so later calls reuse it
        $elem.data(dataKey, new window.IOSBadge(this, options));
      }
    });
  };
}
382 | })(window.jQuery || window.Zepto, window, window.document);
383 |
--------------------------------------------------------------------------------
/dataclean/static/main.js:
--------------------------------------------------------------------------------
1 | define(["require", "jquery", "base/js/namespace", 'services/config',
2 | 'base/js/events', 'base/js/utils', 'notebook/js/codecell', 'notebook/js/outputarea'
3 | ], function(require, $, Jupyter, configmod, events, utils, codecell, outputarea) {
4 |
// NOTE(review): `Notebook` is not referenced anywhere else in the visible part of this module — confirm before removing.
var Notebook = require('notebook/js/notebook').Notebook;
// NOTE(review): a statement precedes this string, so it is an ordinary expression
// statement rather than a directive; strict mode is not enabled for this function.
"use strict";
// Module name, used only to build the console log prefix below.
var mod_name = "dataclean";
var log_prefix = '[' + mod_name + '] ';

// Number of dataframes in the most recently rendered table; assigned by html_table
// and read when updating the toolbar badge.
var n_dataframes = 0
11 |
// ...........Parameters configuration......................
// Default values for config parameters that are not present in the general settings (notebook.json).
var cfg = {
    // initial CSS position of the Data Cleaner window
    'position' : {
        top: '50px'
    },
    'window_display': false,
    'python': {
        // Python executed in the kernel to refresh the table: prints the dataframe
        // metadata, or an empty list when that raises (e.g. _datacleaner not defined yet).
        // Built from explicit lines so the Python indentation does not depend on how
        // this JavaScript file happens to be indented.
        varRefreshCmd: [
            'try:',
            '    print(_datacleaner.dataframe_metadata())',
            'except:',
            '    print([])'
        ].join('\n')
    },
};
27 |
28 | //.....................global variables....
29 |
30 |
// Extension state flags, passed along to the window-building functions.
var st = {};
// set to true by read_config once the configuration has been loaded and merged
st.config_loaded = false;
// NOTE(review): never updated in the visible code of this module
st.extension_initialized = false;
34 |
// Merge the extension configuration into `cfg` once the notebook config has loaded.
// Sources, in order: system-wide settings (config.data.datacleaner), then the
// notebook metadata. The merged object is also stored back into the notebook
// metadata so it can be modified per document. `callback`, when given, runs after
// the merge. Returns `cfg` immediately; the merge itself happens asynchronously.
function read_config(cfg, callback) { // read after nb is loaded
    var config = Jupyter.notebook.config;

    function merge_loaded_config() {
        var system_cfg = config.data.datacleaner;
        var nb_meta = Jupyter.notebook.metadata.datacleaner;

        cfg = $.extend(true, cfg, system_cfg);

        // window_display and position are taken from the notebook metadata
        if (nb_meta) {
            if (nb_meta.window_display) {
                cfg.window_display = nb_meta.window_display;
            }
            if (nb_meta.position) {
                cfg.position = nb_meta.position;
            }
        }

        // cfg and the notebook metadata entry become the same object from here on
        cfg = $.extend(true, cfg, nb_meta);
        Jupyter.notebook.metadata.datacleaner = cfg;

        // kernels_config is taken from the system settings (if defined)
        if (system_cfg && system_cfg.kernels_config) {
            cfg.kernels_config = $.extend(true, cfg.kernels_config, system_cfg.kernels_config);
        }

        if (callback) {
            callback();
        }
        st.config_loaded = true;
    }

    config.loaded.then(merge_loaded_config);
    config.load();
    return cfg;
}
69 |
// Argument-free wrapper around toggle_datacleaner using the module-level cfg and st;
// used as the toolbar button callback and by the window's close button.
function toggledatacleaner() {
toggle_datacleaner(cfg, st);
}
73 |
// Add the "Data Cleaner" toolbar button (once) and attach a hidden badge to it.
var datacleaner_button = function() {
    var button_selector = "#datacleaner_button";

    // toolbar not available yet: try again once the notebook app is initialized
    if (!Jupyter.toolbar) {
        events.on("app_initialized.NotebookApp", datacleaner_button);
        return;
    }

    var button_missing = $(button_selector).length === 0;
    if (button_missing) {
        Jupyter.toolbar.add_buttons_group([{
            'label': 'Data Cleaner',
            'icon': 'fa-bar-chart-o',
            'callback': toggledatacleaner,
            'id': 'datacleaner_button'
        }]);
    }

    require(['nbextensions/sherlockml-dataclean/iosbadge'], function() {
        var $button = $(button_selector);
        // create the badge only if the button does not have one already
        if ($button.find('.iosb').length === 0) {
            $button.iosbadge({ theme: 'grey', size: 20 });
        }
        // the badge stays hidden until there is something to count
        $button.find('.iosb').addClass('hidden');
    });
};
96 |
// Append a <link> for the extension stylesheet (main.css, resolved relative to
// this module) to the document head.
var load_css = function() {
    var stylesheet = document.createElement("link");
    stylesheet.rel = "stylesheet";
    stylesheet.type = "text/css";
    stylesheet.href = require.toUrl("./main.css");

    var head = document.getElementsByTagName("head")[0];
    head.appendChild(stylesheet);
};
104 |
105 |
// Build the HTML string for the Data Cleaner table from the JSON text printed by the kernel.
// `jsonDataframes` is parsed as a JSON list; each entry is read for dfName, dfShape,
// dfColnames and dfCols (each column: colname and description.dtype /
// description.null_percentage / description.distinct).
// Side effect: sets the module-level n_dataframes to the number of entries.
// Throws whatever JSON.parse throws when the text is not valid JSON.
// NOTE(review): the markup inside the string literals below appears to have been lost
// in this copy of the file (literals are empty or split across lines) — restore it
// from the upstream source before editing this function.
function html_table(jsonDataframes) {
var dfList = JSON.parse(String(jsonDataframes));
var table = '
'
+'
'
+''
+'| Name | Shape | Columns | '
+'
';
// remembered at module level so the toolbar badge can show the count
n_dataframes = dfList.length;

// one summary row per dataframe, followed by its nested per-column table
for (var i = 0; i < n_dataframes; i++) {
table +=
''
+'| '
+ dfList[i].dfName + ' | '
+ dfList[i].dfShape + ' | '
+ dfList[i].dfColnames + ' |
'
+ ''
+ '| '
+ 'Loading widget... |
'
+''
+ ''
+''
+''
+ '| Column | '
+ 'Pandas dtype | '
+ 'Nulls | '
+ '# Distinct | '
+' '
+'';
var n_cols = dfList[i].dfCols.length;
for (var j = 0; j < n_cols; j++) {
// NOTE(review): `col` is assigned without a declaration, so it becomes an implicit
// global (and would throw a ReferenceError under strict mode).
col = dfList[i].dfCols[j];
table +=
''
+'| '
+ col.colname + ' | '
+ col.description.dtype + ' | '
+ col.description.null_percentage + ' | '
+ col.description.distinct + ' | '
+ ' | | Loading widget... | ';
}
table +=
''
+' '
+' |
';
}
var full_table = table + '
';

return full_table;
}
190 |
// iopub output handler for widget requests: renders `display_data` messages into
// `output_wrapper` through a fresh OutputArea, and logs `error` messages to the console.
function display_widgets(msg, output_wrapper) {
    var has_type = function(name) {
        return msg.header.msg_type == name;
    };

    if (has_type('display_data')) {
        var area_options = {
            config: Jupyter.notebook.config,
            selector: output_wrapper,
            prompt_area: false,
            events: Jupyter.notebook.events,
            keyboard_manager: Jupyter.notebook.keyboard_manager,
        };
        var output_area = new outputarea.OutputArea(area_options);
        output_area.handle_output(msg);
    }

    if (has_type('error')) {
        console.warn(log_prefix + msg.content.evalue);
        console.warn(log_prefix + msg.content.traceback);
    }
}
211 |
// Ask the kernel for the widget of the column behind `selector` and render it into
// that column's row. Does nothing while the Data Cleaner window is hidden, or when
// the column's widget cell already contains an output.
function display_column_widget(selector) {
    if (!$('#datacleaner-wrapper').is(':visible')) {
        return;
    }

    var $toggle = $(selector);
    var dataframe_id = $toggle.attr('data-frame-id');
    var column_id = $toggle.attr('id');

    var already_rendered = $('#' + column_id + '_widget').find('.output').length !== 0;
    if (already_rendered) {
        return;
    }

    // NOTE(review): the wrapper is created from an empty string here — confirm against
    // upstream that no markup was lost from this literal.
    var col_output_wrapper = $('');
    $('#' + column_id + '_row').html(col_output_wrapper);

    var command = '_datacleaner.dataframe_managers[' + dataframe_id + '].column_widget(' + column_id + ')';
    var callbacks = {
        iopub: {
            output: function(msg) {
                display_widgets(msg, col_output_wrapper);
            }
        }
    };
    Jupyter.notebook.kernel.execute(command, callbacks, { silent: false });
}
231 |
// Ask the kernel for the dataframe-level widget behind `selector` and render it into
// that dataframe's row. Does nothing while the Data Cleaner window is hidden, or when
// the dataframe's widget cell already contains an output.
function display_pipeline_widget(selector) {
    if (!$('#datacleaner-wrapper').is(':visible')) {
        return;
    }

    var dataframe_id = $(selector).attr('data-frame-id');

    var already_rendered = $('#' + dataframe_id + '_widget').find('.output').length !== 0;
    if (already_rendered) {
        return;
    }

    // NOTE(review): the wrapper is created from an empty string here — confirm against
    // upstream that no markup was lost from this literal.
    var pipeline_output_wrapper = $('');
    $('#' + dataframe_id + '_row').html(pipeline_output_wrapper);

    var command = '_datacleaner.dataframe_managers[' + dataframe_id + '].dataframe_widget';
    var callbacks = {
        iopub: {
            output: function(msg) {
                display_widgets(msg, pipeline_output_wrapper);
            }
        }
    };
    Jupyter.notebook.kernel.execute(command, callbacks, { silent: false });
}
249 |
//runs after every code cell execution in case dataframes have been updated
// iopub output handler for the refresh command:
//  - `stream` message with text: redraws the table from the JSON text, updates the
//    toolbar badge, wires the expand/collapse click handlers and re-requests the
//    widgets of rows that are not hidden; then (re)applies tablesorter after 100 ms;
//  - `stream` message without text: re-runs datacleaner_init;
//  - `error` message: logs the error value and traceback to the console.
function code_exec_callback(msg) {
    if (msg.header.msg_type == 'stream') {
        var jsonDataframes = msg.content.text;
        if (jsonDataframes === undefined)
            datacleaner_init();
        else {
            //redraw table
            $('#datacleaner').html(html_table(jsonDataframes));

            // badge shows the number of dataframes and is hidden when there are none
            if (n_dataframes > 0) {
                $("#datacleaner_button").iosbadge({content: n_dataframes});
                $("#datacleaner_button").find('.iosb').removeClass('hidden');
            } else {
                $("#datacleaner_button").find('.iosb').addClass('hidden');
            }

            //add click events
            // delegated through .on() (the deprecated .delegate() was used here before),
            // matching the dataframe handler below
            $('.tablesorter').on('click', '.toggleColumn', function(){
                $(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').toggleClass('hidden');
                $(this).toggleClass('arrow-right');
                $(this).toggleClass('arrow-down');
                display_column_widget(this);
                return false;
            });

            $('.tablesorter').on('click', '.toggleDataframe', function(){
                $(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').toggleClass('hidden');
                $(this).toggleClass('arrow-right');
                $(this).toggleClass('arrow-down');
                display_pipeline_widget(this);
                return false;
            });

            //redisplay already open widgets
            $('.toggleColumn').each(function(){
                if (!($(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').hasClass('hidden'))){
                    $(this).toggleClass('arrow-right');
                    $(this).toggleClass('arrow-down');
                    display_column_widget(this);
                }
            });

            $('.toggleDataframe').each(function(){
                if (!($(this).closest('tr').next('tr').find('.pipeline_widget').hasClass('hidden'))){
                    $(this).toggleClass('arrow-right');
                    $(this).toggleClass('arrow-down');
                    display_pipeline_widget(this);
                }
            });

        }
        // the table was just replaced, so sorting has to be applied again
        require(['nbextensions/sherlockml-dataclean/jquery.tablesorter.min'],
            function() {
                setTimeout(function() {
                    if ($('#datacleaner').length > 0) {
                        $('#datacleaner table').tablesorter();
                    }
                }, 100);
            });
    }

    if (msg.header.msg_type == 'error') {
        console.warn(log_prefix + msg.content.evalue);
        console.warn(log_prefix + msg.content.traceback);
    }
}
314 |
// Run the configured refresh command in the kernel (after making sure tablesorter
// is loaded) and route its output to code_exec_callback.
var varRefresh = function() {
    var tablesorter_module = 'nbextensions/sherlockml-dataclean/jquery.tablesorter.min';

    require([tablesorter_module], function() {
        var callbacks = { iopub: { output: code_exec_callback } };
        var options = { silent: false };
        Jupyter.notebook.kernel.execute(cfg.python.varRefreshCmd, callbacks, options);
    });
};
323 |
324 |
// Initialise the extension for the current kernel: load the configuration, build or
// refresh the Data Cleaner window, make sure the `_datacleaner` object exists in the
// kernel, and refresh the table after every code cell execution.
// This function runs more than once (on load, on every kernel_ready event, and from
// code_exec_callback), so the refresh handlers are unbound before being bound again;
// otherwise every run would add one more refresh per executed cell.
var datacleaner_init = function() {

    cfg = read_config(cfg, function() {
        if (typeof Jupyter.notebook.kernel !== "undefined" && Jupyter.notebook.kernel !== null) {
            datacleaner_button();
        } else {
            console.warn(log_prefix + "Kernel not available?");
        }
    });

    data_cleaner(cfg, st);

    //CREATE DATACLEANER PYTHON OBJECT
    // Built from explicit lines so the Python indentation does not depend on how this
    // JavaScript file is indented. Creates _datacleaner only when it is not defined yet.
    var create_cmd = [
        'try:',
        '    _datacleaner',
        'except NameError:',
        '    from dataclean.manager import DataCleaner as _DataCleaner',
        '    _datacleaner = _DataCleaner()'
    ].join('\n');
    Jupyter.notebook.kernel.execute(create_cmd);

    // drop handlers left by a previous run before registering them again
    events.off('execute.CodeCell', varRefresh);
    events.on('execute.CodeCell', varRefresh);
    events.off('varRefresh', varRefresh);
    events.on('varRefresh', varRefresh);
};
349 |
350 |
// Build the floating Data Cleaner window, append it to <body>, make it draggable and
// resizable, and restore its position / collapsed state from the notebook metadata.
// `cfg` is used to remember the window height while it is collapsed (cfg.oldHeight);
// `st` is not used in this function.
// NOTE(review): several jQuery constructors below are called with empty strings
// ($('') / $("")) — the markup appears to have been lost from this copy of the file;
// confirm against the upstream source.
var create_datacleaner_div = function(cfg, st) {
// Store the wrapper's current CSS box (left/top/width/height/right) in the notebook metadata.
function save_position(){
Jupyter.notebook.metadata.datacleaner.position = {
'left': $('#datacleaner-wrapper').css('left'),
'top': $('#datacleaner-wrapper').css('top'),
'width': $('#datacleaner-wrapper').css('width'),
'height': $('#datacleaner-wrapper').css('height'),
'right': $('#datacleaner-wrapper').css('right')
};
}
// Window structure: a header (title, close / collapse / reload buttons, two spacers)
// followed by the #datacleaner content element.
var datacleaner_wrapper = $('')
.append(
$('')
.addClass("header")
.text("Data Cleaner ")
.append(
// close button: remembers the position, then toggles the whole window
$("")
.attr("href", "#")
.text("[x]")
.addClass("kill-btn")
.attr('title', 'Close window')
.click(function() {
save_position();
toggledatacleaner();
return false;
})
)
.append(
// collapse button: slides the content in/out and swaps its own label/title
$("")
.attr("href", "#")
.addClass("hide-btn")
.attr('title', 'Hide Data Cleaner')
.text("[-]")
.click(function() {
$('#datacleaner-wrapper').css('position', 'fixed');
$('#datacleaner').slideToggle({
// once the animation ends, persist the content's display state and the window position
'complete': function() {
Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] = $('#datacleaner').css('display');
save_position();
Jupyter.notebook.set_dirty();
}
});
$('#datacleaner-wrapper').toggleClass('closed');
if ($('#datacleaner-wrapper').hasClass('closed')) {
// collapsing: remember the current height so it can be restored on expand
cfg.oldHeight = $('#datacleaner-wrapper').height(); //.css('height');
$('#datacleaner-wrapper').css({ height: 40 });
$('#datacleaner-wrapper .hide-btn')
.text('[+]')
.attr('title', 'Show Data Cleaner');
} else {
// expanding: restore the remembered height for the wrapper and the content
$('#datacleaner-wrapper').height(cfg.oldHeight); //css({ height: cfg.oldHeight });
$('#datacleaner').height(cfg.oldHeight - $('#datacleaner-header').height() - 30 )
$('#datacleaner-wrapper .hide-btn')
.text('[-]')
.attr('title', 'Hide Data Cleaner');
}
return false;
})
).append(
// reload button: re-runs the refresh command in the kernel
$("")
.attr("href", "#")
.text(" \u21BB")
.addClass("reload-btn")
.attr('title', 'Reload Data Cleaner')
.click(function() {
varRefresh();
return false;
})
).append(
$("")
.html("  ")
).append(
$("")
.html(" ")
)
).append(
$("").attr("id", "datacleaner").addClass('datacleaner')
)

$("body").append(datacleaner_wrapper);
// Ensure position is fixed
$('#datacleaner-wrapper').css('position', 'fixed');

// enable dragging and save position on stop moving
$('#datacleaner-wrapper').draggable({
handle:'#datacleaner-header',
drag: function(event, ui) {}, //end of drag function
// re-applies the element's current width as an explicit width when dragging starts
start: function(event, ui) {
$(this).width($(this).width());
},
stop: function(event, ui) { // on save, store window position
// clamp the top offset so it is never negative
$(this).offset({top:Math.max($(this).offset().top,0)});
save_position();
Jupyter.notebook.set_dirty();
// Ensure position is fixed (again)
$('#datacleaner-wrapper').css('position', 'fixed');
},
});

// enable resizing; the content height follows the wrapper height minus the header
$('#datacleaner-wrapper').resizable({
resize: function(event, ui) {
$('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height());
},
start: function(event, ui) {
$(this).css('position', 'fixed');
},
stop: function(event, ui) {
save_position();
$('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height())
Jupyter.notebook.set_dirty();
}
})

// restore the saved position, if any
if (Jupyter.notebook.metadata.datacleaner !== undefined) {
if (Jupyter.notebook.metadata.datacleaner.position !== undefined) {
$('#datacleaner-wrapper').css(Jupyter.notebook.metadata.datacleaner.position);
}
}

// Ensure position is fixed
$('#datacleaner-wrapper').css('position', 'fixed');

// Restore window display
if (Jupyter.notebook.metadata.datacleaner !== undefined) {
if (Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] !== undefined) {
$('#datacleaner').css('display', Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'])
// content was saved as hidden: show the window in its collapsed form
if (Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] == 'none') {
$('#datacleaner-wrapper').addClass('closed');
$('#datacleaner-wrapper').css({ height: 40 });
$('#datacleaner-wrapper .hide-btn')
.text('[+]')
.attr('title', 'Show Data Cleaner');
}
}
// NOTE(review): the window is hidden whenever window_display is defined, regardless of
// its value — confirm this is intended.
if (Jupyter.notebook.metadata.datacleaner['window_display'] !== undefined) {
console.log(log_prefix + "Restoring Data Cleaner window");
$('#datacleaner-wrapper').css('display','none');
if ($('#datacleaner-wrapper').hasClass('closed')){
$('#datacleaner').height(cfg.oldHeight - $('#datacleaner-header').height())
}else{
$('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height()-30)
}

}
} else {
// no saved metadata at all: flip the wrapper's visibility
$('#datacleaner-wrapper').toggle();
}

if ($('#datacleaner-wrapper').css('display') == undefined) $('#datacleaner-wrapper').css('display', "none") //block

datacleaner_wrapper.addClass('datacleaner-float-wrapper');

}
504 |
// Make sure the Data Cleaner window exists, keep its maximum height within the
// browser window, and refresh the table contents.
// Called on initialisation and after every show/hide toggle, so the resize handler
// is bound under its own event namespace and replaced on each call instead of
// accumulating one more handler per call.
var data_cleaner = function(cfg, st) {
    if ($("#datacleaner-wrapper").length === 0) {
        create_datacleaner_div(cfg, st);
    }

    $(window)
        .off('resize.datacleaner')
        .on('resize.datacleaner', function() {
            $('#datacleaner').css({ maxHeight: $(window).height() - 30 });
            $('#datacleaner-wrapper').css({ maxHeight: $(window).height() - 10 });
        });

    // apply the limits once right away
    $(window).trigger('resize');
    varRefresh();
};
519 |
// Show or hide the Data Cleaner window. When the toggle finishes, the visibility is
// recorded in the notebook metadata, the notebook is marked dirty and the window
// contents are recomputed.
var toggle_datacleaner = function(cfg, st) {
    function on_toggle_complete() {
        var is_shown = $('#datacleaner-wrapper').css('display') == 'block';
        Jupyter.notebook.metadata.datacleaner['window_display'] = is_shown;
        Jupyter.notebook.set_dirty();
        // recompute:
        data_cleaner(cfg, st);
    }

    // toggle draw (first because of first-click behavior)
    $("#datacleaner-wrapper").toggle({
        'progress': function() {},
        'complete': on_toggle_complete
    });
};
532 |
533 |
// Extension entry point: load the stylesheet, add the toolbar button, initialise
// immediately when a kernel already exists, and initialise again whenever a kernel
// reports ready.
var load_jupyter_extension = function() {
    load_css();
    datacleaner_button();

    // If a kernel is available,
    var kernel = Jupyter.notebook.kernel;
    if (kernel !== undefined && kernel !== null) {
        datacleaner_init();
    }

    events.on("kernel_ready.Kernel", function(evt, data) {
        datacleaner_init();
    });
};
548 |
549 | return {
550 | load_ipython_extension: load_jupyter_extension,
551 | varRefresh: varRefresh
552 | };
553 |
554 | });
555 |
556 | /*
This code is based on jupyter-varInspector https://github.com/jfbercher/jupyter_varInspector
558 | Now part of https://github.com/ipython-contrib/jupyter_contrib_nbextensions
559 | which is licensed as follows:
560 |
561 | IPython-contrib is licensed under the terms of the Modified BSD License (also known as New or Revised or 3-Clause BSD), as follows:
562 |
563 | Copyright (c) 2013-2015, IPython-contrib Developers
564 |
565 | All rights reserved.
566 |
567 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
568 |
569 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
570 |
571 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
572 |
573 | Neither the name of the IPython-contrib Developers nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
574 |
575 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
576 |
577 | */
578 |
--------------------------------------------------------------------------------
/dataclean/static/jquery.tablesorter.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2007 Christian Bach.
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*/
9 |
10 | !function(e){"function"==typeof define&&define.amd?define(["jquery"],e):"object"==typeof module&&"object"==typeof module.exports?module.exports=e(require("jquery")):e(jQuery)}(function(e){return function(t){"use strict";var r=t.tablesorter={version:"2.29.0",parsers:[],widgets:[],defaults:{theme:"default",widthFixed:!1,showProcessing:!1,headerTemplate:"{content}",onRenderTemplate:null,onRenderHeader:null,cancelSelection:!0,tabIndex:!0,dateFormat:"mmddyyyy",sortMultiSortKey:"shiftKey",sortResetKey:"ctrlKey",usNumberFormat:!0,delayInit:!1,serverSideSorting:!1,resort:!0,headers:{},ignoreCase:!0,sortForce:null,sortList:[],sortAppend:null,sortStable:!1,sortInitialOrder:"asc",sortLocaleCompare:!1,sortReset:!1,sortRestart:!1,emptyTo:"bottom",stringTo:"max",duplicateSpan:!0,textExtraction:"basic",textAttribute:"data-text",textSorter:null,numberSorter:null,initWidgets:!0,widgetClass:"widget-{name}",widgets:[],widgetOptions:{zebra:["even","odd"]},initialized:null,tableClass:"",cssAsc:"",cssDesc:"",cssNone:"",cssHeader:"",cssHeaderRow:"",cssProcessing:"",cssChildRow:"tablesorter-childRow",cssInfoBlock:"tablesorter-infoOnly",cssNoSort:"tablesorter-noSort",cssIgnoreRow:"tablesorter-ignoreRow",cssIcon:"tablesorter-icon",cssIconNone:"",cssIconAsc:"",cssIconDesc:"",cssIconDisabled:"",pointerClick:"click",pointerDown:"mousedown",pointerUp:"mouseup",selectorHeaders:"> thead th, > thead td",selectorSort:"th, td",selectorRemove:".remove-me",debug:!1,headerList:[],empties:{},strings:{},parsers:[],globalize:0,imgAttr:0},css:{table:"tablesorter",cssHasChild:"tablesorter-hasChildRow",childRow:"tablesorter-childRow",colgroup:"tablesorter-colgroup",header:"tablesorter-header",headerRow:"tablesorter-headerRow",headerIn:"tablesorter-header-inner",icon:"tablesorter-icon",processing:"tablesorter-processing",sortAsc:"tablesorter-headerAsc",sortDesc:"tablesorter-headerDesc",sortNone:"tablesorter-headerUnSorted"},language:{sortAsc:"Ascending sort applied, ",sortDesc:"Descending sort applied, 
",sortNone:"No sort applied, ",sortDisabled:"sorting is disabled",nextAsc:"activate to apply an ascending sort",nextDesc:"activate to apply a descending sort",nextNone:"activate to remove the sort"},regex:{templateContent:/\{content\}/g,templateIcon:/\{icon\}/g,templateName:/\{name\}/i,spaces:/\s+/g,nonWord:/\W/g,formElements:/(input|select|button|textarea)/i,chunk:/(^([+\-]?(?:\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)?$|^0x[0-9a-f]+$|\d+)/gi,chunks:/(^\\0|\\0$)/,hex:/^0x[0-9a-f]+$/i,comma:/,/g,digitNonUS:/[\s|\.]/g,digitNegativeTest:/^\s*\([.\d]+\)/,digitNegativeReplace:/^\s*\(([.\d]+)\)/,digitTest:/^[\-+(]?\d+[)]?$/,digitReplace:/[,.'"\s]/g},string:{max:1,min:-1,emptymin:1,emptymax:-1,zero:0,none:0,"null":0,top:!0,bottom:!1},keyCodes:{enter:13},dates:{},instanceMethods:{},setup:function(e,o){if(e&&e.tHead&&0!==e.tBodies.length&&!0!==e.hasInitialized){var s="",a=t(e),n=t.metadata;e.hasInitialized=!1,e.isProcessing=!0,e.config=o,t.data(e,"tablesorter",o),o.debug&&(console[console.group?"group":"log"]("Initializing tablesorter v"+r.version),t.data(e,"startoveralltimer",new Date)),o.supportsDataObject=function(e){return e[0]=parseInt(e[0],10),e[0]>1||1===e[0]&&parseInt(e[1],10)>=4}(t.fn.jquery.split(".")),o.emptyTo=o.emptyTo.toLowerCase(),o.stringTo=o.stringTo.toLowerCase(),o.last={sortList:[],clickedIndex:-1},/tablesorter\-/.test(a.attr("class"))||(s=""!==o.theme?" 
tablesorter-"+o.theme:""),o.namespace?o.namespace="."+o.namespace.replace(r.regex.nonWord,""):o.namespace=".tablesorter"+Math.random().toString(16).slice(2),o.table=e,o.$table=a.addClass(r.css.table+" "+o.tableClass+s+" "+o.namespace.slice(1)).attr("role","grid"),o.$headers=a.find(o.selectorHeaders),o.$table.children().children("tr").attr("role","row"),o.$tbodies=a.children("tbody:not(."+o.cssInfoBlock+")").attr({"aria-live":"polite","aria-relevant":"all"}),o.$table.children("caption").length&&((s=o.$table.children("caption")[0]).id||(s.id=o.namespace.slice(1)+"caption"),o.$table.attr("aria-labelledby",s.id)),o.widgetInit={},o.textExtraction=o.$table.attr("data-text-extraction")||o.textExtraction||"basic",r.buildHeaders(o),r.fixColumnWidth(e),r.addWidgetFromClass(e),r.applyWidgetOptions(e),r.setupParsers(o),o.totalRows=0,r.validateOptions(o),o.delayInit||r.buildCache(o),r.bindEvents(e,o.$headers,!0),r.bindMethods(o),o.supportsDataObject&&void 0!==a.data().sortlist?o.sortList=a.data().sortlist:n&&a.metadata()&&a.metadata().sortlist&&(o.sortList=a.metadata().sortlist),r.applyWidget(e,!0),o.sortList.length>0?r.sortOn(o,o.sortList,{},!o.initWidgets):(r.setHeadersCss(o),o.initWidgets&&r.applyWidget(e,!1)),o.showProcessing&&a.unbind("sortBegin"+o.namespace+" sortEnd"+o.namespace).bind("sortBegin"+o.namespace+" sortEnd"+o.namespace,function(t){clearTimeout(o.timerProcessing),r.isProcessing(e),"sortBegin"===t.type&&(o.timerProcessing=setTimeout(function(){r.isProcessing(e,!0)},500))}),e.hasInitialized=!0,e.isProcessing=!1,o.debug&&(console.log("Overall initialization time:"+r.benchmark(t.data(e,"startoveralltimer"))),o.debug&&console.groupEnd&&console.groupEnd()),a.triggerHandler("tablesorter-initialized",e),"function"==typeof o.initialized&&o.initialized(e)}else o.debug&&(e.hasInitialized?console.warn("Stopping initialization. Tablesorter has already been initialized"):console.error("Stopping initialization! 
No table, thead or tbody",e))},bindMethods:function(e){var o=e.$table,s=e.namespace,a="sortReset update updateRows updateAll updateHeaders addRows updateCell updateComplete sorton appendCache updateCache applyWidgetId applyWidgets refreshWidgets destroy mouseup mouseleave ".split(" ").join(s+" ");o.unbind(a.replace(r.regex.spaces," ")).bind("sortReset"+s,function(e,t){e.stopPropagation(),r.sortReset(this.config,function(e){e.isApplyingWidgets?setTimeout(function(){r.applyWidget(e,"",t)},100):r.applyWidget(e,"",t)})}).bind("updateAll"+s,function(e,t,o){e.stopPropagation(),r.updateAll(this.config,t,o)}).bind("update"+s+" updateRows"+s,function(e,t,o){e.stopPropagation(),r.update(this.config,t,o)}).bind("updateHeaders"+s,function(e,t){e.stopPropagation(),r.updateHeaders(this.config,t)}).bind("updateCell"+s,function(e,t,o,s){e.stopPropagation(),r.updateCell(this.config,t,o,s)}).bind("addRows"+s,function(e,t,o,s){e.stopPropagation(),r.addRows(this.config,t,o,s)}).bind("updateComplete"+s,function(){this.isUpdating=!1}).bind("sorton"+s,function(e,t,o,s){e.stopPropagation(),r.sortOn(this.config,t,o,s)}).bind("appendCache"+s,function(e,o,s){e.stopPropagation(),r.appendCache(this.config,s),t.isFunction(o)&&o(this)}).bind("updateCache"+s,function(e,t,o){e.stopPropagation(),r.updateCache(this.config,t,o)}).bind("applyWidgetId"+s,function(e,t){e.stopPropagation(),r.applyWidgetId(this,t)}).bind("applyWidgets"+s,function(e,t){e.stopPropagation(),r.applyWidget(this,!1,t)}).bind("refreshWidgets"+s,function(e,t,o){e.stopPropagation(),r.refreshWidgets(this,t,o)}).bind("removeWidget"+s,function(e,t,o){e.stopPropagation(),r.removeWidget(this,t,o)}).bind("destroy"+s,function(e,t,o){e.stopPropagation(),r.destroy(this,t,o)}).bind("resetToLoadState"+s,function(o){o.stopPropagation(),r.removeWidget(this,!0,!1);var s=t.extend(!0,{},e.originalSettings);(e=t.extend(!0,{},r.defaults,s)).originalSettings=s,this.hasInitialized=!1,r.setup(this,e)})},bindEvents:function(e,o,s){var 
a,n=(e=t(e)[0]).config,i=n.namespace,l=null;!0!==s&&(o.addClass(i.slice(1)+"_extra_headers"),(a=r.getClosest(o,"table")).length&&"TABLE"===a[0].nodeName&&a[0]!==e&&t(a[0]).addClass(i.slice(1)+"_extra_table")),a=(n.pointerDown+" "+n.pointerUp+" "+n.pointerClick+" sort keyup ").replace(r.regex.spaces," ").split(" ").join(i+" "),o.find(n.selectorSort).add(o.filter(n.selectorSort)).unbind(a).bind(a,function(e,s){var a,i,d,c=t(e.target),g=" "+e.type+" ";if(!(1!==(e.which||e.button)&&!g.match(" "+n.pointerClick+" | sort | keyup ")||" keyup "===g&&e.which!==r.keyCodes.enter||g.match(" "+n.pointerClick+" ")&&void 0!==e.which||g.match(" "+n.pointerUp+" ")&&l!==e.target&&!0!==s)){if(g.match(" "+n.pointerDown+" "))return l=e.target,void("1"===(d=c.jquery.split("."))[0]&&d[1]<4&&e.preventDefault());if(l=null,r.regex.formElements.test(e.target.nodeName)||c.hasClass(n.cssNoSort)||c.parents("."+n.cssNoSort).length>0||c.parents("button").length>0)return!n.cancelSelection;n.delayInit&&r.isEmptyObject(n.cache)&&r.buildCache(n),a=r.getHeaderCell(t(this)),d=o.index(a),n.last.clickedIndex=d<0?a.attr("data-column"):d,(i=n.$headers[n.last.clickedIndex])&&!i.sortDisabled&&r.initSort(n,i,e)}}),n.cancelSelection&&o.attr("unselectable","on").bind("selectstart",!1).css({"user-select":"none",MozUserSelect:"none"})},buildHeaders:function(e){var o,s,a,n;for(e.headerList=[],e.headerContent=[],e.sortVars=[],e.debug&&(a=new Date),e.columns=r.computeColumnIndex(e.$table.children("thead, tfoot").children("tr")),s=e.cssIcon?'':"",e.$headers=t(t.map(e.$table.find(e.selectorHeaders),function(o,a){var n,i,l,d,c,g,p=t(o);if(!p.parent().hasClass(e.cssIgnoreRow))return n=r.getColumnData(e.table,e.headers,a,!0),e.headerContent[a]=p.html(),""===e.headerTemplate||p.find("."+r.css.headerIn).length||(d=e.headerTemplate.replace(r.regex.templateContent,p.html()).replace(r.regex.templateIcon,p.find("."+r.css.icon).length?"":s),e.onRenderTemplate&&(i=e.onRenderTemplate.apply(p,[a,d]))&&"string"==typeof 
i&&(d=i),p.html('")),e.onRenderHeader&&e.onRenderHeader.apply(p,[a,e,e.$table]),g=r.getHeaderCell(p),l=parseInt(g.attr("data-column"),10),o.column=l,c=r.getOrder(r.getData(p,n,"sortInitialOrder")||e.sortInitialOrder),e.sortVars[l]={count:-1,order:c?e.sortReset?[1,0,2]:[1,0]:e.sortReset?[0,1,2]:[0,1],lockedOrder:!1},void 0!==(c=r.getData(p,n,"lockedOrder")||!1)&&!1!==c&&(e.sortVars[l].lockedOrder=!0,e.sortVars[l].order=r.getOrder(c)?[1,1]:[0,0]),e.headerList[a]=o,p.addClass(r.css.header+" "+e.cssHeader),r.getClosest(p,"tr").addClass(r.css.headerRow+" "+e.cssHeaderRow).attr("role","row"),e.tabIndex&&p.attr("tabindex",0),o})),e.$headerIndexed=[],n=0;n0))for(i+=a,n+=a;a+1>0;)s.parsers[i-a]=p,s.extractors[i-a]=u,a--;i++}y+=s.parsers.length?m:1}e.debug&&(r.isEmptyObject(w)?console.warn(" No parsers detected!"):console[console.table?"table":"log"](w),console.log("Completed detecting parsers"+r.benchmark(f)),console.groupEnd&&console.groupEnd()),e.parsers=s.parsers,e.extractors=s.extractors},addParser:function(e){var t,o=r.parsers.length,s=!0;for(t=0;t=0;)if((n=r.parsers[d])&&"text"!==n.id&&n.is&&n.is(g,e.table,c,i))return n;return r.getParserById("text")},getElementText:function(e,o,s){if(!o)return"";var a,n=e.textExtraction||"",i=o.jquery?o:t(o);return"string"==typeof n?"basic"===n&&void 0!==(a=i.attr(e.textAttribute))?t.trim(a):t.trim(o.textContent||i.text()):"function"==typeof n?t.trim(n(i[0],e.table,s)):"function"==typeof(a=r.getColumnData(e.table,n,s))?t.trim(a(i[0],e.table,s)):t.trim(i[0].textContent||i.text())},getParsedText:function(e,t,o,s){void 0===s&&(s=r.getElementText(e,t,o));var a=""+s,n=e.parsers[o],i=e.extractors[o];return n&&(i&&"function"==typeof i.format&&(s=i.format(s,e.table,t,o)),a="no-parser"===n.id?"":n.format(""+s,e.table,t,o),e.ignoreCase&&"string"==typeof a&&(a=a.toLowerCase())),a},buildCache:function(e,o,s){var 
a,n,i,l,d,c,g,p,u,f,h,m,b,y,w,x,v,C,$,I,D,R,T=e.table,L=e.parsers;if(e.$tbodies=e.$table.children("tbody:not(."+e.cssInfoBlock+")"),g=void 0===s?e.$tbodies:s,e.cache={},e.totalRows=0,!L)return e.debug?console.warn("Warning: *Empty table!* Not building a cache"):"";for(e.debug&&(m=new Date),e.showProcessing&&r.isProcessing(T,!0),c=0;c0&&(C+=v,I+=v)),C++;else{for(y.$row=p,y.order=l,C=0,I=e.columns,d=0;d0)){for(R=0;R<=v;)i=e.duplicateSpan||0===R?n:"string"!=typeof e.textExtraction?r.getElementText(e,h,C+R)||"":"",y.raw[C+R]=i,u[C+R]=i,R++;C+=v,I+=v}C++}u[e.columns]=y,a.normalized[a.normalized.length]=u}a.colMax=x,e.totalRows+=a.normalized.length}if(e.showProcessing&&r.isProcessing(T),e.debug){for(D=Math.min(5,e.cache[0].normalized.length),console[console.group?"group":"log"]("Building cache for "+e.totalRows+" rows (showing "+D+" rows in log) and "+e.columns+" columns"+r.benchmark(m)),n={},d=0;d-1);return o}),(p=p.not(".sorter-false").filter('[data-column="'+a[o][0]+'"]'+(1===n?":last":""))).length)){for(s=0;s=0?a:n[1]%g.length}},updateAll:function(e,t,o){var s=e.table;s.isUpdating=!0,r.refreshWidgets(s,!0,!0),r.buildHeaders(e),r.bindEvents(s,e.$headers,!0),r.bindMethods(e),r.commonUpdate(e,t,o)},update:function(e,t,o){e.table.isUpdating=!0,r.updateHeader(e),r.commonUpdate(e,t,o)},updateHeaders:function(e,t){e.table.isUpdating=!0,r.buildHeaders(e),r.bindEvents(e.table,e.$headers,!0),r.resortComplete(e,t)},updateCell:function(e,o,s,a){if(t(o).closest("tr").hasClass(e.cssChildRow))console.warn('Tablesorter Warning! 
"updateCell" for child row content has been disabled, use "update" instead');else{if(r.isEmptyObject(e.cache))return r.updateHeader(e),void r.commonUpdate(e,s,a);e.table.isUpdating=!0,e.$table.find(e.selectorRemove).remove();var n,i,l,d,c,g,p=e.$tbodies,u=t(o),f=p.index(r.getClosest(u,"tbody")),h=e.cache[f],m=r.getClosest(u,"tr");if(o=u[0],p.length&&f>=0){if(l=p.eq(f).find("tr").not("."+e.cssChildRow).index(m),c=h.normalized[l],(g=m[0].cells.length)!==e.columns)for(d=0,n=!1,i=0;i0&&(h+=w),h++;b[o.columns]=m,o.cache[d].normalized[f]=b}r.checkResort(o,a,n)}},updateCache:function(e,t,o){e.parsers&&e.parsers.length||r.setupParsers(e,o),r.buildCache(e,t,o)},appendCache:function(e,t){var o,s,a,n,i,l,d,c=e.table,g=e.widgetOptions,p=e.$tbodies,u=[],f=e.cache;if(r.isEmptyObject(f))return e.appender?e.appender(c,u):c.isUpdating?e.$table.triggerHandler("updateComplete",c):"";for(e.debug&&(d=new Date),l=0;l1))for(n=1;n=0)for(n=0;n1))for(n=1;ns)return 1}for(o=(e||"").replace(d.chunk,"\\0$1\\0").replace(d.chunks,"").split("\\0"),s=(t||"").replace(d.chunk,"\\0$1\\0").replace(d.chunks,"").split("\\0"),l=Math.max(o.length,s.length),i=0;in)return 1}return 0},sortNaturalAsc:function(e,t,o,s){if(e===t)return 0;var a=r.string[s.empties[o]||s.emptyTo];return""===e&&0!==a?"boolean"==typeof a?a?-1:1:-a||-1:""===t&&0!==a?"boolean"==typeof a?a?1:-1:a||1:r.sortNatural(e,t)},sortNaturalDesc:function(e,t,o,s){if(e===t)return 0;var a=r.string[s.empties[o]||s.emptyTo];return""===e&&0!==a?"boolean"==typeof a?a?-1:1:a||1:""===t&&0!==a?"boolean"==typeof a?a?1:-1:-a||-1:r.sortNatural(t,e)},sortText:function(e,t){return e>t?1:e=0&&!0!==s&&d.widgets.splice(i,1),n&&n.remove&&(d.debug&&console.log((s?"Refreshing":"Removing")+' "'+o[a]+'" widget'),n.remove(e,d,d.widgetOptions,s),d.widgetInit[o[a]]=!1);d.$table.triggerHandler("widgetRemoveEnd",e)},refreshWidgets:function(e,o,s){var 
a,n,i=(e=t(e)[0]).config.widgets,l=r.widgets,d=l.length,c=[],g=function(e){t(e).triggerHandler("refreshComplete")};for(a=0;a'),o=l.$table.width(),n=(a=l.$tbodies.find("tr:first").children(":visible")).length,i=0;i").css("width",s));l.$table.prepend(d)}},getData:function(e,r,o){var s,a,n="",i=t(e);return i.length?(s=!!t.metadata&&i.metadata(),a=" "+(i.attr("class")||""),void 0!==i.data(o)||void 0!==i.data(o.toLowerCase())?n+=i.data(o)||i.data(o.toLowerCase()):s&&void 0!==s[o]?n+=s[o]:r&&void 0!==r[o]?n+=r[o]:" "!==a&&a.match(" "+o+"-")&&(n=a.match(new RegExp("\\s"+o+"-([\\w-]+)"))[1]||""),t.trim(n)):""},getColumnData:function(e,r,o,s,a){if("object"!=typeof r||null===r)return r;var n,i=(e=t(e)[0]).config,l=a||i.$headers,d=i.$headerIndexed&&i.$headerIndexed[o]||l.filter('[data-column="'+o+'"]:last');if(void 0!==r[o])return s?r[o]:r[l.index(d)];for(n in r)if("string"==typeof n&&d.filter(n).add(d.find(n)).length)return r[n]},isProcessing:function(e,o,s){var a=(e=t(e))[0].config,n=s||e.find("."+r.css.header);o?(void 0!==s&&a.sortList.length>0&&(n=n.filter(function(){return!this.sortDisabled&&r.isValueInArray(parseFloat(t(this).attr("data-column")),a.sortList)>=0})),e.add(n).addClass(r.css.processing+" "+a.cssProcessing)):e.add(n).removeClass(r.css.processing+" "+a.cssProcessing)},processTbody:function(e,r,o){if(e=t(e)[0],o)return e.isProcessing=!0,r.before(''),t.fn.detach?r.detach():r.remove();var s=t(e).find("colgroup.tablesorter-savemyplace");r.insertAfter(s),s.remove(),e.isProcessing=!1},clearTableBody:function(e){t(e)[0].config.$tbodies.children().detach()},characterEquivalents:{a:"áàâãäąå",A:"ÁÀÂÃÄĄÅ",c:"çćč",C:"ÇĆČ",e:"éèêëěę",E:"ÉÈÊËĚĘ",i:"íìİîïı",I:"ÍÌİÎÏ",o:"óòôõöō",O:"ÓÒÔÕÖŌ",ss:"ß",SS:"ẞ",u:"úùûüů",U:"ÚÙÛÜŮ"},replaceAccents:function(e){var t,o="[",s=r.characterEquivalents;if(!r.characterRegex){r.characterRegexArray={};for(t in s)"string"==typeof t&&(o+=s[t],r.characterRegexArray[t]=new RegExp("["+s[t]+"]","g"));r.characterRegex=new 
RegExp(o+"]")}if(r.characterRegex.test(e))for(t in s)"string"==typeof t&&(e=e.replace(r.characterRegexArray[t],t));return e},validateOptions:function(e){var o,s,a,n,i="headers sortForce sortList sortAppend widgets".split(" "),l=e.originalSettings;if(l){e.debug&&(n=new Date);for(o in l)if("undefined"===(a=typeof r.defaults[o]))console.warn('Tablesorter Warning! "table.config.'+o+'" option not recognized');else if("object"===a)for(s in l[o])a=r.defaults[o]&&typeof r.defaults[o][s],t.inArray(o,i)<0&&"undefined"===a&&console.warn('Tablesorter Warning! "table.config.'+o+"."+s+'" option not recognized');e.debug&&console.log("validate options time:"+r.benchmark(n))}},restoreHeaders:function(e){var o,s,a=t(e)[0].config,n=a.$table.find(a.selectorHeaders),i=n.length;for(o=0;o tr").children("th, td");!1===o&&t.inArray("uitheme",i.widgets)>=0&&(n.triggerHandler("applyWidgetId",["uitheme"]),n.triggerHandler("applyWidgetId",["zebra"])),d.find("tr").not(c).remove(),a="sortReset update updateRows updateAll updateHeaders updateCell addRows updateComplete sorton appendCache updateCache applyWidgetId applyWidgets refreshWidgets removeWidget destroy mouseup mouseleave "+"keypress sortBegin sortEnd resetToLoadState ".split(" ").join(i.namespace+" "),n.removeData("tablesorter").unbind(a.replace(r.regex.spaces," ")),i.$headers.add(g).removeClass([r.css.header,i.cssHeader,i.cssAsc,i.cssDesc,r.css.sortAsc,r.css.sortDesc,r.css.sortNone].join(" ")).removeAttr("data-column").removeAttr("aria-label").attr("aria-disabled","true"),c.find(i.selectorSort).unbind("mousedown mouseup keypress ".split(" ").join(i.namespace+" ").replace(r.regex.spaces," ")),r.restoreHeaders(e),n.toggleClass(r.css.table+" "+i.tableClass+" tablesorter-"+i.theme,!1===o),n.removeClass(i.namespace.slice(1)),e.hasInitialized=!1,delete e.config.cache,"function"==typeof s&&s(e),l&&console.log("tablesorter has been removed")}}};t.fn.tablesorter=function(e){return this.each(function(){var 
o=this,s=t.extend(!0,{},r.defaults,e,r.instanceMethods);s.originalSettings=e,!o.hasInitialized&&r.buildTable&&"TABLE"!==this.nodeName?r.buildTable(o,s):r.setup(o,s)})},window.console&&window.console.log||(r.logs=[],console={},console.log=console.warn=console.error=console.table=function(){var e=arguments.length>1?arguments:arguments[0];r.logs[r.logs.length]={date:Date.now(),log:e}}),r.addParser({id:"no-parser",is:function(){return!1},format:function(){return""},type:"text"}),r.addParser({id:"text",is:function(){return!0},format:function(e,o){var s=o.config;return e&&(e=t.trim(s.ignoreCase?e.toLocaleLowerCase():e),e=s.sortLocaleCompare?r.replaceAccents(e):e),e},type:"text"}),r.regex.nondigit=/[^\w,. \-()]/g,r.addParser({id:"digit",is:function(e){return r.isDigit(e)},format:function(e,o){var s=r.formatFloat((e||"").replace(r.regex.nondigit,""),o);return e&&"number"==typeof s?s:e?t.trim(e&&o.config.ignoreCase?e.toLocaleLowerCase():e):e},type:"numeric"}),r.regex.currencyReplace=/[+\-,. ]/g,r.regex.currencyTest=/^\(?\d+[\u00a3$\u20ac\u00a4\u00a5\u00a2?.]|[\u00a3$\u20ac\u00a4\u00a5\u00a2?.]\d+\)?$/,r.addParser({id:"currency",is:function(e){return e=(e||"").replace(r.regex.currencyReplace,""),r.regex.currencyTest.test(e)},format:function(e,o){var s=r.formatFloat((e||"").replace(r.regex.nondigit,""),o);return e&&"number"==typeof s?s:e?t.trim(e&&o.config.ignoreCase?e.toLocaleLowerCase():e):e},type:"numeric"}),r.regex.urlProtocolTest=/^(https?|ftp|file):\/\//,r.regex.urlProtocolReplace=/(https?|ftp|file):\/\/(www\.)?/,r.addParser({id:"url",is:function(e){return r.regex.urlProtocolTest.test(e)},format:function(e){return e?t.trim(e.replace(r.regex.urlProtocolReplace,"")):e},type:"text"}),r.regex.dash=/-/g,r.regex.isoDate=/^\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}/,r.addParser({id:"isoDate",is:function(e){return r.regex.isoDate.test(e)},format:function(e,t){var o=e?new Date(e.replace(r.regex.dash,"/")):e;return o instanceof 
Date&&isFinite(o)?o.getTime():e},type:"numeric"}),r.regex.percent=/%/g,r.regex.percentTest=/(\d\s*?%|%\s*?\d)/,r.addParser({id:"percent",is:function(e){return r.regex.percentTest.test(e)&&e.length<15},format:function(e,t){return e?r.formatFloat(e.replace(r.regex.percent,""),t):e},type:"numeric"}),r.addParser({id:"image",is:function(e,t,r,o){return o.find("img").length>0},format:function(e,r,o){return t(o).find("img").attr(r.config.imgAttr||"alt")||e},parsed:!0,type:"text"}),r.regex.dateReplace=/(\S)([AP]M)$/i,r.regex.usLongDateTest1=/^[A-Z]{3,10}\.?\s+\d{1,2},?\s+(\d{4})(\s+\d{1,2}:\d{2}(:\d{2})?(\s+[AP]M)?)?$/i,r.regex.usLongDateTest2=/^\d{1,2}\s+[A-Z]{3,10}\s+\d{4}/i,r.addParser({id:"usLongDate",is:function(e){return r.regex.usLongDateTest1.test(e)||r.regex.usLongDateTest2.test(e)},format:function(e,t){var o=e?new Date(e.replace(r.regex.dateReplace,"$1 $2")):e;return o instanceof Date&&isFinite(o)?o.getTime():e},type:"numeric"}),r.regex.shortDateTest=/(^\d{1,2}[\/\s]\d{1,2}[\/\s]\d{4})|(^\d{4}[\/\s]\d{1,2}[\/\s]\d{1,2})/,r.regex.shortDateReplace=/[\-.,]/g,r.regex.shortDateXXY=/(\d{1,2})[\/\s](\d{1,2})[\/\s](\d{4})/,r.regex.shortDateYMD=/(\d{4})[\/\s](\d{1,2})[\/\s](\d{1,2})/,r.convertFormat=function(e,t){e=(e||"").replace(r.regex.spaces," ").replace(r.regex.shortDateReplace,"/"),"mmddyyyy"===t?e=e.replace(r.regex.shortDateXXY,"$3/$1/$2"):"ddmmyyyy"===t?e=e.replace(r.regex.shortDateXXY,"$3/$2/$1"):"yyyymmdd"===t&&(e=e.replace(r.regex.shortDateYMD,"$1/$2/$3"));var o=new Date(e);return o instanceof Date&&isFinite(o)?o.getTime():""},r.addParser({id:"shortDate",is:function(e){return e=(e||"").replace(r.regex.spaces," ").replace(r.regex.shortDateReplace,"/"),r.regex.shortDateTest.test(e)},format:function(e,t,o,s){if(e){var a=t.config,n=a.$headerIndexed[s],i=n.length&&n.data("dateFormat")||r.getData(n,r.getColumnData(t,a.headers,s),"dateFormat")||a.dateFormat;return n.length&&n.data("dateFormat",i),r.convertFormat(e,i)||e}return 
e},type:"numeric"}),r.regex.timeTest=/^(0?[1-9]|1[0-2]):([0-5]\d)(\s[AP]M)$|^((?:[01]\d|[2][0-4]):[0-5]\d)$/i,r.regex.timeMatch=/(0?[1-9]|1[0-2]):([0-5]\d)(\s[AP]M)|((?:[01]\d|[2][0-4]):[0-5]\d)/i,r.addParser({id:"time",is:function(e){return r.regex.timeTest.test(e)},format:function(e,t){var o,s=(e||"").match(r.regex.timeMatch),a=new Date(e),n=e&&(null!==s?s[0]:"00:00 AM"),i=n?new Date("2000/01/01 "+n.replace(r.regex.dateReplace,"$1 $2")):n;return i instanceof Date&&isFinite(i)?(o=a instanceof Date&&isFinite(a)?a.getTime():0,o?parseFloat(i.getTime()+"."+a.getTime()):i.getTime()):e},type:"numeric"}),r.addParser({id:"metadata",is:function(){return!1},format:function(e,r,o){var s=r.config,a=s.parserMetadataName?s.parserMetadataName:"sortValue";return t(o).metadata()[a]},type:"numeric"}),r.addWidget({id:"zebra",priority:90,format:function(e,r,o){var s,a,n,i,l,d,c,g=new RegExp(r.cssChildRow,"i"),p=r.$tbodies.add(t(r.namespace+"_extra_table").children("tbody:not(."+r.cssInfoBlock+")"));for(l=0;l 0
57 | else 0
58 | )
59 |
60 | if fraction_categorical >= categorical_threshold:
61 | categorical_type = CategoricalTypes.CATEGORICAL
62 | else:
63 | categorical_type = CategoricalTypes.CONTINUOUS
64 |
65 | return categorical_type
66 |
67 |
class CallbackManager(object):
    """Small observer registry for passing events between classes.

    Callables are kept in registration order, and every one of them is
    invoked, in that order, each time ``send_callbacks`` is called.
    """

    def __init__(self):
        # Registered callables, oldest first.
        self.callbacks = []

    def register_callback(self, callback):
        """Append ``callback`` to the list of registered callables."""
        self.callbacks.append(callback)

    def send_callbacks(self, *args, **kwargs):
        """Call each registered callable with ``*args`` and ``**kwargs``.

        Return values of the callables are discarded.
        """
        for handler in self.callbacks:
            handler(*args, **kwargs)
80 |
81 |
class StepWidgetControllerBase(object):
    """Shared scaffolding for the widget controls that build a cleaning step.

    The base class owns the two callback managers, the "Add to Pipeline"
    submit button and the bookkeeping of the loaded column. Subclasses
    supply ``update_step`` and ``render_widget``.
    """

    # Python 2 spelling of a metaclass declaration.
    # NOTE(review): on Python 3 this is only a class attribute, so the
    # abstract methods below are not enforced there - confirm intent.
    __metaclass__ = ABCMeta

    def __init__(self):
        # Fired (without arguments) when the submit button is clicked.
        self.submit_step_callback = CallbackManager()
        # Fired by ``update_step`` when the controls change.
        self.update_step_callback = CallbackManager()

        # this should be placed into the ALLOWED_TRANSFORMATIONS dict
        # for controls that go into the column widgets
        self.transform_type = "A unique string or an enum class"
        self.tab_title = "A title for the tab widget page"

    def load_data(self, column, numerical_data):
        """Remember the column, its name and its numerical data."""
        self.numerical_data = numerical_data
        self.colname = column.name
        self.column = column

    def create_widgets(self):
        """Create your control widgets"""

        def _on_submit(_button):
            # The clicked button itself is not forwarded to the listeners.
            self.submit_step_callback.send_callbacks()

        self.submit_button = ipywidgets.Button(description="Add to Pipeline")
        self.submit_button.on_click(_on_submit)

    def reset_controls(self):
        """Reset the controls to their base state"""
        self.submit_button.description = "Add to Pipeline"

    @abstractmethod
    def update_step(self):
        """Create a pipeline step as the controls are changed"""
        self.update_step_callback.send_callbacks()

    def _update_step(self, _):
        """Observer-compatible wrapper: ignores the change payload."""
        return self.update_step()

    @abstractmethod
    def render_widget(self, step=None):
        """Return the overall parent widget for your controls in the state
        required to display the input step"""
        if not step:
            return
        # An existing step is being edited, so submitting replaces it.
        self.submit_button.description = "Replace Current Step"
127 |
128 |
class NullReplaceWidgetController(StepWidgetControllerBase):
    """Controls for building a step that replaces missing values.

    Shows how many values of the loaded column are null and lets the user
    pick one of the allowed ``NullRemovalMethod`` replacements.
    """

    def __init__(self):
        super(NullReplaceWidgetController, self).__init__()
        self.transform_type = NullRemovalMethod
        self.tab_title = "Nulls"

    def create_widgets(self):
        """Build the label, progress bar, method dropdown and their box."""
        super(NullReplaceWidgetController, self).create_widgets()

        control_width = "400px"

        self.null_percent_bar = ipywidgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description="Missing:",
            disabled=False,
            continuous_update=False,
            readout=True,
            readout_format=".2g",
            layout=ipywidgets.Layout(width=control_width),
            bar_style="warning",
        )

        self.null_replace_selector = ipywidgets.Dropdown(
            options=[],
            description="Replacement Method: ",
            layout=ipywidgets.Layout(width=control_width),
            style={"description_width": "initial"},
        )
        self.null_replace_selector.observe(self._update_step, names="value")

        self.null_text = ipywidgets.Label()

        stacked_controls = [
            self.null_text,
            self.null_percent_bar,
            self.null_replace_selector,
            self.submit_button,
        ]
        self.null_removal_controls = ipywidgets.VBox(
            stacked_controls, layout=ipywidgets.Layout(width="100%")
        )
        self.null_removal_controls.layout.align_items = "center"

    def reset_controls(self, categorical_type):
        """Refresh the null statistics and the list of allowed methods.

        ``categorical_type`` is the key used to look up the permitted
        transformations in ``ALLOWED_TRANSFORMATIONS``.
        """
        super(NullReplaceWidgetController, self).reset_controls()

        selector = self.null_replace_selector
        # Detach the observer so repopulating the dropdown does not emit
        # intermediate steps; it is re-attached at the end.
        selector.unobserve(self._update_step, names="value")

        total_rows = len(self.column)
        null_rows = self.column.isnull().sum()
        null_percent = (100 * null_rows / total_rows) if total_rows > 0 else 0

        self.null_text.value = "{0} of {1} ({2:.0f}%) selected".format(
            null_rows, total_rows, null_percent
        )

        self.null_percent_bar.bar_style = "warning"
        self.null_percent_bar.value = null_percent

        # Keep only the transformations that belong to this controller.
        method_cls = self.transform_type
        allowed_transforms = {}
        for method in ALLOWED_TRANSFORMATIONS[categorical_type]:
            if type(method) is method_cls:
                allowed_transforms[method.value] = method

        selector.options = allowed_transforms

        if allowed_transforms:
            selector.value = self.transform_type.NONE

        self.submit_button.disabled = True
        selector.observe(self._update_step, names="value")

    def update_step(self):
        """Build a ``NullRemovalStep`` from the dropdown and broadcast it."""
        chosen_method = self.null_replace_selector.value

        if chosen_method == self.transform_type.NONE:
            # Nothing to submit while no replacement is chosen.
            self.null_percent_bar.bar_style = "warning"
            self.submit_button.disabled = True
        else:
            self.null_percent_bar.bar_style = "success"
            self.submit_button.disabled = False

        self.update_step_callback.send_callbacks(
            NullRemovalStep(
                replacement_method=chosen_method, colname=self.colname
            )
        )

    def render_widget(self, step=None):
        """Return the control box, preloaded with ``step`` when given."""
        super(NullReplaceWidgetController, self).render_widget(step)
        if step:
            self.null_replace_selector.value = step.replacement_method
        return self.null_removal_controls
229 |
230 |
class OutlierReplaceWidgetController(StepWidgetControllerBase):
    """Controls for building a step that replaces out-of-range values.

    A range slider selects the accepted interval and a dropdown selects
    one of the allowed ``OutlierRemovalMethod`` replacements.
    """

    def __init__(self):
        super(OutlierReplaceWidgetController, self).__init__()
        self.transform_type = OutlierRemovalMethod
        self.tab_title = "Outliers"

    def create_widgets(self):
        """Build the range slider, method dropdown, label and their box."""
        super(OutlierReplaceWidgetController, self).create_widgets()

        control_width = "400px"

        self.outlier_range_slider = ipywidgets.FloatRangeSlider(
            value=[0, 1],
            min=0,
            max=1,
            step=0.04,
            description="Range:",
            disabled=False,
            continuous_update=False,
            readout=True,
            readout_format=".2g",
            layout=ipywidgets.Layout(width=control_width),
            style={"handle_color": "lightblue"},
        )

        self.outlier_replace_selector = ipywidgets.Dropdown(
            options=[],
            description="Replacement Method: ",
            layout=ipywidgets.Layout(width=control_width),
            style={"description_width": "initial"},
        )

        for control in (
            self.outlier_range_slider,
            self.outlier_replace_selector,
        ):
            control.observe(self._update_step, names="value")
        self.cut_text = ipywidgets.Label()

        stacked_controls = [
            self.cut_text,
            self.outlier_range_slider,
            self.outlier_replace_selector,
            self.submit_button,
        ]
        self.outlier_removal_controls = ipywidgets.VBox(
            stacked_controls, layout=ipywidgets.Layout(width="100%")
        )
        self.outlier_removal_controls.layout.align_items = "center"

    def reset_controls(self, categorical_type):
        """Reset the slider to the full data range and reload the methods.

        ``categorical_type`` is the key used to look up the permitted
        transformations in ``ALLOWED_TRANSFORMATIONS``.
        """
        super(OutlierReplaceWidgetController, self).reset_controls()

        slider = self.outlier_range_slider
        selector = self.outlier_replace_selector

        # Detach the observers while the controls are repopulated; they
        # are re-attached at the end of this method.
        slider.unobserve(self._update_step, names="value")
        selector.unobserve(self._update_step, names="value")

        self.cut_text.value = "{0} of {1} ({2:.0f}%) selected".format(
            0, len(self.column), 0.0
        )

        data_min = self.numerical_data.min()
        data_max = self.numerical_data.max()

        # NOTE(review): both bounds are assigned under
        # hold_trait_notifications, presumably so a transient min > max
        # combination is not validated - confirm.
        with slider.hold_trait_notifications():
            slider.min = data_min
            slider.max = data_max

        slider.value = [data_min, data_max]

        # Keep only the transformations that belong to this controller.
        method_cls = self.transform_type
        allowed_transforms = {}
        for method in ALLOWED_TRANSFORMATIONS[categorical_type]:
            if type(method) is method_cls:
                allowed_transforms[method.value] = method

        selector.options = allowed_transforms

        if allowed_transforms:
            selector.value = self.transform_type.NONE
        self.submit_button.disabled = True

        slider.observe(self._update_step, names="value")
        selector.observe(self._update_step, names="value")

    def update_step(self):
        """Build an ``OutlierRemovalStep`` from the controls and broadcast it.

        Also refreshes the label with the number of values that fall
        outside the selected range.
        """
        chosen_method = self.outlier_replace_selector.value
        self.submit_button.disabled = (
            True if chosen_method == self.transform_type.NONE else False
        )

        slider_range = self.outlier_range_slider.value
        low_cut = slider_range[0]
        high_cut = slider_range[1]

        values = self.numerical_data
        outside_range = (values < low_cut) | (values > high_cut)
        num_values_cut = values[outside_range].count()

        total_rows = len(self.column)
        percent_values_cut = (
            (100 * num_values_cut / total_rows) if total_rows > 0 else 0
        )

        self.cut_text.value = "{0} of {1} ({2:.0f}%) selected".format(
            num_values_cut, total_rows, percent_values_cut
        )

        self.update_step_callback.send_callbacks(
            OutlierRemovalStep(
                replacement_method=chosen_method,
                colname=self.colname,
                low_cut=low_cut,
                high_cut=high_cut,
            )
        )

    def render_widget(self, step=None):
        """Return the control box, preloaded with ``step`` when given."""
        super(OutlierReplaceWidgetController, self).render_widget(step)
        if step:
            # Range first, then method - same order as the step was built.
            self.outlier_range_slider.value = [step.low_cut, step.high_cut]
            self.outlier_replace_selector.value = step.replacement_method
        return self.outlier_removal_controls
350 |
351 |
class TypeConvertWidgetController(StepWidgetControllerBase):
    """Widget controls to create a mistyped values replacement step.

    Three progress bars show the share of float, int and str values in the
    loaded column. One dropdown selects the type the column should have,
    a second selects what happens to values of any other type.
    """

    def __init__(self):
        super(TypeConvertWidgetController, self).__init__()
        self.transform_type = TypeConvertMethod
        self.tab_title = "Mismatched Types"

    def load_data(self, column, numerical_data):
        """Store the column and count its non-null values per Python type.

        The counts end up in ``self.type_count_dict``, keyed by type.
        """
        super(TypeConvertWidgetController, self).load_data(
            column, numerical_data
        )
        # float, int and str always get an entry because reset_controls
        # reads all three, even when a type does not occur in the column.
        self.type_count_dict = {float: 0, int: 0, str: 0}

        type_counts = self.column.dropna().apply(type).value_counts()

        # Series.items() instead of Series.iteritems(): the latter was
        # removed in pandas 2.0, while items() yields the same
        # (index, value) pairs and is also available in older releases.
        for (data_type, count) in type_counts.items():
            self.type_count_dict[data_type] = count

    def create_widgets(self):
        """Build the three type bars, the two dropdowns and their layout."""
        super(TypeConvertWidgetController, self).create_widgets()

        self.float_percent_bar = ipywidgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description="Floats:",
            orientation="horizontal",
        )
        self.n_float = ipywidgets.Label()
        float_bar_widget = ipywidgets.HBox(
            [self.float_percent_bar, self.n_float]
        )

        self.int_percent_bar = ipywidgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description="Ints:",
            orientation="horizontal",
        )
        self.n_int = ipywidgets.Label()
        int_bar_widget = ipywidgets.HBox([self.int_percent_bar, self.n_int])

        self.str_percent_bar = ipywidgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description="Strings:",
            orientation="horizontal",
        )
        self.n_str = ipywidgets.Label()
        str_bar_widget = ipywidgets.HBox([self.str_percent_bar, self.n_str])

        self.type_selector = ipywidgets.Dropdown(
            options={"int": int, "float": float, "string": str},
            description="This column is of type:",
            layout=ipywidgets.Layout(width="300px"),
            style={"description_width": "initial"},
        )

        self.replace_selector = ipywidgets.Dropdown(
            description="For mismatched values:",
            layout=ipywidgets.Layout(width="300px"),
            style={"description_width": "initial"},
        )

        self.type_selector.observe(self._update_step, names="value")
        self.replace_selector.observe(self._update_step, names="value")

        self.widget = ipywidgets.VBox(
            [
                float_bar_widget,
                int_bar_widget,
                str_bar_widget,
                ipywidgets.HBox(
                    [
                        ipywidgets.VBox(
                            [self.type_selector, self.replace_selector]
                        ),
                        self.submit_button,
                    ]
                ),
            ]
        )

        # Each box holds (progress bar, count label); reset_controls and
        # update_step address them as children[0] and children[1].
        self.bar_widget_dict = {
            float: float_bar_widget,
            int: int_bar_widget,
            str: str_bar_widget,
        }

    def reset_controls(self, categorical_type):
        """Reload the allowed methods, pick a default type, refresh the bars.

        ``categorical_type`` is the key used to look up the permitted
        transformations in ``ALLOWED_TRANSFORMATIONS``; it also decides
        whether ``str`` may be proposed as the column type.
        """
        super(TypeConvertWidgetController, self).reset_controls()
        # Detach the observers while the controls are repopulated; they
        # are re-attached at the end of this method.
        self.type_selector.unobserve(self._update_step, names="value")
        self.replace_selector.unobserve(self._update_step, names="value")

        allowed_transforms = {
            x.value: x
            for x in ALLOWED_TRANSFORMATIONS[categorical_type]
            if isinstance(x, self.transform_type)
        }

        self.replace_selector.options = allowed_transforms

        if len(allowed_transforms) > 0:
            self.replace_selector.value = self.transform_type.NONE
        self.submit_button.disabled = True

        # Types ordered from most to least frequent.
        counts = reversed(
            sorted(self.type_count_dict, key=self.type_count_dict.get)
        )

        current_type = next(counts)

        # Take the most frequent type the selector offers, skipping str
        # for columns classified as continuous.
        while current_type not in self.type_selector.options.values() or (
            current_type is str
            and categorical_type is CategoricalTypes.CONTINUOUS
        ):
            current_type = next(counts)

        self.type_selector.value = current_type

        for dtype, widget_box in self.bar_widget_dict.items():
            widget_box.children[0].value = (
                (100 * self.type_count_dict[dtype] / len(self.column))
                if len(self.column) > 0
                else 0
            )
            # Only the bar of the proposed type is shown as "success".
            widget_box.children[0].bar_style = (
                "success" if current_type is dtype else "warning"
            )
            widget_box.children[1].value = "{0} of {1} ({2:.0f}%)".format(
                self.type_count_dict[dtype],
                len(self.column),
                widget_box.children[0].value,
            )

        self.type_selector.observe(self._update_step, names="value")
        self.replace_selector.observe(self._update_step, names="value")

    def update_step(self):
        """Build a ``TypeConversionStep`` from the controls and broadcast it."""
        for dtype, widget_box in self.bar_widget_dict.items():
            widget_box.children[0].bar_style = (
                "success" if self.type_selector.value is dtype else "warning"
            )
        # Nothing to submit while no replacement method is chosen.
        if self.replace_selector.value == self.transform_type.NONE:
            self.submit_button.disabled = True
        else:
            self.submit_button.disabled = False

        step = TypeConversionStep(
            replacement_method=self.replace_selector.value,
            colname=self.colname,
            data_type=self.type_selector.value,
        )
        self.update_step_callback.send_callbacks(step)

    def render_widget(self, step=None):
        """Return the parent widget, preloaded with ``step`` when given."""
        super(TypeConvertWidgetController, self).render_widget(step)
        if step:
            self.type_selector.value = step.data_type
            self.replace_selector.value = step.replacement_method
        return self.widget
516 |
517 |
518 | class RbmWidgetController(StepWidgetControllerBase):
519 | """Widget controls to create an RBM imputation step"""
520 |
def __init__(self):
    """Set up the shared callbacks and tag this controller as RBM."""
    super(RbmWidgetController, self).__init__()
    # Replaces the placeholder transform_type assigned by the base
    # initialiser with this controller's identifier.
    self.transform_type = "RBM Imputation"
524 |
def load_data(self, dataframe):
    """Keep a reference to ``dataframe`` on the controller.

    Unlike the base implementation, which takes a column and its
    numerical data, this override takes a single argument and only
    sets ``self.dataframe``.
    """
    self.dataframe = dataframe
527 |
def create_widgets(self):
    """Build the column list, the type list, the buttons and the layout.

    The submit button forwards ``self.step`` to the submit listeners.
    The "<>" button flips the type of the entries currently selected in
    the type list.
    """

    def _submit(_button):
        self.submit_step_callback.send_callbacks(self.step)

    def _columns_changed(_change):
        # Mirror the column selection onto the type list.
        self._reload_categorical_list_options(
            self.categorical_list.options, index=self.col_list.index
        )

    def _toggle_selected(_button):
        selected = self.categorical_list.index
        self._change_categorical_type({"old": selected, "new": selected})

    self.submit_button = ipywidgets.Button(description="Add to Pipeline")
    self.submit_button.on_click(_submit)

    title = ipywidgets.Label(
        value="Impute missing data with a Restricted Boltzmann Machine"
    )

    self.col_list = ipywidgets.SelectMultiple(
        options=[], description="On columns "
    )
    # Registered without ``names``, exactly as before.
    self.col_list.observe(_columns_changed)

    self.categorical_list = ipywidgets.SelectMultiple(
        options=[], description=" as "
    )
    self.categorical_list.observe(
        self._change_categorical_type, names="index"
    )

    switch_categorical_type = ipywidgets.Button(description="<>")
    switch_categorical_type.on_click(_toggle_selected)

    selection_row = ipywidgets.HBox(
        [self.col_list, self.categorical_list, switch_categorical_type]
    )
    placeholder_note = ipywidgets.Label(
        value=(
            "(Until you execute or export your pipeline, "
            "RBM imputed values are placeholders only.)"
        )
    )
    submit_section = ipywidgets.VBox([self.submit_button, placeholder_note])

    self.widget = ipywidgets.VBox([title, selection_row, submit_section])
588 |
589 | def _reload_categorical_list_options(self, options, index=()):
590 | self.categorical_list.unobserve(
591 | self._change_categorical_type, names="index"
592 | )
593 | self.categorical_list.options = self._format_list(options)
594 | self.categorical_list.index = index
595 | self.categorical_list.observe(
596 | self._change_categorical_type, names="index"
597 | )
598 | self.update_step()
599 |
600 | def _change_categorical_type(self, index):
601 | old_index = index["old"]
602 |
603 | options = list(self.categorical_list.options)
604 | indices_to_change = index["new"]
605 |
606 | for index in indices_to_change:
607 | if options[index].strip() == CategoricalTypes.CONTINUOUS.value:
608 | options[index] = CategoricalTypes.CATEGORICAL.value
609 | elif options[index].strip() == CategoricalTypes.CATEGORICAL.value:
610 | options[index] = CategoricalTypes.CONTINUOUS.value
611 |
612 | self._reload_categorical_list_options(options, index=old_index)
613 |
614 | def _format_list(self, input_list):
615 | # workaround - ensures unique values go into Select widget even though
616 | # we just want multiple instances of "categorical" and "continuous"
617 | output_list = []
618 | for i, item in enumerate(input_list):
619 | output_list.append(item.strip() + " " * i)
620 | return output_list
621 |
622 | def reset_controls(self):
623 | super(RbmWidgetController, self).reset_controls()
624 | self.col_list.options = self.dataframe.columns.tolist()
625 | categorical_list = []
626 |
627 | for col in self.col_list.options:
628 | categorical_list.append(is_categorical(self.dataframe[col]).value)
629 |
630 | self.col_list.value = ()
631 | self.col_list.rows = self.categorical_list.rows = len(categorical_list)
632 |
633 | self._reload_categorical_list_options(categorical_list)
634 |
635 | def update_step(self):
636 |
637 | numerical_columns = []
638 | categorical_columns = []
639 |
640 | for i in self.col_list.index:
641 | categorical_type = self.categorical_list.options[i].strip()
642 | if categorical_type == CategoricalTypes.CONTINUOUS.value:
643 | numerical_columns.append(self.col_list.options[i])
644 | elif categorical_type == CategoricalTypes.CATEGORICAL.value:
645 | categorical_columns.append(self.col_list.options[i])
646 |
647 | self.step = RbmStep(
648 | numerical_columns=numerical_columns,
649 | categorical_columns=categorical_columns,
650 | )
651 |
652 | def render_widget(self, step=None):
653 | super(RbmWidgetController, self).render_widget(step)
654 |
655 | widget = self.widget
656 |
657 | if isinstance(step, RbmStep):
658 | self.col_list.value = (
659 | step.numerical_columns + step.categorical_columns
660 | )
661 |
662 | categorical_list = []
663 |
664 | for col in self.col_list.options:
665 | if col in step.numerical_columns:
666 | categorical_list.append(CategoricalTypes.CONTINUOUS.value)
667 | elif col in step.categorical_columns:
668 | categorical_list.append(CategoricalTypes.CATEGORICAL.value)
669 | else:
670 | categorical_list.append(
671 | is_categorical(self.dataframe[col]).value
672 | )
673 |
674 | self._reload_categorical_list_options(
675 | categorical_list, index=self.col_list.index
676 | )
677 | self.step = step
678 | elif step:
679 | widget = render_inactive_widget(step)
680 |
681 | return widget
682 |
683 |
def _noninteractive(func):
    """Ensure plots are created in non-interactive mode with seaborn style.

    The decorated function runs with Matplotlib's interactive mode switched
    off and inside a seaborn style context.  The previous interactive setting
    is restored afterwards, including when the function raises.
    """

    @wraps(func)
    def noninteractive_wrapper(*args, **kwargs):
        mpl_interactivity = matplotlib.is_interactive()
        matplotlib.interactive(False)

        try:
            # Matplotlib 3.6 renamed its bundled seaborn style sheets to
            # "seaborn-v0_8*"; use whichever name this installation provides.
            if "seaborn" in pyplot.style.available:
                style_name = "seaborn"
            else:
                style_name = "seaborn-v0_8"

            with pyplot.style.context(style_name):
                return func(*args, **kwargs)
        finally:
            # Always hand back the caller's interactive setting.
            matplotlib.interactive(mpl_interactivity)

    return noninteractive_wrapper
699 |
700 |
class PlotWidgetController(object):
    """Widget controls to display and update plots for dataframe columns.

    One matplotlib figure holds three axes: ``ax_main`` for the original
    data, ``ax_cut`` (a twin of ``ax_main``) for the values removed by an
    outlier cut, and ``ax_mod`` for the data after the current step.  The
    figure is shown inside an ``ipywidgets.Output`` widget.
    """

    # Grid layouts used to position the axes: a single plot, or two stacked
    # plots of equal height.
    gs_one_plot = matplotlib.gridspec.GridSpec(1, 1)
    gs_two_plots = matplotlib.gridspec.GridSpec(
        2, 1, height_ratios=[1, 1], hspace=0.1
    )

    CUT_LINE_COLOUR = "red"
    CUT_BINS_COLOUR = "orange"

    def __init__(self):
        self.output_widget = ipywidgets.Output(
            layout=ipywidgets.Layout(min_width="300px", height="160px")
        )
        self.create_figure()

    def load_data(self, column, numerical_data):
        """Store the column, its name and its numerical subset for plotting."""
        self.column = column
        self.colname = column.name
        self.numerical_data = numerical_data

    @_noninteractive
    def create_figure(self):
        """Create a fresh figure with the main, modified and cut axes."""
        self.fig = pyplot.figure()

        self.ax_main = self.fig.add_subplot(self.gs_two_plots[0])

        self.ax_mod = self.fig.add_subplot(self.gs_two_plots[1])

        # The modified-data axes start without tick labels; they are switched
        # on in draw_modified_plot when there is something to show.
        pyplot.setp(self.ax_mod.get_xticklabels(), visible=False)
        pyplot.setp(self.ax_mod.get_yticklabels(), visible=False)

        # The cut axes share the x axis of the main axes.
        self.ax_cut = self.ax_main.twinx()

        # enforces desired drawing order
        self.ax_mod.set_zorder(1)
        self.ax_main.set_zorder(2)
        self.ax_cut.set_zorder(3)

        self.ax_cut.get_xaxis().set_visible(False)
        self.ax_cut.get_yaxis().set_visible(False)

        self.ax_main.tick_params(axis="y", which="major", labelsize=12)
        self.ax_mod.tick_params(axis="y", which="major", labelsize=12)

    def display_figure(self):
        """Resize the figure and render it into the output widget."""
        self.output_widget.clear_output(wait=True)

        # Magic numbers came from testing using categorical columns
        # with large numbers of categories
        fig_width = 0.1 * len(self.ax_main.get_xticks()) + 3.5
        fig_height = 2.4
        self.fig.set_size_inches(fig_width, fig_height)

        # Size the output widget in pixels from the figure size in inches.
        self.output_widget.layout.width = "{}px".format(fig_width * 70)
        self.output_widget.layout.height = "{}px".format(fig_height * 80)

        if self.categorical_type is CategoricalTypes.CONTINUOUS:
            self.ax_main.xaxis.set_major_locator(
                matplotlib.ticker.AutoLocator()
            )

        with self.output_widget:
            display(self.fig)

    def reset_plots(self, categorical_type):
        """Record the categorical type, then redraw and display all plots."""
        self.categorical_type = categorical_type
        self.draw_main_plot()
        self.update_plots()

    @_noninteractive
    def draw_main_plot(self):
        """Draw the original data on ``ax_main`` and recreate the cut lines.

        Categorical data is drawn as a bar chart of value counts; any other
        type is drawn as a histogram whose ``bins`` and ``bin_width`` are
        stored for the later modified and cut plots.
        """
        self.ax_main.clear()
        self.ax_mod.clear()

        if self.categorical_type is CategoricalTypes.CATEGORICAL:
            col = self.column.dropna().value_counts()
            col.index = col.index.format()
            # Nothing is drawn for a column without non-null values.
            if len(col) > 0:
                col.sort_index().plot(kind="bar", ax=self.ax_main, alpha=0.4)
        else:
            hist_orig, self.bins = np.histogram(self.numerical_data)
            self.bin_width = self.bins[1] - self.bins[0]

            # Pad the x range by the axes' current x margin.
            margin = (self.bins[-1] - self.bins[0]) * self.ax_main.margins()[0]

            self.ax_main.set_xlim(
                (self.bins[0] - margin, self.bins[-1] + margin)
            )

            self.ax_main.bar(
                self.bins[:-1],
                hist_orig,
                width=self.bin_width,
                align="edge",
                alpha=0.4,
            )

        # The cut lines are created with no x position; update_plots and
        # hide_cut_plot set or clear it later.
        self.ymax = self.ax_main.get_ylim()[1]
        self.low_cut_line, = self.ax_main.plot(
            [None, None], [self.ymax, 0], color=self.CUT_LINE_COLOUR
        )

        self.high_cut_line, = self.ax_main.plot(
            [None, None], [self.ymax, 0], color=self.CUT_LINE_COLOUR
        )

    def update_plots(self, step=None, col_mod=None):
        """Refresh the cut and modified plots for ``step`` and redisplay.

        The cut lines and cut histogram are shown only for an
        ``OutlierRemovalStep``.  When ``col_mod`` is None the original column
        is used as the modified data.
        """
        if isinstance(step, OutlierRemovalStep):
            self.low_cut_line.set_xdata([[step.low_cut, step.low_cut]])
            self.high_cut_line.set_xdata([[step.high_cut, step.high_cut]])
            self.draw_cut_plot(step.low_cut, step.high_cut)
        else:
            self.hide_cut_plot()

        self.draw_modified_plot(
            col_mod if col_mod is not None else self.column
        )

        self.display_figure()

    @_noninteractive
    def draw_modified_plot(self, col_mod):
        """Draw ``col_mod`` on ``ax_mod`` when it differs from the original.

        If the modified data differs, the two-plot layout is used and the
        lower axes show the modified counts, with any increase over the
        original drawn as a second bar stacked on top.  Otherwise the main
        axes take the single-plot layout and the lower tick labels are hidden.
        """
        self.ax_mod.clear()

        # Numerical subset of the modified column: int/float entries only,
        # with nulls dropped.
        data_mod = col_mod.loc[
            col_mod.apply(lambda x: isinstance(x, (int, float)))
        ]
        data_mod = data_mod.dropna()
        # From here on col_mod holds the value counts of the modified column.
        col_mod = col_mod.dropna().value_counts()

        if self.categorical_type is CategoricalTypes.CATEGORICAL and not self.column.dropna().value_counts().equals(
            col_mod
        ):
            self.ax_main.set_position(
                self.gs_two_plots[0].get_position(self.fig)
            )
            self.ax_cut.set_position(
                self.gs_two_plots[0].get_position(self.fig)
            )

            pyplot.setp(self.ax_main.get_xticklabels(), visible=False)
            pyplot.setp(self.ax_mod.get_xticklabels(), visible=True)
            pyplot.setp(self.ax_mod.get_yticklabels(), visible=True)

            col_orig = self.column.dropna().value_counts()

            col_mod.index = col_mod.index.format()
            col_orig.index = col_orig.index.format()

            # Counts gained relative to the original, per category.
            col_delta = col_mod.sub(col_orig, fill_value=0)
            col_delta = col_delta[col_delta > 0]

            # Split the modified counts into the unchanged part and the gain.
            col_mod = col_mod.sub(col_delta, fill_value=0)

            col_mod = pd.concat([col_mod, col_delta], axis=1)

            col_mod.sort_index().plot(
                kind="bar",
                ax=self.ax_mod,
                alpha=0.4,
                stacked=True,
                legend=False,
            )

        elif (
            self.categorical_type is not CategoricalTypes.CATEGORICAL
            and not data_mod.equals(self.numerical_data)
        ):
            self.ax_main.set_position(
                self.gs_two_plots[0].get_position(self.fig)
            )
            self.ax_cut.set_position(
                self.gs_two_plots[0].get_position(self.fig)
            )

            pyplot.setp(self.ax_main.get_xticklabels(), visible=False)
            pyplot.setp(self.ax_mod.get_xticklabels(), visible=True)
            pyplot.setp(self.ax_mod.get_yticklabels(), visible=True)

            # Both histograms reuse the bins computed in draw_main_plot.
            hist_mod, _ = np.histogram(data_mod, self.bins)
            hist_orig, _ = np.histogram(self.numerical_data, self.bins)

            # Per-bin gain over the original; losses are clamped to zero.
            hist_delta = hist_mod - hist_orig
            hist_delta[hist_delta < 0] = 0

            self.ax_mod.bar(
                self.bins[:-1],
                hist_mod - hist_delta,
                width=self.bin_width,
                align="edge",
                alpha=0.4,
            )
            # The gain is stacked on top of the unchanged part.
            self.ax_mod.bar(
                self.bins[:-1],
                hist_delta,
                width=self.bin_width,
                color="g",
                bottom=hist_mod - hist_delta,
                align="edge",
                alpha=0.4,
            )
        else:
            # Nothing changed: give the main axes the single-plot layout.
            self.ax_main.set_position(
                self.gs_one_plot[0].get_position(self.fig)
            )
            self.ax_cut.set_position(
                self.gs_one_plot[0].get_position(self.fig)
            )
            pyplot.setp(self.ax_main.get_xticklabels(), visible=True)
            pyplot.setp(self.ax_mod.get_xticklabels(), visible=False)
            pyplot.setp(self.ax_mod.get_yticklabels(), visible=False)

    @_noninteractive
    def draw_cut_plot(self, low_cut, high_cut):
        """Draw a histogram of the values outside ``[low_cut, high_cut]``."""
        self.ax_cut.set_visible(True)

        # Clearing the axes is followed by re-applying the previous x ticks.
        ticks = self.ax_cut.get_xticks()
        self.ax_cut.clear()
        self.ax_cut.set_xticks(ticks)

        cut_data = self.numerical_data.loc[
            self.numerical_data.apply(lambda x: x < low_cut or x > high_cut)
        ]

        hist_cut, _ = np.histogram(cut_data, self.bins)

        self.ax_cut.bar(
            self.bins[:-1],
            hist_cut,
            width=self.bin_width,
            align="edge",
            color=self.CUT_BINS_COLOUR,
            alpha=0.4,
        )

        # Use the same y scale as the main histogram.
        self.ax_cut.set_ylim(self.ax_main.get_ylim())

    def hide_cut_plot(self):
        """Clear the cut lines' x positions and hide the cut axes."""
        self.low_cut_line.set_xdata([[None, None]])
        self.high_cut_line.set_xdata([[None, None]])
        self.ax_cut.set_visible(False)

    def render_widget(self):
        """Replace the current figure with a new one and return the output widget."""
        if self.fig:
            pyplot.close(self.fig)
            self.create_figure()

        return self.output_widget
951 |
952 |
class ColumnWidgetController(object):
    """Container widget for column-specific step creation control widgets.

    The widget is a row made of the column plot (with a categorical-type
    dropdown), a tab per allowed step-creation control, and an HTML preview
    of the column before and after the step currently being configured.
    """

    def __init__(self):

        self.widget = None
        self.step_being_modified = None
        self.new_step_callback = CallbackManager()
        self.modify_step_callback = CallbackManager()
        # Callback manager that receives the step when a control submits.
        self.active_callback = self.new_step_callback
        self.categorical_type = None

        self.plot_widget_controller = PlotWidgetController()

        def update_active_step(new_step):
            # Preview the effect of the step on this column only.
            self.active_step = new_step
            col_mod = new_step.execute(self.dataframe)[self.colname]
            self.redraw_preview(col_mod)
            self.plot_widget_controller.update_plots(new_step, col_mod)

        self.step_creation_controls = [
            NullReplaceWidgetController(),
            OutlierReplaceWidgetController(),
            TypeConvertWidgetController(),
        ]

        # Controllers keyed by their ``transform_type``.
        self.controls_dict = {}

        for controller in self.step_creation_controls:
            self.controls_dict[controller.transform_type] = controller
            controller.update_step_callback.register_callback(
                update_active_step
            )

        self.create_widgets()

    def create_widgets(self):
        """Create the plot, tab and preview widgets and lay them out."""

        self.categorical_selector = ipywidgets.Dropdown(
            options={
                cat_type.value: cat_type for cat_type in CategoricalTypes
            },
            layout=ipywidgets.Layout(width="80%"),
        )

        self.plot_widget_container = ipywidgets.VBox(
            [
                self.plot_widget_controller.render_widget(),
                self.categorical_selector,
            ],
            layout=ipywidgets.Layout(
                width="350px",
                height="220px",
                overflow_x="scroll",
                overflow_y="auto",
            ),
        )

        self.plot_widget_container.layout.align_items = "flex-start"

        self.categorical_selector.observe(
            self.categorical_selector_onchange, names="value"
        )

        self.preview_widget = ipywidgets.HTML()
        self.preview_widget_container = ipywidgets.VBox(
            [ipywidgets.Label(value="Current Step"), self.preview_widget],
            layout=ipywidgets.Layout(max_height="200px"),
        )

        self.tab_widget = ipywidgets.Tab(
            layout=ipywidgets.Layout(
                overflow_x="scroll", width="600px", height="90%"
            )
        )

        self.tab_widget.observe(
            self.tab_widget_onchange, names="selected_index"
        )

        for controller in self.step_creation_controls:
            controller.create_widgets()

            # ``active_callback`` and ``active_step`` are looked up at submit
            # time, so the lambda always forwards the current values.
            controller.submit_step_callback.register_callback(
                lambda: self.active_callback.send_callbacks(self.active_step)
            )

        self.widget = ipywidgets.HBox(
            [
                self.plot_widget_container,
                self.tab_widget,
                self.preview_widget_container,
            ],
            layout=ipywidgets.Layout(
                display="flex",
                align_items="stretch",
                width="100%",
                height="220px",
            ),
        )

    def tab_widget_onchange(self, _):
        """Ask the controller shown in the selected tab to update its step."""
        index = self.tab_widget.selected_index

        for controller in self.controls_dict.values():
            if controller.tab_title == self.tab_widget.get_title(index):
                controller.update_step()

    def categorical_selector_onchange(self, _):
        """Adopt the newly selected categorical type and reset the controls."""
        self.categorical_type = self.categorical_selector.value

        self.active_step = NullRemovalStep(
            replacement_method=NullRemovalMethod.NONE, colname=self.colname
        )

        self.reset_controls()

    def load_data(self, series, dataframe, step=None):
        """Load a column and its dataframe into every child controller.

        ``step`` is remembered as the step being modified; it is applied to
        the controls by ``render_widget``.
        """
        self.dataframe = dataframe
        self.column = series
        self.colname = series.name

        # Keep a type that is already set; otherwise detect it from the data.
        if not self.categorical_type:
            self.categorical_type = is_categorical(series)

        # Numerical subset: int/float entries only, with nulls dropped.
        self.numerical_data = series.loc[
            series.apply(lambda x: isinstance(x, (int, float)))
        ]

        self.numerical_data = self.numerical_data.dropna()

        for controller in self.step_creation_controls:
            controller.load_data(
                column=self.column, numerical_data=self.numerical_data
            )
        self.plot_widget_controller.load_data(
            column=self.column, numerical_data=self.numerical_data
        )

        self.redraw_preview()
        self.step_being_modified = step

    def redraw_preview(self, col_modified=None):
        """Render a before/after table of the column into the preview widget.

        When ``col_modified`` is given it is reindexed to the original
        column's index so that missing rows still appear in the table.
        """

        if col_modified is not None:
            # NOTE(review): the fill value is a single newline as far as this
            # listing shows; confirm it was not markup lost in extraction.
            col_mod = col_modified.reindex(
                index=self.column.index, fill_value="\n"
            )
        else:
            col_mod = self.column

        styler = pd.concat(
            [self.column.rename("before"), col_mod.rename("after")], axis=1
        ).style.set_table_attributes('class="table"')

        # Styler.render() was deprecated in pandas 1.4 and removed in 2.0 in
        # favour of Styler.to_html(); use whichever this pandas provides.
        to_html = getattr(styler, "to_html", None)
        table_html = to_html() if to_html is not None else styler.render()

        self.preview_widget.value = "This Step" + table_html

    def render_widget(self):
        """Reset the controls, apply any step being modified, return the widget."""
        self.reset_controls()
        self.redraw_preview()

        if self.step_being_modified:
            self.set_controls_for_step(self.step_being_modified)

        return self.widget

    def reset_controls(self):
        """Rebuild the tabs for the current categorical type and reset state.

        Both observers are detached while the widgets are changed so that the
        programmatic updates do not trigger the change handlers.
        """
        self.tab_widget.unobserve(
            self.tab_widget_onchange, names="selected_index"
        )
        self.categorical_selector.unobserve(
            self.categorical_selector_onchange, names="value"
        )

        self.active_callback = self.new_step_callback

        tab_children = []
        tab_titles = []

        # Allowed entries are reduced to ``controls_dict`` keys: strings are
        # used as they are, anything else by its type.
        allowed_transforms = set(
            transform if isinstance(transform, str) else type(transform)
            for transform in ALLOWED_TRANSFORMATIONS[self.categorical_type]
        )

        # Sorted by string form to get a stable tab order.
        for transform_type in sorted(allowed_transforms, key=str):
            tab_children.append(
                self.controls_dict[transform_type].render_widget()
            )
            tab_titles.append(self.controls_dict[transform_type].tab_title)

        self.tab_widget.children = tuple(tab_children)

        for position, title in enumerate(tab_titles):
            self.tab_widget.set_title(position, title)

        self.tab_widget.selected_index = 0

        self.active_step = NullRemovalStep(
            replacement_method=NullRemovalMethod.NONE, colname=self.colname
        )

        for controller in self.step_creation_controls:
            controller.reset_controls(categorical_type=self.categorical_type)

        self.categorical_selector.disabled = False
        self.categorical_selector.value = self.categorical_type

        self.plot_widget_controller.reset_plots(self.categorical_type)

        self.tab_widget.observe(
            self.tab_widget_onchange, names="selected_index"
        )
        self.categorical_selector.observe(
            self.categorical_selector_onchange, names="value"
        )

    def set_controls_for_step(self, step):
        """Show the controls for modifying ``step``, or an inactive view.

        If ``step`` belongs to this column, the categorical selector is
        advanced until the step's replacement method is allowed, and the
        matching controller becomes the only tab.  Otherwise the tab shows an
        inactive widget for the step.  The categorical selector is disabled
        in both cases.
        """

        if hasattr(step, "colname") and step.colname == self.colname:

            selector = self.categorical_selector

            # Try each categorical type at most once.  The original
            # unbounded loop never terminated when no type allowed the
            # step's replacement method.
            for _ in range(len(selector.options)):
                if (
                    step.replacement_method
                    in ALLOWED_TRANSFORMATIONS[self.categorical_type]
                ):
                    break
                # The selector's observer updates ``self.categorical_type``.
                selector.index = (selector.index + 1) % len(selector.options)

            self.tab_widget.children = [
                self.controls_dict[
                    type(step.replacement_method)
                ].render_widget(step)
            ]

            self.active_callback = self.modify_step_callback
            self.tab_widget.set_title(0, "Modifying Current Step")
            self.tab_widget.selected_index = 0
        else:
            self.tab_widget.children = [render_inactive_widget(step)]

            self.tab_widget.set_title(0, str(self.colname))
        self.categorical_selector.disabled = True
1198 |
1199 |
class DataFrameWidgetController(object):
    """Container widget for dataframe-wide controls and the pipeline.

    Three collapsed accordions are stacked vertically (dataframe preview, RBM
    controls, pipeline), followed by a hint label.  When the dataframe was
    sampled, a notice with a "Resample" button is placed on top.
    """

    def __init__(self, pipeline_widget, sampled_rows):
        self.resample_callback = CallbackManager()
        self.new_step_callback = CallbackManager()
        self.modify_step_callback = CallbackManager()

        # Receives submitted RBM steps; switched in render_widget.
        self.active_callback = self.new_step_callback

        rbm_controller = RbmWidgetController()
        rbm_controller.create_widgets()
        self.rbm_widget_controller = rbm_controller

        # ``active_callback`` is resolved at submit time, not now.
        rbm_controller.submit_step_callback.register_callback(
            lambda *args, **kwargs: self.active_callback.send_callbacks(
                *args, **kwargs
            )
        )

        def collapsed_accordion(child, title):
            # A single-pane accordion that starts closed.
            accordion = ipywidgets.Accordion(children=[child])
            accordion.set_title(0, title)
            accordion.selected_index = None
            return accordion

        self.pipeline_widget_container = collapsed_accordion(
            pipeline_widget, "Pipeline"
        )

        preview_layout = ipywidgets.Layout(
            overflow_y="scroll",
            overflow_x="scroll",
            width="100%",
            height="190px",
        )
        self.preview_widget = ipywidgets.Output(layout=preview_layout)

        self.rbm_widget_container = collapsed_accordion(
            rbm_controller.render_widget(), "Restricted Boltzmann Machine"
        )
        self.preview_widget_container = collapsed_accordion(
            self.preview_widget, "DataFrame Preview"
        )

        sections = [
            self.preview_widget_container,
            self.rbm_widget_container,
            self.pipeline_widget_container,
            ipywidgets.Label(
                "Click on a column name below to start adding steps."
            ),
        ]

        if sampled_rows:
            notice = ipywidgets.Label(
                value="Viewing {} sampled rows from your dataframe.".format(
                    sampled_rows
                )
            )
            resample_button = ipywidgets.Button(description="Resample")
            resample_button.on_click(
                lambda _: self.resample_callback.send_callbacks()
            )
            sections.insert(0, ipywidgets.HBox([notice, resample_button]))

        self.widget = ipywidgets.VBox(sections)

    def _redraw_preview(self, dataframe):
        """Show ``dataframe`` with a caption in the preview output widget."""
        captioned = dataframe.style.set_caption(
            "Preview up to the current pipeline step"
        )
        self.preview_widget.clear_output(wait=True)
        with self.preview_widget:
            display(captioned)

    def render_widget(self, dataframe, step=None):
        """Refresh the preview and RBM controls for ``dataframe``.

        Submitted RBM steps are routed to ``modify_step_callback`` when
        ``step`` is a step without a ``colname`` attribute, and to
        ``new_step_callback`` otherwise.
        """
        self.dataframe = dataframe
        self._redraw_preview(dataframe)

        rbm_controller = self.rbm_widget_controller
        rbm_controller.load_data(dataframe)
        rbm_controller.reset_controls()
        self.rbm_widget_container.children = (
            rbm_controller.render_widget(step),
        )

        # if we are currently modifying a non column-specific step
        modifying_global_step = step and not hasattr(step, "colname")
        self.active_callback = (
            self.modify_step_callback
            if modifying_global_step
            else self.new_step_callback
        )

        return self.widget

    def display_pipeline(self):
        """Open the pipeline accordion."""
        self.pipeline_widget_container.selected_index = 0
1299 |
1300 |
class PipelineWidgetController(object):
    """Container widget for a view of the processing pipeline.

    The pipeline is shown as a horizontally scrolling row of step widgets
    followed by the add/execute/export buttons, with an info label below.
    """

    CAROUSEL_LAYOUT = ipywidgets.Layout(
        overflow_x="scroll",
        width="800px",
        height="",
        flex_direction="row",
        display="flex",
    )

    def __init__(self, pipeline, name):
        self.pipeline = pipeline
        self.name = name

        self.add_mode_callback = CallbackManager()
        self.edit_mode_callback = CallbackManager()
        self.delete_step_callback = CallbackManager()
        self.execute_callback = CallbackManager()
        self.export_callback = CallbackManager()

        self.pipeline_view = ipywidgets.Box(
            children=[], layout=self.CAROUSEL_LAYOUT
        )

        self.info_label = ipywidgets.Label(value="")
        self.info_label.layout.height = "30px"

        # The "+" button only appears while a step is being modified.
        self.add_button = ipywidgets.Button(description="+")
        self.add_button.layout.visibility = "hidden"
        self.add_button.on_click(lambda _: self._enter_add_mode())

        self.execute_button = ipywidgets.Button(description="Execute Pipeline")
        self.execute_button.on_click(lambda _: self._execute_pipeline())

        self.export_button = ipywidgets.Button(description="Export to Code")
        self.export_button.on_click(lambda _: self._export_pipeline())

    def render_widget(self, active_step=None):
        """Rebuild the step widgets from the pipeline and return the widget.

        ``active_step``, when truthy, is put into edit mode after rendering.
        """
        self.pipeline_step_widgets = []
        self.display_message("Add a step to get started")

        for step in self.pipeline.steps:
            step_controller = PipelineStepWidgetController(step)

            step_controller.modify_step_callback.register_callback(
                self._enter_edit_mode
            )
            step_controller.stop_modifying_callback.register_callback(
                self._enter_add_mode
            )
            step_controller.delete_step_callback.register_callback(
                self._delete_step
            )

            self.pipeline_step_widgets.append(step_controller)

        children = [
            step_controller.widget
            for step_controller in self.pipeline_step_widgets
        ]

        if children:
            # The action buttons follow the last step of a non-empty pipeline.
            action_buttons = ipywidgets.VBox(
                [self.add_button, self.execute_button, self.export_button],
                layout=ipywidgets.Layout(min_width="150px"),
            )
            children.append(action_buttons)
            self.display_message("")

        self.pipeline_view.children = tuple(children)
        self.widget = ipywidgets.VBox([self.pipeline_view, self.info_label])
        self._enter_edit_mode(active_step)

        return self.widget

    def _enter_edit_mode(self, step):
        """Highlight ``step`` as being modified; hide "+" when there is none."""
        if not step:
            self.add_button.layout.visibility = "hidden"
            return

        # this means it's visible!
        self.add_button.layout.visibility = None

        for step_controller in self.pipeline_step_widgets:
            if step_controller.step is step:
                step_controller._set_active_style()
            else:
                step_controller._set_inactive_style()

        self.edit_mode_callback.send_callbacks(step)

        if hasattr(step, "colname"):
            column_suffix = " on column " + str(step.colname)
        else:
            column_suffix = ""
        self.display_message("Modifying step" + column_suffix)

    def _export_pipeline(self):
        """Announce the export, then notify the export listeners."""
        self.display_message("Exported to code cell.")
        self.export_callback.send_callbacks()

    def _execute_pipeline(self):
        """Notify the execute listeners, reporting progress in the label."""
        self.display_message("Executing pipeline... ")

        self.execute_callback.send_callbacks()

        done_message = (
            'Cleaned DataFrame output to "'
            + self.name
            + '_cleaned". '
            + "Reload DataCleaner to refresh list."
        )
        self.display_message(done_message)

    def _delete_step(self, step):
        """Hide the "+" button and forward the delete request for ``step``."""
        self.add_button.layout.visibility = "hidden"
        self.delete_step_callback.send_callbacks(step)

    def _enter_add_mode(self):
        """Leave edit mode: reset every step widget and notify listeners."""
        self.add_button.layout.visibility = "hidden"

        for step_controller in self.pipeline_step_widgets:
            step_controller._set_inactive_style()

        self.add_mode_callback.send_callbacks()
        self.display_message("")

    def display_message(self, message):
        """Show ``message`` in the info label."""
        self.info_label.value = message
1424 |
1425 |
class PipelineStepWidgetController(object):
    """Container widget for a single step of the processing pipeline.

    The widget stacks a "Modify" toggle button, a list showing the step's
    description and a "Delete Step" button that is hidden while the step is
    not being modified.
    """

    def __init__(self, step):
        self.step = step
        self.modify_step_callback = CallbackManager()
        self.stop_modifying_callback = CallbackManager()
        self.delete_step_callback = CallbackManager()

        # One list entry per ", "-separated part (and per line) of the text.
        description_lines = step.description.replace(", ", "\n").splitlines()
        select_box = ipywidgets.Select(
            options=description_lines,
            rows=3,
            disabled=False,
            layout=ipywidgets.Layout(width="200px"),
        )

        self.modify_button = ipywidgets.ToggleButton(
            layout=ipywidgets.Layout(height="25px", width="98%")
        )

        hidden_button_layout = ipywidgets.Layout(
            height="25px", width="98%", visibility="hidden"
        )
        self.delete_button = ipywidgets.Button(
            description="Delete Step",
            layout=hidden_button_layout,
            button_style="warning",
        )

        self.widget = ipywidgets.VBox(
            [self.modify_button, select_box, self.delete_button],
            layout=ipywidgets.Layout(min_width="200px"),
        )

        self.modify_button.observe(self._modify_button_on_click, names="value")

        def request_delete(_):
            self.delete_step_callback.send_callbacks(self.step)

        self.delete_button.on_click(request_delete)

        self._set_inactive_style()

    def _modify_button_on_click(self, value):
        """Start modifying when the toggle becomes True, otherwise stop."""
        if value["new"] is not True:
            self.stop_modifying_callback.send_callbacks()
            return

        self.modify_step_callback.send_callbacks(self.step)

    def _set_active_style(self):
        """Style the step as being modified and reveal the delete button."""
        toggle = self.modify_button
        toggle.button_style = "primary"
        toggle.description = "Modifying"
        toggle.value = True
        self.delete_button.layout.visibility = None  # This means visible.

    def _set_inactive_style(self):
        """Style the step as idle and hide the delete button.

        The toggle's observer is detached while its value is reset so that
        the reset does not fire the stop-modifying callback.
        """
        toggle = self.modify_button
        handler = self._modify_button_on_click

        toggle.button_style = ""
        toggle.description = "Modify"

        toggle.unobserve(handler, names="value")
        toggle.value = False
        toggle.observe(handler, names="value")

        self.delete_button.layout.visibility = "hidden"
1490 |
--------------------------------------------------------------------------------