├── .gitignore ├── MANIFEST.in ├── dataclean ├── __init__.py ├── static │ ├── main.css │ ├── iosbadge.js │ ├── main.js │ └── jquery.tablesorter.min.js ├── codegen.py ├── pipeline.py ├── manager.py ├── cleaning.py └── widget.py ├── setup.py ├── README.rst └── LICENSE.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | __pycache__/ 3 | dist/ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dataclean/static *.js *.css 2 | -------------------------------------------------------------------------------- /dataclean/__init__.py: -------------------------------------------------------------------------------- 1 | def _jupyter_nbextension_paths(): 2 | return [ 3 | { 4 | "section": "notebook", 5 | "src": "static", 6 | "dest": "sherlockml-dataclean", 7 | "require": "sherlockml-dataclean/main", 8 | } 9 | ] 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | STATIC_JS_FILES = [ 5 | "dataclean/static/main.js", 6 | "dataclean/static/jquery.tablesorter.min.js", 7 | "dataclean/static/iosbadge.js", 8 | "dataclean/static/main.css", 9 | ] 10 | 11 | 12 | def read_long_description(): 13 | with open(os.path.join(os.path.dirname(__file__), "README.rst")) as fp: 14 | return fp.read() 15 | 16 | 17 | setup( 18 | name="ipydataclean", 19 | version="0.2.2", 20 | url="https://github.com/facultyai/ipydataclean", 21 | author="Faculty", 22 | author_email="opensource@faculty.ai", 23 | description="Interactive cleaning for pandas DataFrames", 24 | license="Apache 2.0", 25 | long_description=read_long_description(), 26 | data_files=[("share/jupyter/nbextensions/ipydataclean", STATIC_JS_FILES)], 27 | packages=["dataclean"], 28 | install_requires=[ 29 | "future", 30 | "ipython", 31 | "ipywidgets>=7.0.0", 32 | "matplotlib", 33 | "numpy", 34 | "pandas", 35 | "scikit-learn", 36 | "scipy", 37 | "boltzmannclean", 38 | 'funcsigs;python_version<"3.0"', 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ipydataclean 2 | ============ 3 | 4 | Jupyter notebook extension and python library for interactive cleaning of 5 | pandas DataFrames with a selection of techniques, from simple replacements of 6 | missing values to imputation with a Restricted Boltzmann Machine. 7 | 8 | Installation 9 | ------------ 10 | 11 | .. code-block:: bash 12 | 13 | pip install ipydataclean 14 | jupyter nbextension enable dataclean --py --sys-prefix 15 | 16 | Usage 17 | ----- 18 | 19 | Use your Jupyter notebook as normal. When a pandas DataFrame is present in your 20 | python kernel you should see a new notification on the Data Cleaner icon in 21 | your toolbar. DataFrames with names beginning with an underscore will be 22 | ignored. 23 | 24 | .. figure:: https://user-images.githubusercontent.com/29061040/37827637-30cf156a-2e90-11e8-9b84-81a41cf94898.png 25 | :width: 25 % 26 | :alt: Data Cleaner toolbar icon. 27 | 28 | Data Cleaner toolbar icon. 29 | 30 | Clicking on the icon will open a floating window containing a summary of the 31 | DataFrames in your kernel. Clicking on the name of one of these DataFrames will 32 | show some of the Data Cleaner controls and some summary statistics on the 33 | DataFrame columns. 34 | 35 | .. figure:: https://user-images.githubusercontent.com/29061040/37827939-520b095e-2e91-11e8-8a85-a4d8cb0dfed1.png 36 | :width: 25 % 37 | :alt: Data Cleaner window. 38 | 39 | Data Cleaner window. 40 | 41 | Clicking on the name of one of these columns will show data cleaning tools 42 | specific to that column, with a histogram or bar chart showing the distribution 43 | of these values. As you create a step the effect that this will have on the 44 | data distribution is shown as a preview. 45 | 46 | .. figure:: https://user-images.githubusercontent.com/29061040/37828167-169edb9c-2e92-11e8-88cd-f918d2c498df.png 47 | :width: 50 % 48 | :alt: Creating a data cleaning step on a column. 49 | 50 | Creating a data cleaning step on a column. 51 | 52 | You can also choose to fill in missing and mistyped values in your DataFrame 53 | with a Restricted Boltzmann Machine. This uses the boltzmannclean package. 54 | 55 | .. figure:: https://user-images.githubusercontent.com/29061040/37828870-d096628e-2e94-11e8-9291-511fab3bdf7a.png 56 | :width: 40 % 57 | :alt: Creating a Restricted Boltzmann Machine cleaning step. 58 | 59 | Creating a Restricted Boltzmann Machine cleaning step. 60 | 61 | Once you create your steps they are added to a processing pipeline which can be 62 | viewed in the "Pipeline" widget. 63 | 64 | .. figure:: https://user-images.githubusercontent.com/29061040/37829003-4488afda-2e95-11e8-9995-9ebc1348d2bf.png 65 | :width: 40 % 66 | :alt: A data cleaning pipeline. 67 | 68 | A data cleaning pipeline. 69 | 70 | These steps can be modified or deleted using these controls, and when ready the 71 | pipeline can be executed on the dataframe or output to code. Executing your 72 | pipeline will create a new DataFrame with the suffix "_cleaned" in your kernel, 73 | while exporting will create a new code cell in your notebook defining a python 74 | function which will carry out the pipeline cleaning steps. 75 | 76 | .. figure:: https://user-images.githubusercontent.com/29061040/37829131-bf920dd4-2e95-11e8-9e77-aaa3533c2095.png 77 | :width: 40 % 78 | :alt: An exported pipeline. 79 | 80 | An exported pipeline. 81 | 82 | Caveats 83 | ------- 84 | 85 | Duplicated or non string column names are not supported. 86 | 87 | For DataFrames over 1000 rows, a sample of 1000 rows will be used for 88 | previewing and creating your processing pipeline, with the whole DataFrame only 89 | operated on when the pipeline is executed. 90 | -------------------------------------------------------------------------------- /dataclean/static/main.css: -------------------------------------------------------------------------------- 1 | 2 | .datacleaner { 3 | max-height: 500px; 4 | min-height: 100px; 5 | display:inline-block; 6 | font-size: 80%; 7 | padding: 0px; 8 | overflow-y: auto; 9 | font-weight: normal; 10 | color: #333333; 11 | white-space: nowrap; 12 | overflow-x: auto; 13 | } 14 | 15 | .datacleaner-float-wrapper { 16 | position: fixed !important; 17 | top: 120px; 18 | width:700px; 19 | right: 20px; 20 | border: thin solid rgba(0, 0, 0, 0.38); 21 | border-radius: 5px; 22 | padding:10px; 23 | background-color: #fff; 24 | opacity: .95; 25 | z-index: 100; 26 | overflow: auto; 27 | } 28 | 29 | .hide-btn{ 30 | float: right; 31 | } 32 | 33 | .reload-btn{ 34 | float: right; 35 | } 36 | 37 | .kill-btn{ 38 | float: right; 39 | } 40 | 41 | .col-md-9 { 42 | overflow:hidden; 43 | margin-left: 14%; 44 | width: 80%} 45 | 46 | #datacleaner-wrapper.closed { 47 | min-width: 250px; 48 | width: auto; 49 | transition: width; 50 | } 51 | #datacleaner-wrapper:hover{ 52 | opacity: 1; 53 | } 54 | #datacleaner-wrapper .header { 55 | font-size: 16px; 56 | font-weight: bold; 57 | } 58 | #datacleaner-wrapper .hide-btn { 59 | font-size: 14px; 60 | font-family: monospace; 61 | } 62 | 63 | #datacleaner-wrapper .reload-btn { 64 | font-size: 14px; 65 | font-family: monospace; 66 | } 67 | 68 | #datacleaner-wrapper .kill-btn { 69 | font-size: 14px; 70 | font-family: monospace; 71 | } 72 | 73 | 74 | 75 | /* don't waste so much screen space... */ 76 | #datacleaner-wrapper .toc-item{ 77 | padding-left: 20px; 78 | } 79 | 80 | #datacleaner-wrapper .toc-item .toc-item{ 81 | padding-left: 10px; 82 | } 83 | 84 | 85 | table.table, table.table tr, table.table td, table.table th { 86 | border: 0; 87 | } 88 | table.table-nonfluid { 89 | width: auto !important; 90 | } 91 | table.table { 92 | margin-left: 0; 93 | margin-right: 0; 94 | } 95 | /* tablesorter */ 96 | .tablesorter-default .header, 97 | .tablesorter-default .tablesorter-header { 98 | background-image: url(); 99 | background-position: right center; 100 | background-repeat: no-repeat; 101 | cursor: pointer; 102 | padding-right: 20px; 103 | } 104 | .tablesorter-default thead .headerSortUp, 105 | .tablesorter-default thead .tablesorter-headerSortUp, 106 | .tablesorter-default thead .tablesorter-headerAsc { 107 | background-image: url(); 108 | } 109 | .tablesorter-default thead .headerSortDown, 110 | .tablesorter-default thead .tablesorter-headerSortDown, 111 | .tablesorter-default thead .tablesorter-headerDesc { 112 | background-image: url(); 113 | } 114 | .tablesorter-default thead .sorter-false { 115 | background-image: none; 116 | cursor: default; 117 | padding-right: 5px; 118 | } 119 | 120 | .arrow-down:before { 121 | content: " "; 122 | display: inline-block; 123 | vertical-align: middle; 124 | width: 0; 125 | height: 0; 126 | border-left: 4px solid transparent; 127 | border-right: 4px solid transparent; 128 | border-top: 4px solid #888; 129 | margin-right: 4px; 130 | } 131 | 132 | .arrow-right:before { 133 | content: " "; 134 | display: inline-block; 135 | vertical-align: middle; 136 | width: 0; 137 | height: 0; 138 | border-top: 4px solid transparent; 139 | border-left: 4px solid #888; 140 | border-bottom: 4px solid transparent; 141 | margin-right: 4px; 142 | } 143 | /* notification badge */ 144 | .iosb { 145 | position: absolute; 146 | z-index: 20; 147 | background: #fff; } 148 | 149 | .iosb-content { 150 | text-align: center; 151 | font-weight: 700; 152 | font-family: monospace, sans-serif; } 153 | 154 | .iosb-grey { 155 | background-color: #3a3a3a; 156 | background-image: -webkit-gradient(linear, left top, left bottom, from(#868686), to(#3a3a3a)); 157 | background-image: -webkit-linear-gradient(#868686, #3a3a3a); 158 | background-image: -moz-linear-gradient(#868686, #3a3a3a); 159 | background-image: -o-linear-gradient(#868686, #3a3a3a); 160 | background-image: linear-gradient(#868686, #3a3a3a); } 161 | .iosb-grey .iosb-content { 162 | color: #fff; 163 | text-shadow: 1px -1px 1px #474747; } 164 | 165 | .iosb-ios { 166 | background-color: #4a6c9b; 167 | background-image: -webkit-gradient(linear, left top, left bottom, from(#849cbb), to(#4a6c9b)); 168 | background-image: -webkit-linear-gradient(#849cbb, #4a6c9b); 169 | background-image: -moz-linear-gradient(#849cbb, #4a6c9b); 170 | background-image: -o-linear-gradient(#849cbb, #4a6c9b); 171 | background-image: linear-gradient(#849cbb, #4a6c9b); } 172 | .iosb-ios .iosb-content { 173 | color: #fff; 174 | text-shadow: 1px -1px 1px #626a76; } 175 | 176 | /* END themes */ 177 | /* BEGIN sizes */ 178 | .iosb-20 { 179 | -webkit-box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset; 180 | -moz-box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset; 181 | box-shadow: 0 1px 2px rgba(68, 68, 68, 0.8), 0 1px rgba(255, 255, 255, 0.3) inset; 182 | min-width: 15px; 183 | height: 15px; } 184 | .iosb-20, .iosb-20 .iosb-inner { 185 | -moz-border-radius: 7px; 186 | border-radius: 7px; } 187 | .iosb-20 .iosb-inner { 188 | margin: 1px; 189 | min-width: 13px; 190 | height: 13px; } 191 | .iosb-20 .iosb-content { 192 | padding: 0 5px; 193 | line-height: 13px; 194 | height: 13px; } 195 | .iosb-20.iosb-top-left { 196 | top: -5px; 197 | left: -5px; } 198 | .iosb-20.iosb-top-right { 199 | top: -5px; 200 | right: -5px; } 201 | .iosb-20.iosb-bottom-left { 202 | bottom: -5px; 203 | left: -5px; } 204 | .iosb-20.iosb-bottom-right { 205 | bottom: -5px; 206 | right: -5px; } 207 | .iosb-20 .iosb-string { 208 | font-size: 8px; } 209 | .iosb-20 .iosb-number { 210 | font-size: 9px; } 211 | 212 | /* END sizes */ -------------------------------------------------------------------------------- /dataclean/codegen.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import builtins 4 | import re 5 | from collections import namedtuple 6 | 7 | try: 8 | from inspect import signature 9 | except ImportError: # Python 2 10 | from funcsigs import signature 11 | 12 | from inspect import getsourcelines, ismethod, isclass, isfunction, ismodule 13 | from textwrap import dedent 14 | 15 | 16 | def indent(text, prefix): 17 | """Adds 'prefix' to the beginning of lines in 'text'.""" 18 | 19 | def prefixed_lines(): 20 | for line in text.splitlines(True): 21 | yield (prefix + line if line.strip() else line) 22 | 23 | return "".join(prefixed_lines()) 24 | 25 | 26 | ClosureVars = namedtuple("ClosureVars", "nonlocals globals builtins unbound") 27 | 28 | 29 | def getclosurevars(func): 30 | """ 31 | Get the mapping of free variables to their current values. 32 | 33 | Returns a named tuple of dicts mapping the current nonlocal, global 34 | and builtin references as seen by the body of the function. A final 35 | set of unbound names that could not be resolved is also provided. 36 | """ 37 | # From the Python3 inspect module, vendored here for Python2 compatibility 38 | 39 | if ismethod(func): 40 | func = func.__func__ 41 | 42 | if not isfunction(func): 43 | raise TypeError("'{!r}' is not a Python function".format(func)) 44 | 45 | code = func.__code__ 46 | # Nonlocal references are named in co_freevars and resolved 47 | # by looking them up in __closure__ by positional index 48 | if func.__closure__ is None: 49 | nonlocal_vars = {} 50 | else: 51 | nonlocal_vars = { 52 | var: cell.cell_contents 53 | for var, cell in zip(code.co_freevars, func.__closure__) 54 | } 55 | 56 | # Global and builtin references are named in co_names and resolved 57 | # by looking them up in __globals__ or __builtins__ 58 | global_ns = func.__globals__ 59 | builtin_ns = global_ns.get("__builtins__", builtins.__dict__) 60 | if ismodule(builtin_ns): 61 | builtin_ns = builtin_ns.__dict__ 62 | global_vars = {} 63 | builtin_vars = {} 64 | unbound_names = set() 65 | for name in code.co_names: 66 | if name in ("None", "True", "False"): 67 | # Because these used to be builtins instead of keywords, they 68 | # may still show up as name references. We ignore them. 69 | continue 70 | try: 71 | global_vars[name] = global_ns[name] 72 | except KeyError: 73 | try: 74 | builtin_vars[name] = builtin_ns[name] 75 | except KeyError: 76 | unbound_names.add(name) 77 | 78 | return ClosureVars(nonlocal_vars, global_vars, builtin_vars, unbound_names) 79 | 80 | 81 | CODE_INDENT = " " 82 | 83 | EXPORT_FUNCTION_SIGNATURE = "def exported_pipeline(df):\n" 84 | 85 | STEP_CODE_PREFIX = indent("\ndataframe = df.copy()\n\n", CODE_INDENT) 86 | 87 | STEP_CODE_SUFFIX = indent("return dataframe", CODE_INDENT) 88 | 89 | 90 | def replace(string, substitutions): 91 | """Replaces all substitutions in one pass to avoid conflicts""" 92 | 93 | substrings = sorted(substitutions, key=len, reverse=True) 94 | regex = re.compile("|".join(map(re.escape, substrings))) 95 | return regex.sub(lambda match: substitutions[match.group(0)], string) 96 | 97 | 98 | def render_code(function, **params): 99 | """ 100 | Generate the code of a function with text replacement of arguments. 101 | 102 | Renders the code of a python function applying textual substitutions of 103 | input arguments with their repr/value. 104 | 105 | Parameters 106 | ---------- 107 | function : function 108 | Python function to render. 109 | 110 | This function should have any code to be output within lines [2:-1] of 111 | the code as written. For intended usage this means that the signature, 112 | the one line docstring, and the return statement are ommitted when 113 | rendering. One should also take care the function does not use text 114 | which may clash with substitutions made when calling this function. 115 | 116 | Returns 117 | ------- 118 | str 119 | The text of the input function with arguments replaced, indented once. 120 | """ 121 | 122 | substitutions = {} 123 | comment = "" 124 | 125 | if "code_comment" in params: 126 | for line in params["code_comment"].split("\n"): 127 | comment += "# " + line + "\n" 128 | 129 | code = getsourcelines(function) 130 | 131 | # [2:-1] slice removes signature, docstring and return statement 132 | code = dedent("".join(code[0][2:-1])) 133 | 134 | for arg_name in signature(function).parameters.keys(): 135 | if arg_name in params: 136 | # repr of a type, e.g. repr(int) doesn't produce valid python 137 | if isinstance(params[arg_name], type): 138 | substitutions[arg_name] = params[arg_name].__name__ 139 | else: 140 | substitutions[arg_name] = repr(params[arg_name]) 141 | 142 | if substitutions: 143 | code = replace(code, substitutions) 144 | 145 | return indent(comment + code, CODE_INDENT) 146 | 147 | 148 | def get_module_dependencies(function): 149 | """ 150 | Generate the import statements required for a function 151 | 152 | Parameters 153 | ---------- 154 | function : function 155 | Python function for which to generate import statements. 156 | 157 | Returns 158 | ------- 159 | import list: list of str 160 | The import statements required for a function, indented once. 161 | 162 | For any closure variables not themselves a module or imported from one, 163 | the generated statement will attempt to bind the repr() of the variable 164 | to the variable name. 165 | """ 166 | 167 | import_list = [] 168 | import_statement = None 169 | 170 | for name, imported in getclosurevars(function).globals.items(): 171 | 172 | if hasattr(imported, "__module__"): 173 | import_statement = "from {0} import {1}".format( 174 | imported.__module__, imported.__name__ 175 | ) 176 | 177 | if imported.__name__ != name: 178 | import_statement += " as {0}".format(name) 179 | 180 | import_statement += "\n" 181 | 182 | elif ismodule(imported): 183 | import_statement = "import {0}".format(imported.__name__) 184 | 185 | if imported.__name__ != name: 186 | import_statement += " as {0}".format(name) 187 | 188 | import_statement += "\n" 189 | 190 | else: 191 | import_statement = "{0} = {1}\n".format(name, repr(imported)) 192 | 193 | if import_statement: 194 | import_list.append(indent(import_statement, CODE_INDENT)) 195 | 196 | return import_list 197 | -------------------------------------------------------------------------------- /dataclean/pipeline.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractproperty 2 | 3 | import boltzmannclean 4 | 5 | import dataclean.codegen as codegen 6 | from dataclean.cleaning import ( 7 | OUTLIER_REMOVAL_METHODS, 8 | NULL_REMOVAL_METHODS, 9 | TYPE_CONVERT_METHODS, 10 | ) 11 | 12 | 13 | class DataCleanStepBase(object): 14 | """Base class for a cleaning step to be applied to a dataframe""" 15 | 16 | __metaclass__ = ABCMeta 17 | 18 | def __init__(self, **params): 19 | self.params = params 20 | 21 | @abstractproperty 22 | def cleaning_function(self): 23 | pass 24 | 25 | def execute(self, dataframe, preview=True): 26 | return self.cleaning_function( 27 | dataframe.copy() if preview else dataframe, **self.params 28 | ) 29 | 30 | @abstractproperty 31 | def description(self): 32 | """Return a human readable brief description of the step""" 33 | pass 34 | 35 | def render_code(self): 36 | return codegen.render_code( 37 | function=self.cleaning_function, 38 | code_comment=self.description, 39 | **self.params 40 | ) 41 | 42 | def required_import_statements(self): 43 | return codegen.get_module_dependencies(self.cleaning_function) 44 | 45 | 46 | class OutlierRemovalStep(DataCleanStepBase): 47 | """A step to handle outliers in a numerical dataframe column""" 48 | 49 | def __init__(self, **params): 50 | super(OutlierRemovalStep, self).__init__(**params) 51 | self.colname = self.params["colname"] 52 | self.low_cut = self.params["low_cut"] 53 | self.high_cut = self.params["high_cut"] 54 | self.replacement_method = self.params.pop("replacement_method") 55 | 56 | @property 57 | def cleaning_function(self): 58 | return OUTLIER_REMOVAL_METHODS[self.replacement_method] 59 | 60 | @property 61 | def description(self): 62 | description = ( 63 | "On {colname}, " 64 | "for values outside {low_cut} to {high_cut}, {replacement_method}" 65 | ).format( 66 | colname=self.colname, 67 | low_cut=self.low_cut, 68 | high_cut=self.high_cut, 69 | replacement_method=self.replacement_method.value, 70 | ) 71 | 72 | return description 73 | 74 | 75 | class NullRemovalStep(DataCleanStepBase): 76 | """A step to handle null values in a dataframe column""" 77 | 78 | def __init__(self, **params): 79 | super(NullRemovalStep, self).__init__(**params) 80 | self.colname = self.params["colname"] 81 | self.replacement_method = self.params.pop("replacement_method") 82 | 83 | @property 84 | def cleaning_function(self): 85 | return NULL_REMOVAL_METHODS[self.replacement_method] 86 | 87 | @property 88 | def description(self): 89 | description = ( 90 | "On {colname}, " + "for missing values, {replacement_method}" 91 | ).format( 92 | colname=self.colname, 93 | replacement_method=self.replacement_method.value, 94 | ) 95 | 96 | return description 97 | 98 | 99 | class TypeConversionStep(DataCleanStepBase): 100 | """A step to handle mistyped values in a dataframe column""" 101 | 102 | def __init__(self, **params): 103 | super(TypeConversionStep, self).__init__(**params) 104 | self.colname = self.params["colname"] 105 | self.data_type = self.params["data_type"] 106 | self.replacement_method = self.params.pop("replacement_method") 107 | 108 | @property 109 | def cleaning_function(self): 110 | return TYPE_CONVERT_METHODS[self.replacement_method] 111 | 112 | @property 113 | def description(self): 114 | description = ( 115 | "On {colname}, " 116 | + "for non {data_type} types, {replacement_method}" 117 | ).format( 118 | colname=self.colname, 119 | replacement_method=self.replacement_method.value, 120 | data_type=self.data_type.__name__, 121 | ) 122 | 123 | return description 124 | 125 | 126 | class RbmStep(DataCleanStepBase): 127 | """A step to fill missing values with a Restricted Boltzmann Machine""" 128 | 129 | def __init__(self, **params): 130 | super(RbmStep, self).__init__(**params) 131 | self.numerical_columns = self.params["numerical_columns"] 132 | self.categorical_columns = self.params["categorical_columns"] 133 | 134 | @property 135 | def cleaning_function(self): 136 | return boltzmannclean.clean 137 | 138 | def execute(self, dataframe, preview=True): 139 | return self.cleaning_function( 140 | dataframe.copy() if preview else dataframe, 141 | tune_rbm=not preview, 142 | **self.params 143 | ) 144 | 145 | @property 146 | def description(self): 147 | description = ( 148 | "On {num_cols} columns, " + "impute values, with an RBM" 149 | ).format( 150 | num_cols=len(self.numerical_columns + self.categorical_columns) 151 | ) 152 | 153 | return description 154 | 155 | def render_code(self): 156 | return codegen.render_code( 157 | function=self.cleaning_function, 158 | tune_rbm=True, 159 | code_comment=self.description, 160 | **self.params 161 | ) 162 | 163 | 164 | class Pipeline(object): 165 | """Keeps track of which cleaning step the user wishes to apply.""" 166 | 167 | def __init__(self): 168 | self.steps = [] 169 | 170 | def append(self, step): 171 | self.steps.append(step) 172 | 173 | def remove(self, step): 174 | self.steps.remove(step) 175 | 176 | def replace(self, old_step, new_step): 177 | if old_step in self.steps: 178 | index = self.steps.index(old_step) 179 | self.steps.remove(old_step) 180 | self.steps.insert(index, new_step) 181 | 182 | def execute(self, dataframe, up_to_step=None, preview=True): 183 | """Executes the current pipeline up to up_to_step on dataframe""" 184 | 185 | new_dataframe = dataframe 186 | 187 | for step in self.steps: 188 | if step is up_to_step: 189 | break 190 | new_dataframe = step.execute(new_dataframe, preview) 191 | # avoids the unnecessary pandas SettingWithCopy warning 192 | new_dataframe.is_copy = False 193 | 194 | return new_dataframe 195 | 196 | def export(self): 197 | """Returns the python code making up the pipeline""" 198 | 199 | code = "" 200 | imports = [] 201 | 202 | for step in self.steps: 203 | code += step.render_code() 204 | imports += step.required_import_statements() 205 | 206 | export_code = codegen.EXPORT_FUNCTION_SIGNATURE 207 | 208 | for import_statement in sorted(set(imports)): 209 | export_code += import_statement 210 | 211 | export_code += ( 212 | codegen.STEP_CODE_PREFIX + code + codegen.STEP_CODE_SUFFIX 213 | ) 214 | 215 | return export_code 216 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 ASI Data Science. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | -------------------------------------------------------------------------------- /dataclean/manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from base64 import b64encode 4 | 5 | import ipywidgets 6 | from IPython.display import Javascript, display 7 | from IPython.utils.py3compat import str_to_bytes, bytes_to_str 8 | from pandas import DataFrame 9 | 10 | from dataclean.pipeline import Pipeline 11 | from dataclean.widget import ( 12 | CallbackManager, 13 | ColumnWidgetController, 14 | DataFrameWidgetController, 15 | PipelineWidgetController, 16 | ) 17 | 18 | 19 | def create_new_code_cell(code): 20 | """Javascript to create and populate a new code cell in the notebook""" 21 | encoded_code = bytes_to_str(b64encode(str_to_bytes(code))) 22 | display( 23 | Javascript( 24 | """ 25 | var code_cell = IPython.notebook.insert_cell_below('code'); 26 | code_cell.set_text(atob("{0}")); 27 | """.format( 28 | encoded_code 29 | ) 30 | ) 31 | ) 32 | 33 | 34 | def display_colwidget(col_id): 35 | """Javascript to display a collapsed column widget""" 36 | display( 37 | Javascript( 38 | """ 39 | if ($('#{0}_row').hasClass('hidden')){{$('#{0}').click()}} 40 | """.format( 41 | str(col_id) 42 | ) 43 | ) 44 | ) 45 | 46 | 47 | class DataCleaner(object): 48 | """Keeps track of DataFrames in the user's kernel""" 49 | 50 | def __init__(self): 51 | self.dataframe_managers = {} 52 | self._main = sys.modules["__main__"] 53 | self.refresh() 54 | 55 | def refresh(self): 56 | dataframe_managers_new = {} 57 | for var_name, var in vars(self._main).items(): 58 | if isinstance(var, DataFrame) and not var_name.startswith("_"): 59 | manager = self._manager_for_dataframe(var, var_name) 60 | dataframe_managers_new[id(var)] = manager 61 | 62 | self.dataframe_managers = dataframe_managers_new 63 | 64 | def dataframe_metadata(self): 65 | self.refresh() 66 | metadata = [] 67 | for manager in self.dataframe_managers.values(): 68 | metadata.append(manager.metadata()) 69 | return json.dumps(metadata) 70 | 71 | def manager_for_id(self, dataframe_id): 72 | return self.dataframe_managers[dataframe_id] 73 | 74 | def _manager_for_dataframe(self, dataframe, name): 75 | for manager in self.dataframe_managers.values(): 76 | if manager.full_dataframe is dataframe: 77 | manager.name = name 78 | break 79 | else: 80 | manager = DataframeManager(dataframe, name) 81 | 82 | def export_cleaned_dataframe(new_dataframe, dataframe_name): 83 | new_df_name = dataframe_name + "_cleaned" 84 | suffix = 0 85 | 86 | # ensures we have a unique name 87 | while getattr(self._main, new_df_name, None) is not None: 88 | suffix += 1 89 | new_df_name = dataframe_name + "_cleaned_" + str(suffix) 90 | 91 | setattr(self._main, new_df_name, new_dataframe) 92 | 93 | def export_to_code(code): 94 | create_new_code_cell(code) 95 | 96 | manager.execute_callback.register_callback( 97 | export_cleaned_dataframe 98 | ) 99 | manager.export_callback.register_callback(export_to_code) 100 | 101 | return manager 102 | 103 | 104 | class DataframeManager(object): 105 | """Manages the widget controller classes for a single DataFrame""" 106 | 107 | MAX_ROWS = 1000 108 | 109 | def __init__(self, dataframe, name): 110 | self.name = name 111 | self.column_widget_controller_by_id = {} 112 | self._pipeline_widget_controller = None 113 | self._dataframe_widget_controller = None 114 | 115 | self.execute_callback = CallbackManager() 116 | self.export_callback = CallbackManager() 117 | 118 | self.full_dataframe = dataframe 119 | 120 | if dataframe.shape[0] > self.MAX_ROWS: 121 | self.dataframe = dataframe.sample(n=self.MAX_ROWS) 122 | self.is_sample = True 123 | else: 124 | self.dataframe = dataframe 125 | self.is_sample = False 126 | 127 | if not (dataframe.columns.is_unique and dataframe.index.is_unique): 128 | self.dataframe = DataFrame({"_": []}) 129 | 130 | self.pipeline = Pipeline() 131 | self.active_step = None 132 | 133 | self.column_by_id = {} 134 | for colname, column in self.dataframe.items(): 135 | self.column_by_id[id(column)] = self.dataframe[colname] 136 | 137 | def metadata(self): 138 | metadata = { 139 | "dfName": self.name, 140 | "dfId": id(self.full_dataframe), 141 | "dfShape": self.full_dataframe.shape, 142 | "dfColnames": sorted( 143 | self.full_dataframe.columns.to_series().apply(str) 144 | ), 145 | "dfCols": [ 146 | { 147 | "colname": colname, 148 | "colId": id(self.full_dataframe[colname]), 149 | "description": { 150 | "dtype": str(column.dtype), 151 | "null_percentage": "{0:.0f}%".format( 152 | 100 * column.isnull().sum() / float(len(column)) 153 | if len(column) > 0 154 | else 0 155 | ), 156 | "distinct": len(column.value_counts()), 157 | }, 158 | } 159 | for colname, column in self.full_dataframe.items() 160 | ], 161 | } 162 | return metadata 163 | 164 | @property 165 | def dataframe_widget(self): 166 | if self._dataframe_widget_controller is None: 167 | self._dataframe_widget_controller = DataFrameWidgetController( 168 | self.pipeline_widget, self.MAX_ROWS if self.is_sample else 0 169 | ) 170 | 171 | def resample(): 172 | self.dataframe = self.full_dataframe.sample(n=self.MAX_ROWS) 173 | self._refresh_colwidgets() 174 | 175 | self._dataframe_widget_controller.resample_callback.register_callback( 176 | resample 177 | ) 178 | self._dataframe_widget_controller.new_step_callback.register_callback( 179 | self._new_step 180 | ) 181 | self._dataframe_widget_controller.modify_step_callback.register_callback( 182 | self._replace_active_step 183 | ) 184 | 185 | if self.dataframe.equals(DataFrame({"_": []})): 186 | widget = ipywidgets.Label( 187 | value=( 188 | "DataFrames with non-unique column names or index are " 189 | "unsupported." 190 | ), 191 | layout=ipywidgets.Layout(width="600px"), 192 | ) 193 | elif self.dataframe.empty: 194 | widget = ipywidgets.Label(value=("DataFrame is empty.")) 195 | else: 196 | widget = self._dataframe_widget_controller.render_widget( 197 | self.dataframe, self.active_step 198 | ) 199 | 200 | return widget 201 | 202 | @property 203 | def pipeline_widget(self): 204 | if self._pipeline_widget_controller is None: 205 | self._pipeline_widget_controller = PipelineWidgetController( 206 | self.pipeline, self.name 207 | ) 208 | 209 | def enter_edit_mode(active_step): 210 | self._refresh_colwidgets(step=active_step) 211 | self.active_step = active_step 212 | if hasattr(active_step, "colname"): 213 | display_colwidget( 214 | id(self.full_dataframe[active_step.colname]) 215 | ) 216 | 217 | def enter_add_mode(): 218 | self._refresh_colwidgets() 219 | self.active_step = None 220 | 221 | def execute_pipeline(): 222 | new_dataframe = self.pipeline.execute( 223 | self.full_dataframe.copy(), preview=False 224 | ) 225 | self.execute_callback.send_callbacks(new_dataframe, self.name) 226 | 227 | def export_pipeline(): 228 | code = self.pipeline.export() 229 | self.export_callback.send_callbacks(code) 230 | 231 | self._pipeline_widget_controller.add_mode_callback.register_callback( 232 | enter_add_mode 233 | ) 234 | self._pipeline_widget_controller.edit_mode_callback.register_callback( 235 | enter_edit_mode 236 | ) 237 | 238 | self._pipeline_widget_controller.execute_callback.register_callback( 239 | execute_pipeline 240 | ) 241 | self._pipeline_widget_controller.export_callback.register_callback( 242 | export_pipeline 243 | ) 244 | 245 | self._pipeline_widget_controller.delete_step_callback.register_callback( 246 | self._delete_step 247 | ) 248 | 249 | return self._pipeline_widget_controller.render_widget(self.active_step) 250 | 251 | def column_widget(self, col_id): 252 | if self.dataframe.empty: 253 | widget = ipywidgets.Label(value="") 254 | else: 255 | if col_id in self.column_widget_controller_by_id: 256 | col_widget_controller = self.column_widget_controller_by_id[ 257 | col_id 258 | ] 259 | else: 260 | column = self.column_by_id[col_id] 261 | 262 | col_widget_controller = ColumnWidgetController() 263 | col_widget_controller.load_data( 264 | column, self.dataframe, self.active_step 265 | ) 266 | 267 | self.column_widget_controller_by_id[ 268 | col_id 269 | ] = col_widget_controller 270 | 271 | col_widget_controller.new_step_callback.register_callback( 272 | self._new_step 273 | ) 274 | col_widget_controller.modify_step_callback.register_callback( 275 | self._replace_active_step 276 | ) 277 | 278 | widget = col_widget_controller.render_widget() 279 | 280 | return widget 281 | 282 | def _refresh_colwidgets(self, step=None): 283 | new_dataframe = self.pipeline.execute(self.dataframe, up_to_step=step) 284 | for ( 285 | col_id, 286 | col_widget_controller, 287 | ) in self.column_widget_controller_by_id.items(): 288 | col_widget_controller.load_data( 289 | new_dataframe[self.column_by_id[col_id].name], 290 | new_dataframe, 291 | step, 292 | ) 293 | col_widget_controller.render_widget() 294 | self._dataframe_widget_controller.render_widget(new_dataframe, step) 295 | 296 | def _new_step(self, new_step): 297 | self.pipeline.append(new_step) 298 | if self._pipeline_widget_controller: 299 | self._pipeline_widget_controller.render_widget() 300 | self._dataframe_widget_controller.display_pipeline() 301 | self._refresh_colwidgets() 302 | 303 | def _replace_active_step(self, modified_step): 304 | self.pipeline.replace(self.active_step, modified_step) 305 | self.active_step = None 306 | if self._pipeline_widget_controller: 307 | self._pipeline_widget_controller.render_widget() 308 | self._dataframe_widget_controller.display_pipeline() 309 | self._refresh_colwidgets() 310 | 311 | def _delete_step(self, step): 312 | self.pipeline.remove(step) 313 | self._pipeline_widget_controller.render_widget() 314 | self.active_step = None 315 | self._refresh_colwidgets() 316 | -------------------------------------------------------------------------------- /dataclean/cleaning.py: -------------------------------------------------------------------------------- 1 | from builtins import int 2 | from enum import Enum 3 | 4 | from sklearn.neighbors import KernelDensity 5 | 6 | 7 | def outlier_removal_mean(dataframe, colname, low_cut, high_cut): 8 | """Replace outliers with the mean on dataframe[colname]""" 9 | 10 | col = dataframe[colname] 11 | 12 | col_numerics = col.loc[ 13 | col.apply( 14 | lambda x: isinstance(x, (int, float)) 15 | and (x >= low_cut and x <= high_cut) 16 | ) 17 | ] 18 | 19 | dataframe.loc[ 20 | col.apply( 21 | lambda x: isinstance(x, (int, float)) 22 | and (x < low_cut or x > high_cut) 23 | ), 24 | colname, 25 | ] = col_numerics.mean() 26 | 27 | return dataframe 28 | 29 | 30 | def outlier_removal_null(dataframe, colname, low_cut, high_cut): 31 | """Replace outliers with empty values on dataframe[colname]""" 32 | 33 | col = dataframe[colname] 34 | 35 | dataframe.loc[ 36 | col.apply( 37 | lambda x: isinstance(x, (int, float)) 38 | and (x < low_cut or x > high_cut) 39 | ), 40 | colname, 41 | ] = None 42 | 43 | return dataframe 44 | 45 | 46 | def outlier_removal_median(dataframe, colname, low_cut, high_cut): 47 | """Replace outliers with the median on dataframe[colname]""" 48 | 49 | col = dataframe[colname] 50 | 51 | col_numerics = col.loc[ 52 | col.apply( 53 | lambda x: isinstance(x, (int, float)) 54 | and (x >= low_cut and x <= high_cut) 55 | ) 56 | ] 57 | 58 | dataframe.loc[ 59 | col.apply( 60 | lambda x: isinstance(x, (int, float)) 61 | and (x < low_cut or x > high_cut) 62 | ), 63 | colname, 64 | ] = col_numerics.median() 65 | 66 | return dataframe 67 | 68 | 69 | def outlier_removal_mode_numeric(dataframe, colname, low_cut, high_cut): 70 | """Replace outliers with the modal numeric value on dataframe[colname]""" 71 | 72 | col = dataframe[colname] 73 | 74 | col_numerics = col.loc[ 75 | col.apply( 76 | lambda x: isinstance(x, (int, float)) 77 | and (x >= low_cut and x <= high_cut) 78 | ) 79 | ] 80 | 81 | dataframe.loc[ 82 | col.apply( 83 | lambda x: isinstance(x, (int, float)) 84 | and (x < low_cut or x > high_cut) 85 | ), 86 | colname, 87 | ] = col_numerics.mode().get(0, None) 88 | 89 | return dataframe 90 | 91 | 92 | def outlier_removal_nearest_cut(dataframe, colname, low_cut, high_cut): 93 | """Clip outliers on dataframe[colname]""" 94 | 95 | col = dataframe[colname] 96 | 97 | dataframe.loc[ 98 | col.apply(lambda x: isinstance(x, (int, float)) and x < low_cut), 99 | colname, 100 | ] = low_cut 101 | 102 | dataframe.loc[ 103 | col.apply(lambda x: isinstance(x, (int, float)) and x > high_cut), 104 | colname, 105 | ] = high_cut 106 | 107 | return dataframe 108 | 109 | 110 | def outlier_removal_drop(dataframe, colname, low_cut, high_cut): 111 | """Drop rows with outliers on dataframe[colname]""" 112 | 113 | col = dataframe[colname] 114 | 115 | dataframe = dataframe.loc[ 116 | col.isnull() 117 | | col.apply( 118 | lambda x: not isinstance(x, (int, float)) 119 | or (x >= low_cut and x <= high_cut) 120 | ), 121 | :, 122 | ] 123 | 124 | return dataframe 125 | 126 | 127 | def outlier_removal_sample(dataframe, colname, low_cut, high_cut): 128 | """Replace outliers with samples from a KDE on dataframe[colname]""" 129 | 130 | col = dataframe[colname] 131 | 132 | col_numerics = col.loc[ 133 | col.apply( 134 | lambda x: isinstance(x, (int, float)) 135 | and (x >= low_cut and x <= high_cut) 136 | ) 137 | ] 138 | if col_numerics.empty: 139 | col_numerics[0] = low_cut 140 | col_numerics[1] = high_cut 141 | 142 | kde = KernelDensity() 143 | kde.fit(col_numerics.values.reshape(-1, 1)) 144 | 145 | is_outlier = col.apply( 146 | lambda x: isinstance(x, (int, float)) and (x < low_cut or x > high_cut) 147 | ) 148 | 149 | samples = kde.sample(n_samples=is_outlier.sum()) 150 | 151 | dataframe.loc[is_outlier, colname] = samples.flatten() 152 | 153 | return dataframe 154 | 155 | 156 | def null_removal_mean(dataframe, colname): 157 | """Replace nulls with the mean on dataframe[colname]""" 158 | 159 | col = dataframe[colname] 160 | col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))] 161 | 162 | dataframe[colname] = col.fillna(col_numerics.mean()) 163 | 164 | return dataframe 165 | 166 | 167 | def null_removal_sample(dataframe, colname): 168 | """Replace nulls with samples from a KDE on dataframe[colname]""" 169 | 170 | col = dataframe[colname] 171 | 172 | col_numerics = col.loc[ 173 | col.notnull() & col.apply(lambda x: isinstance(x, (int, float))) 174 | ] 175 | if col_numerics.empty: 176 | col_numerics[0] = 0 177 | 178 | kde = KernelDensity() 179 | kde.fit(col_numerics.values.reshape(-1, 1)) 180 | 181 | samples = kde.sample(n_samples=col.isnull().sum()) 182 | 183 | dataframe.loc[col.isnull(), colname] = samples.flatten() 184 | 185 | return dataframe 186 | 187 | 188 | def null_removal_median(dataframe, colname): 189 | """Replace nulls with the median on dataframe[colname]""" 190 | 191 | col = dataframe[colname] 192 | col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))] 193 | 194 | dataframe[colname] = col.fillna(col_numerics.median()) 195 | 196 | return dataframe 197 | 198 | 199 | def null_removal_mode(dataframe, colname): 200 | """Replace nulls with the mode on dataframe[colname]""" 201 | 202 | col = dataframe[colname] 203 | 204 | dataframe[colname] = col.fillna(col.mode().get(0, None)) 205 | 206 | return dataframe 207 | 208 | 209 | def null_removal_mode_numeric(dataframe, colname): 210 | """Replace nulls with the modal numeric value on dataframe[colname]""" 211 | 212 | col = dataframe[colname] 213 | col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))] 214 | 215 | dataframe[colname] = col.fillna(col_numerics.mode().get(0, None)) 216 | 217 | return dataframe 218 | 219 | 220 | def null_removal_drop(dataframe, colname): 221 | """Drops rows with nulls on dataframe[colname]""" 222 | 223 | dataframe = dataframe.dropna(subset=[colname]) 224 | 225 | return dataframe 226 | 227 | 228 | def type_convert_mean(dataframe, colname, data_type): 229 | """Replace mistyped values with the mean on dataframe[colname]""" 230 | 231 | col = dataframe[colname] 232 | col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))] 233 | 234 | dataframe.loc[ 235 | col.notnull() & col.apply(lambda x: not isinstance(x, data_type)), 236 | colname, 237 | ] = col_numerics.mean() 238 | 239 | return dataframe 240 | 241 | 242 | def type_convert_median(dataframe, colname, data_type): 243 | """Replace mistyped values with the median on dataframe[colname]""" 244 | 245 | col = dataframe[colname] 246 | col_numerics = col.loc[col.apply(lambda x: isinstance(x, (int, float)))] 247 | 248 | dataframe.loc[ 249 | col.notnull() & col.apply(lambda x: not isinstance(x, data_type)), 250 | colname, 251 | ] = col_numerics.median() 252 | 253 | return dataframe 254 | 255 | 256 | def type_convert_mode(dataframe, colname, data_type): 257 | """Replace mistyped values with the modal value on dataframe[colname]""" 258 | 259 | col = dataframe[colname] 260 | col_this_type = col.loc[col.apply(lambda x: isinstance(x, data_type))] 261 | 262 | dataframe.loc[ 263 | col.notnull() & col.apply(lambda x: not isinstance(x, data_type)), 264 | colname, 265 | ] = col_this_type.mode().get(0, None) 266 | 267 | return dataframe 268 | 269 | 270 | def type_convert_cast(dataframe, colname, data_type): 271 | """Tries to cast mistyped values on dataframe[colname]""" 272 | 273 | def try_cast(x): 274 | try: 275 | return data_type(x) 276 | except ValueError: 277 | return x 278 | 279 | dataframe[colname] = dataframe[colname].apply(try_cast) 280 | 281 | return dataframe 282 | 283 | 284 | def type_convert_drop(dataframe, colname, data_type): 285 | """Drops rows with mistyped values with the mean on dataframe[colname]""" 286 | 287 | col = dataframe[colname] 288 | 289 | dataframe = dataframe.loc[ 290 | col.isnull() | col.apply(lambda x: isinstance(x, data_type)), : 291 | ] 292 | 293 | return dataframe 294 | 295 | 296 | def type_convert_sample(dataframe, colname, data_type): 297 | """Replace mistyped values with samples from a KDE on dataframe[colname]""" 298 | 299 | col = dataframe[colname] 300 | 301 | col_numerics = col.loc[ 302 | col.notnull() & col.apply(lambda x: isinstance(x, (int, float))) 303 | ] 304 | if col_numerics.empty: 305 | col_numerics[0] = 0 306 | 307 | kde = KernelDensity() 308 | kde.fit(col_numerics.values.reshape(-1, 1)) 309 | 310 | is_wrong_type = col.apply(lambda x: not isinstance(x, data_type)) 311 | 312 | samples = kde.sample(n_samples=is_wrong_type.sum()) 313 | 314 | dataframe.loc[is_wrong_type, colname] = samples.flatten() 315 | 316 | return dataframe 317 | 318 | 319 | class OutlierRemovalMethod(Enum): 320 | NONE = "Do Nothing" 321 | MEAN = "Replace with Mean (excluding outliers)" 322 | MEDIAN = "Replace with Median (excluding outliers)" 323 | NEAREST_CUT = "Replace with Nearest Cut (Clip)" 324 | MODE_NUMERIC = "Replace with Mode" 325 | SAMPLE = "Sample from Column Distribution" 326 | NULL = "Replace with Null" 327 | DROP = "Drop Rows" 328 | 329 | 330 | class NullRemovalMethod(Enum): 331 | NONE = "Do Nothing" 332 | MEAN = "Replace with Mean" 333 | MEDIAN = "Replace with Median" 334 | MODE = "Replace with Most Common Value" 335 | MODE_NUMERIC = "Replace with Mode" 336 | SAMPLE = "Sample from Column Distribution" 337 | DROP = "Drop Rows" 338 | 339 | 340 | class TypeConvertMethod(Enum): 341 | NONE = "Do Nothing" 342 | CAST = "Try to Cast" 343 | MEAN = "Replace with Mean" 344 | MEDIAN = "Replace with Median" 345 | MODE = "Replace with Most Common Value" 346 | SAMPLE = "Sample from Column Distribution" 347 | DROP = "Drop Rows" 348 | 349 | 350 | # Numeric values are always treated as continuous 351 | class CategoricalTypes(Enum): 352 | CONTINUOUS = "Numeric" 353 | CATEGORICAL = "Categorical" 354 | 355 | 356 | OUTLIER_REMOVAL_METHODS = { 357 | OutlierRemovalMethod.MEAN: outlier_removal_mean, 358 | OutlierRemovalMethod.MEDIAN: outlier_removal_median, 359 | OutlierRemovalMethod.NEAREST_CUT: outlier_removal_nearest_cut, 360 | OutlierRemovalMethod.DROP: outlier_removal_drop, 361 | OutlierRemovalMethod.MODE_NUMERIC: outlier_removal_mode_numeric, 362 | OutlierRemovalMethod.SAMPLE: outlier_removal_sample, 363 | OutlierRemovalMethod.NULL: outlier_removal_null, 364 | OutlierRemovalMethod.NONE: lambda df, *_, **__: df, 365 | } 366 | 367 | NULL_REMOVAL_METHODS = { 368 | NullRemovalMethod.MEAN: null_removal_mean, 369 | NullRemovalMethod.MEDIAN: null_removal_median, 370 | NullRemovalMethod.MODE: null_removal_mode, 371 | NullRemovalMethod.MODE_NUMERIC: null_removal_mode_numeric, 372 | NullRemovalMethod.DROP: null_removal_drop, 373 | NullRemovalMethod.SAMPLE: null_removal_sample, 374 | NullRemovalMethod.NONE: lambda df, *_, **__: df, 375 | } 376 | 377 | TYPE_CONVERT_METHODS = { 378 | TypeConvertMethod.MEAN: type_convert_mean, 379 | TypeConvertMethod.MEDIAN: type_convert_median, 380 | TypeConvertMethod.MODE: type_convert_mode, 381 | TypeConvertMethod.DROP: type_convert_drop, 382 | TypeConvertMethod.CAST: type_convert_cast, 383 | TypeConvertMethod.SAMPLE: type_convert_sample, 384 | TypeConvertMethod.NONE: lambda df, *_, **__: df, 385 | } 386 | 387 | 388 | # Encodes which transformations are allowed for which data types 389 | ALLOWED_TRANSFORMATIONS = { 390 | CategoricalTypes.CONTINUOUS: [ 391 | OutlierRemovalMethod.MEAN, 392 | OutlierRemovalMethod.MEDIAN, 393 | OutlierRemovalMethod.NEAREST_CUT, 394 | OutlierRemovalMethod.DROP, 395 | OutlierRemovalMethod.MODE_NUMERIC, 396 | OutlierRemovalMethod.SAMPLE, 397 | OutlierRemovalMethod.NULL, 398 | OutlierRemovalMethod.NONE, 399 | NullRemovalMethod.MEAN, 400 | NullRemovalMethod.MEDIAN, 401 | NullRemovalMethod.MODE_NUMERIC, 402 | NullRemovalMethod.DROP, 403 | NullRemovalMethod.SAMPLE, 404 | NullRemovalMethod.NONE, 405 | TypeConvertMethod.MEAN, 406 | TypeConvertMethod.MEDIAN, 407 | TypeConvertMethod.MODE, 408 | TypeConvertMethod.DROP, 409 | TypeConvertMethod.CAST, 410 | TypeConvertMethod.SAMPLE, 411 | TypeConvertMethod.NONE, 412 | ], 413 | CategoricalTypes.CATEGORICAL: [ 414 | NullRemovalMethod.MODE, 415 | NullRemovalMethod.DROP, 416 | NullRemovalMethod.NONE, 417 | TypeConvertMethod.DROP, 418 | TypeConvertMethod.CAST, 419 | TypeConvertMethod.MODE, 420 | TypeConvertMethod.NONE, 421 | ], 422 | } 423 | -------------------------------------------------------------------------------- /dataclean/static/iosbadge.js: -------------------------------------------------------------------------------- 1 | /*! iOSBadge - v0.2.0 2 | * http://kristerkari.github.com/iOSBadge/ 3 | * Copyright (c) 2016 Krister Kari; Licensed MIT 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*/ 9 | 10 | (function($, window, document) { 11 | 'use strict'; 12 | 13 | /** 14 | Check if the content is a number 15 | @param content {String|Number} The content 16 | @return {Boolean} true or false depending on if the content is a number 17 | @private 18 | */ 19 | var isNumber; 20 | isNumber = function(content) { 21 | if (typeof content === 'string' || typeof content === 'number') { 22 | if (!isNaN(parseInt(content, 10))) { 23 | return true; 24 | } else { 25 | return false; 26 | } 27 | } else { 28 | return false; 29 | } 30 | }; 31 | 32 | /** 33 | Constructor and plugin settings 34 | 35 | Make sure that the plugin works even without the `new` keyword. 36 | 37 | Check for any user defined settings and initialize the plugin. 38 | @class IOSBadge 39 | @constructor 40 | @example 41 | var badge = new IOSBadge(); 42 | */ 43 | window.IOSBadge = (function() { 44 | function IOSBadge(element, settings) { 45 | if (!(this instanceof IOSBadge)) { 46 | return new IOSBadge(element, settings); 47 | } else if (!element || !(element.nodeType && element.nodeType === 1)) { 48 | throw new Error('You need to pass an element as the first argument to iOSBadge'); 49 | } 50 | this.element = element; 51 | this.settings = settings; 52 | if (element.length && element.length > 1) { 53 | this.element = element[0]; 54 | } 55 | if (settings && typeof settings === 'object') { 56 | this.content = settings.content || 1; 57 | this.size = settings.size || 20; 58 | this.position = settings.position || 'top-right'; 59 | this.namespace = settings.namespace || 'iosb'; 60 | this.theme = settings.theme || 'red'; 61 | } else { 62 | this.content = 1; 63 | this.size = 20; 64 | this.position = 'top-right'; 65 | this.namespace = 'iosb'; 66 | this.theme = 'red'; 67 | } 68 | this._generate(); 69 | } 70 | 71 | 72 | /** 73 | Generate elements used by the plugin. 74 | @method _generate 75 | @private 76 | */ 77 | 78 | IOSBadge.prototype._generate = function() { 79 | this.type = isNumber(this.content) ? 'number' : 'string'; 80 | if (/^(-|\+)\d+/.test(this.content)) { 81 | this.content = this.content.slice(1); 82 | } 83 | this.badgeElem = document.createElement('div'); 84 | this.badgeInner = document.createElement('div'); 85 | this.badgeContent = document.createElement('div'); 86 | this._setContent(this.content); 87 | this._setClasses(this.position, this.size, this.type, this.theme); 88 | this.badgeInner.appendChild(this.badgeContent); 89 | this.badgeElem.appendChild(this.badgeInner); 90 | this.element.appendChild(this.badgeElem); 91 | }; 92 | 93 | 94 | /** 95 | Set jQuery/Zepto options from the user. 96 | @method _setOptions 97 | @param options {Object|String} Plugin options given with jQuery or Zepto. 98 | @private 99 | */ 100 | 101 | IOSBadge.prototype._setOptions = function(options) { 102 | if (options && typeof options === 'object') { 103 | if (options.content != null) { 104 | this.setContent(options.content); 105 | } 106 | if (options.position != null) { 107 | this.setPosition(options.position); 108 | } 109 | if (options.theme != null) { 110 | this.setTheme(options.theme); 111 | } 112 | if (options.size != null) { 113 | this.setSize(options.size); 114 | } 115 | if (options.hide && options.hide === true) { 116 | this.hide(); 117 | } else if (options.show && options.show === true) { 118 | this.show(); 119 | } 120 | } else if (typeof options === 'string') { 121 | if (options.toLowerCase() === 'showbadge') { 122 | this.show(); 123 | } else if (options.toLowerCase() === 'hidebadge') { 124 | this.hide(); 125 | } else { 126 | this.setContent(options); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | Set the content of badge element. 134 | @method _setContent 135 | @param content {Number|String} content for the badge element. 136 | @private 137 | */ 138 | 139 | IOSBadge.prototype._setContent = function(content) { 140 | this.content = content; 141 | this.badgeContent.innerHTML = content; 142 | }; 143 | 144 | 145 | /** 146 | Set the classnames used by the plugin. 147 | @method _setClasses 148 | @param position {String} Badge position. 149 | @param size {String} Badge size. 150 | @param type {String} Badge type (number or string). 151 | @param theme {String} Badge theme. 152 | @private 153 | */ 154 | 155 | IOSBadge.prototype._setClasses = function(position, size, type, theme) { 156 | var namespace; 157 | namespace = this.namespace; 158 | this.badgeElem.className = namespace + " " + namespace + "-" + size + " " + namespace + "-" + position; 159 | this.badgeInner.className = namespace + "-inner " + namespace + "-" + theme; 160 | this.badgeContent.className = namespace + "-content " + namespace + "-" + type; 161 | }; 162 | 163 | 164 | /** 165 | Returns the current content set for badge. Not chainable. 166 | @method getContent 167 | @return {Number|String} Badge content. 168 | @example 169 | badge.getContent(); 170 | */ 171 | 172 | IOSBadge.prototype.getContent = function() { 173 | var badgeContent, badgeContentInt; 174 | badgeContent = this.badgeContent.innerHTML; 175 | badgeContentInt = parseInt(badgeContent, 10); 176 | if (!isNaN(badgeContentInt)) { 177 | return badgeContentInt; 178 | } else { 179 | return badgeContent; 180 | } 181 | }; 182 | 183 | 184 | /** 185 | Set the content of your badge. Content can be a number or a string. 186 | Increase or decrease your current badge number by passing a `'+'` or `'-'` prefixed 187 | number in a string e.g. `.setContent('+7')` 188 | @method setContent 189 | @param content {Number|String} Badge content. 190 | @chainable 191 | @example 192 | badge.setContent(6); 193 | */ 194 | 195 | IOSBadge.prototype.setContent = function(content) { 196 | var amount, firstChar, type; 197 | if (content == null) { 198 | content = 1; 199 | } 200 | type = typeof content; 201 | if (type === 'object' || type === 'function') { 202 | return this; 203 | } else if (this.badgeElem.style.display === 'none') { 204 | this.show(); 205 | } 206 | if (type === 'string') { 207 | firstChar = content.charAt(0); 208 | amount = +content.substring(1) || 0; 209 | if (content === '') { 210 | this._setContent(''); 211 | this.hide(); 212 | return this; 213 | } else if (firstChar === '+') { 214 | this.increaseBy(amount); 215 | return this; 216 | } else if (firstChar === '-') { 217 | this.decreaseBy(amount); 218 | return this; 219 | } else if (isNumber(content)) { 220 | type = 'number'; 221 | } else { 222 | type = 'string'; 223 | } 224 | } else { 225 | type = 'number'; 226 | } 227 | this.type = type; 228 | this._setClasses(this.position, this.size, type, this.theme); 229 | this._setContent(content); 230 | return this; 231 | }; 232 | 233 | 234 | /** 235 | Set the position of your badge. 236 | Positions are: `'top-left'`, `'top-right'`, `'bottom-left'` or `'bottom-right'`. 237 | @method setPosition 238 | @param position {String} Badge position. 239 | @chainable 240 | @example 241 | badge.setPosition('bottom-left'); 242 | */ 243 | 244 | IOSBadge.prototype.setPosition = function(position) { 245 | if (typeof position === 'string') { 246 | this.position = position; 247 | this._setClasses(position, this.size, this.type, this.theme); 248 | } 249 | return this; 250 | }; 251 | 252 | 253 | /** 254 | Set the theme of your badge. 255 | Available default themes are: `'red'`, `'blue'`, `'green'`, `'grey'` and `'ios'`. 256 | Themes can be configured in the `iosbadge.scss` file. 257 | @method setTheme 258 | @param theme {String} Badge theme. 259 | @chainable 260 | @example 261 | badge.setTheme('ios'); 262 | */ 263 | 264 | IOSBadge.prototype.setTheme = function(theme) { 265 | if (typeof theme === 'string') { 266 | this.theme = theme; 267 | this._setClasses(this.position, this.size, this.type, theme); 268 | } 269 | return this; 270 | }; 271 | 272 | 273 | /** 274 | Set the size of your badge. 275 | Available default sizes are: `20`, `22`, `24`, `26`, `28`, `30`, `32`, `34` and `36`. 276 | Sizes can be configured in the `iosbadge.scss` file. 277 | @method setSize 278 | @param size {Number|String} Badge size. 279 | @chainable 280 | @example 281 | badge.setSize(30); 282 | */ 283 | 284 | IOSBadge.prototype.setSize = function(size) { 285 | if (isNumber(size)) { 286 | this.size = parseInt(size, 10); 287 | this._setClasses(this.position, this.size, this.type, this.theme); 288 | } 289 | return this; 290 | }; 291 | 292 | 293 | /** 294 | Decrease the current number in your badge. 295 | @method decreaseBy 296 | @param amount {Number} The amount to decrease by. 297 | @chainable 298 | @example 299 | badge.decreaseBy(2); 300 | */ 301 | 302 | IOSBadge.prototype.decreaseBy = function(amount) { 303 | if (isNumber(amount)) { 304 | this.type = 'number'; 305 | this._setClasses(this.position, this.size, this.type, this.theme); 306 | this._setContent((parseInt(this.content, 10) || 0) - parseInt(amount, 10)); 307 | } 308 | return this; 309 | }; 310 | 311 | 312 | /** 313 | Increase the current number in your badge. 314 | @method increaseBy 315 | @param amount {Number} The amount to increase by. 316 | @chainable 317 | @example 318 | badge.increaseBy(2); 319 | */ 320 | 321 | IOSBadge.prototype.increaseBy = function(amount) { 322 | if (isNumber(amount)) { 323 | this.type = 'number'; 324 | this._setClasses(this.position, this.size, this.type, this.theme); 325 | this._setContent((parseInt(this.content, 10) || 0) + parseInt(amount, 10)); 326 | } 327 | return this; 328 | }; 329 | 330 | 331 | /** 332 | Hide your badge element. 333 | @method hide 334 | @chainable 335 | @example 336 | badge.hide(); 337 | */ 338 | 339 | IOSBadge.prototype.hide = function() { 340 | this.badgeElem.style.display = 'none'; 341 | return this; 342 | }; 343 | 344 | 345 | /** 346 | Show your badge element. 347 | @method hide 348 | @chainable 349 | @example 350 | badge.show(); 351 | */ 352 | 353 | IOSBadge.prototype.show = function() { 354 | this.badgeElem.style.display = 'block'; 355 | return this; 356 | }; 357 | 358 | return IOSBadge; 359 | 360 | })(); 361 | if (typeof $ === 'function') { 362 | $.fn.iosbadge = function(options) { 363 | var iOSBadge; 364 | iOSBadge = 'iosbadge'; 365 | if (typeof options === 'string' && options.toLowerCase() === 'getcontent' && this.data(iOSBadge)) { 366 | return this.data(iOSBadge).getContent(); 367 | } else { 368 | return this.each(function() { 369 | var $self, pluginData, self; 370 | self = this; 371 | $self = $(self); 372 | pluginData = $self.data(iOSBadge); 373 | if (!pluginData) { 374 | $self.data(iOSBadge, new window.IOSBadge(self, options)); 375 | } else { 376 | pluginData._setOptions(options); 377 | } 378 | }); 379 | } 380 | }; 381 | } 382 | })(window.jQuery || window.Zepto, window, window.document); 383 | -------------------------------------------------------------------------------- /dataclean/static/main.js: -------------------------------------------------------------------------------- 1 | define(["require", "jquery", "base/js/namespace", 'services/config', 2 | 'base/js/events', 'base/js/utils', 'notebook/js/codecell', 'notebook/js/outputarea' 3 | ], function(require, $, Jupyter, configmod, events, utils, codecell, outputarea) { 4 | 5 | var Notebook = require('notebook/js/notebook').Notebook; 6 | "use strict"; 7 | var mod_name = "dataclean"; 8 | var log_prefix = '[' + mod_name + '] '; 9 | 10 | var n_dataframes = 0 11 | 12 | // ...........Parameters configuration...................... 13 | // define default values for config parameters if they were not present in general settings (notebook.json) 14 | var cfg = { 15 | 'position' : { 16 | top: '50px' 17 | }, 18 | 'window_display': false, 19 | 'python': { 20 | varRefreshCmd: (`try: 21 | print(_datacleaner.dataframe_metadata()) 22 | except: 23 | print([])`) 24 | .replace(/^ /gm, '') 25 | }, 26 | }; 27 | 28 | //.....................global variables.... 29 | 30 | 31 | var st = {}; 32 | st.config_loaded = false; 33 | st.extension_initialized = false; 34 | 35 | function read_config(cfg, callback) { // read after nb is loaded 36 | // create config object to load parameters 37 | var config = Jupyter.notebook.config; 38 | config.loaded.then(function() { 39 | 40 | cfg = $.extend(true, cfg, config.data.datacleaner); 41 | // then update cfg with some vars found in current notebook metadata 42 | // and save in nb metadata (then can be modified per document) 43 | 44 | // window_display is taken from notebook metadata 45 | if (Jupyter.notebook.metadata.datacleaner) { 46 | if (Jupyter.notebook.metadata.datacleaner.window_display) 47 | cfg.window_display = Jupyter.notebook.metadata.datacleaner.window_display; 48 | if (Jupyter.notebook.metadata.datacleaner.position) 49 | cfg.position = Jupyter.notebook.metadata.datacleaner.position; 50 | } 51 | 52 | cfg = Jupyter.notebook.metadata.datacleaner = $.extend(true, 53 | cfg, Jupyter.notebook.metadata.datacleaner); 54 | 55 | // but cols and kernels_config are taken from system (if defined) 56 | if (config.data.datacleaner) { 57 | if (config.data.datacleaner.kernels_config) { 58 | cfg.kernels_config = $.extend(true, cfg.kernels_config, config.data.datacleaner.kernels_config); 59 | } 60 | } 61 | 62 | // call callbacks 63 | callback && callback(); 64 | st.config_loaded = true; 65 | }); 66 | config.load(); 67 | return cfg; 68 | } 69 | 70 | function toggledatacleaner() { 71 | toggle_datacleaner(cfg, st); 72 | } 73 | 74 | var datacleaner_button = function() { 75 | if (!Jupyter.toolbar) { 76 | events.on("app_initialized.NotebookApp", datacleaner_button); 77 | return; 78 | } 79 | if ($("#datacleaner_button").length === 0) { 80 | Jupyter.toolbar.add_buttons_group([{ 81 | 'label': 'Data Cleaner', 82 | 'icon': 'fa-bar-chart-o', 83 | 'callback': toggledatacleaner, 84 | 'id': 'datacleaner_button' 85 | }]); 86 | } 87 | 88 | require(['nbextensions/sherlockml-dataclean/iosbadge'], 89 | function() { 90 | if ($("#datacleaner_button").find('.iosb').length === 0) { 91 | $("#datacleaner_button").iosbadge({ theme: 'grey', size: 20 }); 92 | } 93 | $("#datacleaner_button").find('.iosb').addClass('hidden'); 94 | }); 95 | }; 96 | 97 | var load_css = function() { 98 | var link = document.createElement("link"); 99 | link.type = "text/css"; 100 | link.rel = "stylesheet"; 101 | link.href = require.toUrl("./main.css"); 102 | document.getElementsByTagName("head")[0].appendChild(link); 103 | }; 104 | 105 | 106 | function html_table(jsonDataframes) { 107 | var dfList = JSON.parse(String(jsonDataframes)); 108 | var table = '
' 109 | +'' 112 | +'' 113 | +'' 114 | +''; 115 | n_dataframes = dfList.length; 116 | 117 | for (var i = 0; i < n_dataframes; i++) { 118 | table += 119 | '' 120 | +'' 125 | + '' 126 | + '' 137 | +'' 138 | + ''; 185 | } 186 | var full_table = table + '
NameShapeColumns
' 122 | + dfList[i].dfName + '' 123 | + dfList[i].dfShape + '' 124 | + dfList[i].dfColnames + '
'; 187 | 188 | return full_table; 189 | } 190 | 191 | function display_widgets(msg, output_wrapper) { 192 | 193 | if (msg.header.msg_type == 'display_data') { 194 | 195 | var output_area = new outputarea.OutputArea({ 196 | config: Jupyter.notebook.config, 197 | selector: output_wrapper, 198 | prompt_area: false, 199 | events: Jupyter.notebook.events, 200 | keyboard_manager: Jupyter.notebook.keyboard_manager, 201 | }); 202 | 203 | output_area.handle_output(msg); 204 | } 205 | 206 | if (msg.header.msg_type == 'error') { 207 | console.warn(log_prefix + msg.content.evalue); 208 | console.warn(log_prefix + msg.content.traceback); 209 | } 210 | } 211 | 212 | function display_column_widget(selector) { 213 | if($('#datacleaner-wrapper').is(':visible')){ 214 | 215 | var dataframe_id = $(selector).attr('data-frame-id'); 216 | 217 | var column_id = $(selector).attr('id'); 218 | 219 | var col_output_wrapper; 220 | 221 | if ($('#'+column_id+'_widget').find('.output').length===0){ 222 | col_output_wrapper = $('
'); 223 | $('#'+column_id+'_row').html(col_output_wrapper); 224 | 225 | Jupyter.notebook.kernel.execute('_datacleaner.dataframe_managers['+dataframe_id+'].column_widget('+column_id+')', 226 | {iopub: { output: function(msg){display_widgets(msg, col_output_wrapper)} } }, { silent: false }); 227 | } 228 | } 229 | 230 | } 231 | 232 | function display_pipeline_widget(selector) { 233 | if($('#datacleaner-wrapper').is(':visible')){ 234 | 235 | var dataframe_id = $(selector).attr('data-frame-id'); 236 | 237 | var pipeline_output_wrapper; 238 | 239 | if ($('#'+dataframe_id+'_widget').find('.output').length===0){ 240 | pipeline_output_wrapper = $('
'); 241 | 242 | $('#'+dataframe_id+'_row').html(pipeline_output_wrapper); 243 | 244 | Jupyter.notebook.kernel.execute('_datacleaner.dataframe_managers['+dataframe_id+'].dataframe_widget', 245 | {iopub: { output: function(msg){display_widgets(msg,pipeline_output_wrapper)} } }, { silent: false }); 246 | } 247 | } 248 | } 249 | 250 | //runs after every code cell execution in case dataframes have been updated 251 | function code_exec_callback(msg) { 252 | if (msg.header.msg_type == 'stream') { 253 | var jsonDataframes = msg.content.text; 254 | if (jsonDataframes === undefined) 255 | datacleaner_init(); 256 | else { 257 | //redraw table 258 | $('#datacleaner').html(html_table(jsonDataframes)); 259 | 260 | if (n_dataframes > 0) { 261 | $("#datacleaner_button").iosbadge({content: n_dataframes}); 262 | $("#datacleaner_button").find('.iosb').removeClass('hidden'); 263 | } else { 264 | $("#datacleaner_button").find('.iosb').addClass('hidden'); 265 | } 266 | 267 | //add click events 268 | $('.tablesorter').delegate('.toggleColumn', 'click' ,function(){ 269 | $(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').toggleClass('hidden'); 270 | $(this).toggleClass('arrow-right'); 271 | $(this).toggleClass('arrow-down'); 272 | display_column_widget(this); 273 | return false; 274 | }); 275 | 276 | $('.tablesorter').on('click', '.toggleDataframe' ,function(){ 277 | $(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').toggleClass('hidden'); 278 | $(this).toggleClass('arrow-right'); 279 | $(this).toggleClass('arrow-down'); 280 | display_pipeline_widget(this) 281 | return false; 282 | }); 283 | 284 | //redisplay already open widgets 285 | $('.toggleColumn').each(function(){ 286 | if (!($(this).closest('tr').nextUntil('tr:not(.tablesorter-childRow)').children('td').hasClass('hidden'))){ 287 | $(this).toggleClass('arrow-right'); 288 | $(this).toggleClass('arrow-down'); 289 | display_column_widget(this) 290 | } 291 | }); 292 | 293 | $('.toggleDataframe').each(function(){ 294 | if (!($(this).closest('tr').next('tr').find('.pipeline_widget').hasClass('hidden'))){ 295 | $(this).toggleClass('arrow-right'); 296 | $(this).toggleClass('arrow-down'); 297 | display_pipeline_widget(this) 298 | } 299 | }); 300 | 301 | } 302 | require(['nbextensions/sherlockml-dataclean/jquery.tablesorter.min'], 303 | function() { 304 | setTimeout(function() { if ($('#datacleaner').length>0) 305 | $('#datacleaner table').tablesorter()}, 100); 306 | }); 307 | } 308 | 309 | if (msg.header.msg_type == 'error') { 310 | console.warn(log_prefix + msg.content.evalue); 311 | console.warn(log_prefix + msg.content.traceback); 312 | } 313 | } 314 | 315 | var varRefresh = function() { 316 | require(['nbextensions/sherlockml-dataclean/jquery.tablesorter.min'], 317 | function() { 318 | Jupyter.notebook.kernel.execute( 319 | cfg.python.varRefreshCmd, { iopub: { output: code_exec_callback } }, { silent: false } 320 | ); 321 | }); 322 | }; 323 | 324 | 325 | var datacleaner_init = function() { 326 | 327 | cfg = read_config(cfg, function() { 328 | if (typeof Jupyter.notebook.kernel !== "undefined" && Jupyter.notebook.kernel !== null) { 329 | datacleaner_button(); 330 | } else { 331 | console.warn(log_prefix + "Kernel not available?"); 332 | } 333 | }); 334 | 335 | data_cleaner(cfg, st); 336 | 337 | //CREATE DATACLEANER PYTHON OBJECT 338 | Jupyter.notebook.kernel.execute(( 339 | `try: 340 | _datacleaner 341 | except NameError: 342 | from dataclean.manager import DataCleaner as _DataCleaner 343 | _datacleaner = _DataCleaner()`) 344 | .replace(/^ /gm, '')) 345 | 346 | events.on('execute.CodeCell', varRefresh); 347 | events.on('varRefresh', varRefresh); 348 | }; 349 | 350 | 351 | var create_datacleaner_div = function(cfg, st) { 352 | function save_position(){ 353 | Jupyter.notebook.metadata.datacleaner.position = { 354 | 'left': $('#datacleaner-wrapper').css('left'), 355 | 'top': $('#datacleaner-wrapper').css('top'), 356 | 'width': $('#datacleaner-wrapper').css('width'), 357 | 'height': $('#datacleaner-wrapper').css('height'), 358 | 'right': $('#datacleaner-wrapper').css('right') 359 | }; 360 | } 361 | var datacleaner_wrapper = $('
') 362 | .append( 363 | $('
') 364 | .addClass("header") 365 | .text("Data Cleaner ") 366 | .append( 367 | $("") 368 | .attr("href", "#") 369 | .text("[x]") 370 | .addClass("kill-btn") 371 | .attr('title', 'Close window') 372 | .click(function() { 373 | save_position(); 374 | toggledatacleaner(); 375 | return false; 376 | }) 377 | ) 378 | .append( 379 | $("") 380 | .attr("href", "#") 381 | .addClass("hide-btn") 382 | .attr('title', 'Hide Data Cleaner') 383 | .text("[-]") 384 | .click(function() { 385 | $('#datacleaner-wrapper').css('position', 'fixed'); 386 | $('#datacleaner').slideToggle({ 387 | 'complete': function() { 388 | Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] = $('#datacleaner').css('display'); 389 | save_position(); 390 | Jupyter.notebook.set_dirty(); 391 | } 392 | }); 393 | $('#datacleaner-wrapper').toggleClass('closed'); 394 | if ($('#datacleaner-wrapper').hasClass('closed')) { 395 | cfg.oldHeight = $('#datacleaner-wrapper').height(); //.css('height'); 396 | $('#datacleaner-wrapper').css({ height: 40 }); 397 | $('#datacleaner-wrapper .hide-btn') 398 | .text('[+]') 399 | .attr('title', 'Show Data Cleaner'); 400 | } else { 401 | $('#datacleaner-wrapper').height(cfg.oldHeight); //css({ height: cfg.oldHeight }); 402 | $('#datacleaner').height(cfg.oldHeight - $('#datacleaner-header').height() - 30 ) 403 | $('#datacleaner-wrapper .hide-btn') 404 | .text('[-]') 405 | .attr('title', 'Hide Data Cleaner'); 406 | } 407 | return false; 408 | }) 409 | ).append( 410 | $("") 411 | .attr("href", "#") 412 | .text(" \u21BB") 413 | .addClass("reload-btn") 414 | .attr('title', 'Reload Data Cleaner') 415 | .click(function() { 416 | varRefresh(); 417 | return false; 418 | }) 419 | ).append( 420 | $("") 421 | .html("  ") 422 | ).append( 423 | $("") 424 | .html("  ") 425 | ) 426 | ).append( 427 | $("
").attr("id", "datacleaner").addClass('datacleaner') 428 | ) 429 | 430 | $("body").append(datacleaner_wrapper); 431 | // Ensure position is fixed 432 | $('#datacleaner-wrapper').css('position', 'fixed'); 433 | 434 | // enable dragging and save position on stop moving 435 | $('#datacleaner-wrapper').draggable({ 436 | handle:'#datacleaner-header', 437 | drag: function(event, ui) {}, //end of drag function 438 | start: function(event, ui) { 439 | $(this).width($(this).width()); 440 | }, 441 | stop: function(event, ui) { // on save, store window position 442 | $(this).offset({top:Math.max($(this).offset().top,0)}); 443 | save_position(); 444 | Jupyter.notebook.set_dirty(); 445 | // Ensure position is fixed (again) 446 | $('#datacleaner-wrapper').css('position', 'fixed'); 447 | }, 448 | }); 449 | 450 | $('#datacleaner-wrapper').resizable({ 451 | resize: function(event, ui) { 452 | $('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height()); 453 | }, 454 | start: function(event, ui) { 455 | $(this).css('position', 'fixed'); 456 | }, 457 | stop: function(event, ui) { 458 | save_position(); 459 | $('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height()) 460 | Jupyter.notebook.set_dirty(); 461 | } 462 | }) 463 | 464 | if (Jupyter.notebook.metadata.datacleaner !== undefined) { 465 | if (Jupyter.notebook.metadata.datacleaner.position !== undefined) { 466 | $('#datacleaner-wrapper').css(Jupyter.notebook.metadata.datacleaner.position); 467 | } 468 | } 469 | 470 | // Ensure position is fixed 471 | $('#datacleaner-wrapper').css('position', 'fixed'); 472 | 473 | // Restore window display 474 | if (Jupyter.notebook.metadata.datacleaner !== undefined) { 475 | if (Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] !== undefined) { 476 | $('#datacleaner').css('display', Jupyter.notebook.metadata.datacleaner['datacleaner_section_display']) 477 | if (Jupyter.notebook.metadata.datacleaner['datacleaner_section_display'] == 'none') { 478 | $('#datacleaner-wrapper').addClass('closed'); 479 | $('#datacleaner-wrapper').css({ height: 40 }); 480 | $('#datacleaner-wrapper .hide-btn') 481 | .text('[+]') 482 | .attr('title', 'Show Data Cleaner'); 483 | } 484 | } 485 | if (Jupyter.notebook.metadata.datacleaner['window_display'] !== undefined) { 486 | console.log(log_prefix + "Restoring Data Cleaner window"); 487 | $('#datacleaner-wrapper').css('display','none'); 488 | if ($('#datacleaner-wrapper').hasClass('closed')){ 489 | $('#datacleaner').height(cfg.oldHeight - $('#datacleaner-header').height()) 490 | }else{ 491 | $('#datacleaner').height($('#datacleaner-wrapper').height() - $('#datacleaner-header').height()-30) 492 | } 493 | 494 | } 495 | } else { 496 | $('#datacleaner-wrapper').toggle(); 497 | } 498 | 499 | if ($('#datacleaner-wrapper').css('display') == undefined) $('#datacleaner-wrapper').css('display', "none") //block 500 | 501 | datacleaner_wrapper.addClass('datacleaner-float-wrapper'); 502 | 503 | } 504 | 505 | var data_cleaner = function(cfg, st) { 506 | var datacleaner_wrapper = $("#datacleaner-wrapper"); 507 | if (datacleaner_wrapper.length === 0) { 508 | create_datacleaner_div(cfg, st); 509 | } 510 | 511 | $(window).resize(function() { 512 | $('#datacleaner').css({ maxHeight: $(window).height() - 30 }); 513 | $('#datacleaner-wrapper').css({ maxHeight: $(window).height() - 10 }); 514 | }); 515 | 516 | $(window).trigger('resize'); 517 | varRefresh(); 518 | }; 519 | 520 | var toggle_datacleaner = function(cfg, st) { 521 | // toggle draw (first because of first-click behavior) 522 | $("#datacleaner-wrapper").toggle({ 523 | 'progress': function() {}, 524 | 'complete': function() { 525 | Jupyter.notebook.metadata.datacleaner['window_display'] = $('#datacleaner-wrapper').css('display') == 'block'; 526 | Jupyter.notebook.set_dirty(); 527 | // recompute: 528 | data_cleaner(cfg, st); 529 | } 530 | }); 531 | }; 532 | 533 | 534 | var load_jupyter_extension = function() { 535 | load_css(); 536 | datacleaner_button(); 537 | 538 | // If a kernel is available, 539 | if (typeof Jupyter.notebook.kernel !== "undefined" && Jupyter.notebook.kernel !== null) { 540 | datacleaner_init(); 541 | } 542 | 543 | events.on("kernel_ready.Kernel", function(evt, data) { 544 | datacleaner_init(); 545 | }); 546 | 547 | }; 548 | 549 | return { 550 | load_ipython_extension: load_jupyter_extension, 551 | varRefresh: varRefresh 552 | }; 553 | 554 | }); 555 | 556 | /* 557 | This code based on jupyter-varInpsector https://github.com/jfbercher/jupyter_varInspector 558 | Now part of https://github.com/ipython-contrib/jupyter_contrib_nbextensions 559 | which is licensed as follows: 560 | 561 | IPython-contrib is licensed under the terms of the Modified BSD License (also known as New or Revised or 3-Clause BSD), as follows: 562 | 563 | Copyright (c) 2013-2015, IPython-contrib Developers 564 | 565 | All rights reserved. 566 | 567 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 568 | 569 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 570 | 571 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 572 | 573 | Neither the name of the IPython-contrib Developers nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 574 | 575 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 576 | 577 | */ 578 | -------------------------------------------------------------------------------- /dataclean/static/jquery.tablesorter.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2007 Christian Bach. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*/ 9 | 10 | !function(e){"function"==typeof define&&define.amd?define(["jquery"],e):"object"==typeof module&&"object"==typeof module.exports?module.exports=e(require("jquery")):e(jQuery)}(function(e){return function(t){"use strict";var r=t.tablesorter={version:"2.29.0",parsers:[],widgets:[],defaults:{theme:"default",widthFixed:!1,showProcessing:!1,headerTemplate:"{content}",onRenderTemplate:null,onRenderHeader:null,cancelSelection:!0,tabIndex:!0,dateFormat:"mmddyyyy",sortMultiSortKey:"shiftKey",sortResetKey:"ctrlKey",usNumberFormat:!0,delayInit:!1,serverSideSorting:!1,resort:!0,headers:{},ignoreCase:!0,sortForce:null,sortList:[],sortAppend:null,sortStable:!1,sortInitialOrder:"asc",sortLocaleCompare:!1,sortReset:!1,sortRestart:!1,emptyTo:"bottom",stringTo:"max",duplicateSpan:!0,textExtraction:"basic",textAttribute:"data-text",textSorter:null,numberSorter:null,initWidgets:!0,widgetClass:"widget-{name}",widgets:[],widgetOptions:{zebra:["even","odd"]},initialized:null,tableClass:"",cssAsc:"",cssDesc:"",cssNone:"",cssHeader:"",cssHeaderRow:"",cssProcessing:"",cssChildRow:"tablesorter-childRow",cssInfoBlock:"tablesorter-infoOnly",cssNoSort:"tablesorter-noSort",cssIgnoreRow:"tablesorter-ignoreRow",cssIcon:"tablesorter-icon",cssIconNone:"",cssIconAsc:"",cssIconDesc:"",cssIconDisabled:"",pointerClick:"click",pointerDown:"mousedown",pointerUp:"mouseup",selectorHeaders:"> thead th, > thead td",selectorSort:"th, td",selectorRemove:".remove-me",debug:!1,headerList:[],empties:{},strings:{},parsers:[],globalize:0,imgAttr:0},css:{table:"tablesorter",cssHasChild:"tablesorter-hasChildRow",childRow:"tablesorter-childRow",colgroup:"tablesorter-colgroup",header:"tablesorter-header",headerRow:"tablesorter-headerRow",headerIn:"tablesorter-header-inner",icon:"tablesorter-icon",processing:"tablesorter-processing",sortAsc:"tablesorter-headerAsc",sortDesc:"tablesorter-headerDesc",sortNone:"tablesorter-headerUnSorted"},language:{sortAsc:"Ascending sort applied, ",sortDesc:"Descending sort applied, ",sortNone:"No sort applied, ",sortDisabled:"sorting is disabled",nextAsc:"activate to apply an ascending sort",nextDesc:"activate to apply a descending sort",nextNone:"activate to remove the sort"},regex:{templateContent:/\{content\}/g,templateIcon:/\{icon\}/g,templateName:/\{name\}/i,spaces:/\s+/g,nonWord:/\W/g,formElements:/(input|select|button|textarea)/i,chunk:/(^([+\-]?(?:\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?)?$|^0x[0-9a-f]+$|\d+)/gi,chunks:/(^\\0|\\0$)/,hex:/^0x[0-9a-f]+$/i,comma:/,/g,digitNonUS:/[\s|\.]/g,digitNegativeTest:/^\s*\([.\d]+\)/,digitNegativeReplace:/^\s*\(([.\d]+)\)/,digitTest:/^[\-+(]?\d+[)]?$/,digitReplace:/[,.'"\s]/g},string:{max:1,min:-1,emptymin:1,emptymax:-1,zero:0,none:0,"null":0,top:!0,bottom:!1},keyCodes:{enter:13},dates:{},instanceMethods:{},setup:function(e,o){if(e&&e.tHead&&0!==e.tBodies.length&&!0!==e.hasInitialized){var s="",a=t(e),n=t.metadata;e.hasInitialized=!1,e.isProcessing=!0,e.config=o,t.data(e,"tablesorter",o),o.debug&&(console[console.group?"group":"log"]("Initializing tablesorter v"+r.version),t.data(e,"startoveralltimer",new Date)),o.supportsDataObject=function(e){return e[0]=parseInt(e[0],10),e[0]>1||1===e[0]&&parseInt(e[1],10)>=4}(t.fn.jquery.split(".")),o.emptyTo=o.emptyTo.toLowerCase(),o.stringTo=o.stringTo.toLowerCase(),o.last={sortList:[],clickedIndex:-1},/tablesorter\-/.test(a.attr("class"))||(s=""!==o.theme?" tablesorter-"+o.theme:""),o.namespace?o.namespace="."+o.namespace.replace(r.regex.nonWord,""):o.namespace=".tablesorter"+Math.random().toString(16).slice(2),o.table=e,o.$table=a.addClass(r.css.table+" "+o.tableClass+s+" "+o.namespace.slice(1)).attr("role","grid"),o.$headers=a.find(o.selectorHeaders),o.$table.children().children("tr").attr("role","row"),o.$tbodies=a.children("tbody:not(."+o.cssInfoBlock+")").attr({"aria-live":"polite","aria-relevant":"all"}),o.$table.children("caption").length&&((s=o.$table.children("caption")[0]).id||(s.id=o.namespace.slice(1)+"caption"),o.$table.attr("aria-labelledby",s.id)),o.widgetInit={},o.textExtraction=o.$table.attr("data-text-extraction")||o.textExtraction||"basic",r.buildHeaders(o),r.fixColumnWidth(e),r.addWidgetFromClass(e),r.applyWidgetOptions(e),r.setupParsers(o),o.totalRows=0,r.validateOptions(o),o.delayInit||r.buildCache(o),r.bindEvents(e,o.$headers,!0),r.bindMethods(o),o.supportsDataObject&&void 0!==a.data().sortlist?o.sortList=a.data().sortlist:n&&a.metadata()&&a.metadata().sortlist&&(o.sortList=a.metadata().sortlist),r.applyWidget(e,!0),o.sortList.length>0?r.sortOn(o,o.sortList,{},!o.initWidgets):(r.setHeadersCss(o),o.initWidgets&&r.applyWidget(e,!1)),o.showProcessing&&a.unbind("sortBegin"+o.namespace+" sortEnd"+o.namespace).bind("sortBegin"+o.namespace+" sortEnd"+o.namespace,function(t){clearTimeout(o.timerProcessing),r.isProcessing(e),"sortBegin"===t.type&&(o.timerProcessing=setTimeout(function(){r.isProcessing(e,!0)},500))}),e.hasInitialized=!0,e.isProcessing=!1,o.debug&&(console.log("Overall initialization time:"+r.benchmark(t.data(e,"startoveralltimer"))),o.debug&&console.groupEnd&&console.groupEnd()),a.triggerHandler("tablesorter-initialized",e),"function"==typeof o.initialized&&o.initialized(e)}else o.debug&&(e.hasInitialized?console.warn("Stopping initialization. Tablesorter has already been initialized"):console.error("Stopping initialization! No table, thead or tbody",e))},bindMethods:function(e){var o=e.$table,s=e.namespace,a="sortReset update updateRows updateAll updateHeaders addRows updateCell updateComplete sorton appendCache updateCache applyWidgetId applyWidgets refreshWidgets destroy mouseup mouseleave ".split(" ").join(s+" ");o.unbind(a.replace(r.regex.spaces," ")).bind("sortReset"+s,function(e,t){e.stopPropagation(),r.sortReset(this.config,function(e){e.isApplyingWidgets?setTimeout(function(){r.applyWidget(e,"",t)},100):r.applyWidget(e,"",t)})}).bind("updateAll"+s,function(e,t,o){e.stopPropagation(),r.updateAll(this.config,t,o)}).bind("update"+s+" updateRows"+s,function(e,t,o){e.stopPropagation(),r.update(this.config,t,o)}).bind("updateHeaders"+s,function(e,t){e.stopPropagation(),r.updateHeaders(this.config,t)}).bind("updateCell"+s,function(e,t,o,s){e.stopPropagation(),r.updateCell(this.config,t,o,s)}).bind("addRows"+s,function(e,t,o,s){e.stopPropagation(),r.addRows(this.config,t,o,s)}).bind("updateComplete"+s,function(){this.isUpdating=!1}).bind("sorton"+s,function(e,t,o,s){e.stopPropagation(),r.sortOn(this.config,t,o,s)}).bind("appendCache"+s,function(e,o,s){e.stopPropagation(),r.appendCache(this.config,s),t.isFunction(o)&&o(this)}).bind("updateCache"+s,function(e,t,o){e.stopPropagation(),r.updateCache(this.config,t,o)}).bind("applyWidgetId"+s,function(e,t){e.stopPropagation(),r.applyWidgetId(this,t)}).bind("applyWidgets"+s,function(e,t){e.stopPropagation(),r.applyWidget(this,!1,t)}).bind("refreshWidgets"+s,function(e,t,o){e.stopPropagation(),r.refreshWidgets(this,t,o)}).bind("removeWidget"+s,function(e,t,o){e.stopPropagation(),r.removeWidget(this,t,o)}).bind("destroy"+s,function(e,t,o){e.stopPropagation(),r.destroy(this,t,o)}).bind("resetToLoadState"+s,function(o){o.stopPropagation(),r.removeWidget(this,!0,!1);var s=t.extend(!0,{},e.originalSettings);(e=t.extend(!0,{},r.defaults,s)).originalSettings=s,this.hasInitialized=!1,r.setup(this,e)})},bindEvents:function(e,o,s){var a,n=(e=t(e)[0]).config,i=n.namespace,l=null;!0!==s&&(o.addClass(i.slice(1)+"_extra_headers"),(a=r.getClosest(o,"table")).length&&"TABLE"===a[0].nodeName&&a[0]!==e&&t(a[0]).addClass(i.slice(1)+"_extra_table")),a=(n.pointerDown+" "+n.pointerUp+" "+n.pointerClick+" sort keyup ").replace(r.regex.spaces," ").split(" ").join(i+" "),o.find(n.selectorSort).add(o.filter(n.selectorSort)).unbind(a).bind(a,function(e,s){var a,i,d,c=t(e.target),g=" "+e.type+" ";if(!(1!==(e.which||e.button)&&!g.match(" "+n.pointerClick+" | sort | keyup ")||" keyup "===g&&e.which!==r.keyCodes.enter||g.match(" "+n.pointerClick+" ")&&void 0!==e.which||g.match(" "+n.pointerUp+" ")&&l!==e.target&&!0!==s)){if(g.match(" "+n.pointerDown+" "))return l=e.target,void("1"===(d=c.jquery.split("."))[0]&&d[1]<4&&e.preventDefault());if(l=null,r.regex.formElements.test(e.target.nodeName)||c.hasClass(n.cssNoSort)||c.parents("."+n.cssNoSort).length>0||c.parents("button").length>0)return!n.cancelSelection;n.delayInit&&r.isEmptyObject(n.cache)&&r.buildCache(n),a=r.getHeaderCell(t(this)),d=o.index(a),n.last.clickedIndex=d<0?a.attr("data-column"):d,(i=n.$headers[n.last.clickedIndex])&&!i.sortDisabled&&r.initSort(n,i,e)}}),n.cancelSelection&&o.attr("unselectable","on").bind("selectstart",!1).css({"user-select":"none",MozUserSelect:"none"})},buildHeaders:function(e){var o,s,a,n;for(e.headerList=[],e.headerContent=[],e.sortVars=[],e.debug&&(a=new Date),e.columns=r.computeColumnIndex(e.$table.children("thead, tfoot").children("tr")),s=e.cssIcon?'':"",e.$headers=t(t.map(e.$table.find(e.selectorHeaders),function(o,a){var n,i,l,d,c,g,p=t(o);if(!p.parent().hasClass(e.cssIgnoreRow))return n=r.getColumnData(e.table,e.headers,a,!0),e.headerContent[a]=p.html(),""===e.headerTemplate||p.find("."+r.css.headerIn).length||(d=e.headerTemplate.replace(r.regex.templateContent,p.html()).replace(r.regex.templateIcon,p.find("."+r.css.icon).length?"":s),e.onRenderTemplate&&(i=e.onRenderTemplate.apply(p,[a,d]))&&"string"==typeof i&&(d=i),p.html('
'+d+"
")),e.onRenderHeader&&e.onRenderHeader.apply(p,[a,e,e.$table]),g=r.getHeaderCell(p),l=parseInt(g.attr("data-column"),10),o.column=l,c=r.getOrder(r.getData(p,n,"sortInitialOrder")||e.sortInitialOrder),e.sortVars[l]={count:-1,order:c?e.sortReset?[1,0,2]:[1,0]:e.sortReset?[0,1,2]:[0,1],lockedOrder:!1},void 0!==(c=r.getData(p,n,"lockedOrder")||!1)&&!1!==c&&(e.sortVars[l].lockedOrder=!0,e.sortVars[l].order=r.getOrder(c)?[1,1]:[0,0]),e.headerList[a]=o,p.addClass(r.css.header+" "+e.cssHeader),r.getClosest(p,"tr").addClass(r.css.headerRow+" "+e.cssHeaderRow).attr("role","row"),e.tabIndex&&p.attr("tabindex",0),o})),e.$headerIndexed=[],n=0;n0))for(i+=a,n+=a;a+1>0;)s.parsers[i-a]=p,s.extractors[i-a]=u,a--;i++}y+=s.parsers.length?m:1}e.debug&&(r.isEmptyObject(w)?console.warn(" No parsers detected!"):console[console.table?"table":"log"](w),console.log("Completed detecting parsers"+r.benchmark(f)),console.groupEnd&&console.groupEnd()),e.parsers=s.parsers,e.extractors=s.extractors},addParser:function(e){var t,o=r.parsers.length,s=!0;for(t=0;t=0;)if((n=r.parsers[d])&&"text"!==n.id&&n.is&&n.is(g,e.table,c,i))return n;return r.getParserById("text")},getElementText:function(e,o,s){if(!o)return"";var a,n=e.textExtraction||"",i=o.jquery?o:t(o);return"string"==typeof n?"basic"===n&&void 0!==(a=i.attr(e.textAttribute))?t.trim(a):t.trim(o.textContent||i.text()):"function"==typeof n?t.trim(n(i[0],e.table,s)):"function"==typeof(a=r.getColumnData(e.table,n,s))?t.trim(a(i[0],e.table,s)):t.trim(i[0].textContent||i.text())},getParsedText:function(e,t,o,s){void 0===s&&(s=r.getElementText(e,t,o));var a=""+s,n=e.parsers[o],i=e.extractors[o];return n&&(i&&"function"==typeof i.format&&(s=i.format(s,e.table,t,o)),a="no-parser"===n.id?"":n.format(""+s,e.table,t,o),e.ignoreCase&&"string"==typeof a&&(a=a.toLowerCase())),a},buildCache:function(e,o,s){var a,n,i,l,d,c,g,p,u,f,h,m,b,y,w,x,v,C,$,I,D,R,T=e.table,L=e.parsers;if(e.$tbodies=e.$table.children("tbody:not(."+e.cssInfoBlock+")"),g=void 0===s?e.$tbodies:s,e.cache={},e.totalRows=0,!L)return e.debug?console.warn("Warning: *Empty table!* Not building a cache"):"";for(e.debug&&(m=new Date),e.showProcessing&&r.isProcessing(T,!0),c=0;c0&&(C+=v,I+=v)),C++;else{for(y.$row=p,y.order=l,C=0,I=e.columns,d=0;d0)){for(R=0;R<=v;)i=e.duplicateSpan||0===R?n:"string"!=typeof e.textExtraction?r.getElementText(e,h,C+R)||"":"",y.raw[C+R]=i,u[C+R]=i,R++;C+=v,I+=v}C++}u[e.columns]=y,a.normalized[a.normalized.length]=u}a.colMax=x,e.totalRows+=a.normalized.length}if(e.showProcessing&&r.isProcessing(T),e.debug){for(D=Math.min(5,e.cache[0].normalized.length),console[console.group?"group":"log"]("Building cache for "+e.totalRows+" rows (showing "+D+" rows in log) and "+e.columns+" columns"+r.benchmark(m)),n={},d=0;d-1);return o}),(p=p.not(".sorter-false").filter('[data-column="'+a[o][0]+'"]'+(1===n?":last":""))).length)){for(s=0;s=0?a:n[1]%g.length}},updateAll:function(e,t,o){var s=e.table;s.isUpdating=!0,r.refreshWidgets(s,!0,!0),r.buildHeaders(e),r.bindEvents(s,e.$headers,!0),r.bindMethods(e),r.commonUpdate(e,t,o)},update:function(e,t,o){e.table.isUpdating=!0,r.updateHeader(e),r.commonUpdate(e,t,o)},updateHeaders:function(e,t){e.table.isUpdating=!0,r.buildHeaders(e),r.bindEvents(e.table,e.$headers,!0),r.resortComplete(e,t)},updateCell:function(e,o,s,a){if(t(o).closest("tr").hasClass(e.cssChildRow))console.warn('Tablesorter Warning! "updateCell" for child row content has been disabled, use "update" instead');else{if(r.isEmptyObject(e.cache))return r.updateHeader(e),void r.commonUpdate(e,s,a);e.table.isUpdating=!0,e.$table.find(e.selectorRemove).remove();var n,i,l,d,c,g,p=e.$tbodies,u=t(o),f=p.index(r.getClosest(u,"tbody")),h=e.cache[f],m=r.getClosest(u,"tr");if(o=u[0],p.length&&f>=0){if(l=p.eq(f).find("tr").not("."+e.cssChildRow).index(m),c=h.normalized[l],(g=m[0].cells.length)!==e.columns)for(d=0,n=!1,i=0;i0&&(h+=w),h++;b[o.columns]=m,o.cache[d].normalized[f]=b}r.checkResort(o,a,n)}},updateCache:function(e,t,o){e.parsers&&e.parsers.length||r.setupParsers(e,o),r.buildCache(e,t,o)},appendCache:function(e,t){var o,s,a,n,i,l,d,c=e.table,g=e.widgetOptions,p=e.$tbodies,u=[],f=e.cache;if(r.isEmptyObject(f))return e.appender?e.appender(c,u):c.isUpdating?e.$table.triggerHandler("updateComplete",c):"";for(e.debug&&(d=new Date),l=0;l1))for(n=1;n=0)for(n=0;n1))for(n=1;ns)return 1}for(o=(e||"").replace(d.chunk,"\\0$1\\0").replace(d.chunks,"").split("\\0"),s=(t||"").replace(d.chunk,"\\0$1\\0").replace(d.chunks,"").split("\\0"),l=Math.max(o.length,s.length),i=0;in)return 1}return 0},sortNaturalAsc:function(e,t,o,s){if(e===t)return 0;var a=r.string[s.empties[o]||s.emptyTo];return""===e&&0!==a?"boolean"==typeof a?a?-1:1:-a||-1:""===t&&0!==a?"boolean"==typeof a?a?1:-1:a||1:r.sortNatural(e,t)},sortNaturalDesc:function(e,t,o,s){if(e===t)return 0;var a=r.string[s.empties[o]||s.emptyTo];return""===e&&0!==a?"boolean"==typeof a?a?-1:1:a||1:""===t&&0!==a?"boolean"==typeof a?a?1:-1:-a||-1:r.sortNatural(t,e)},sortText:function(e,t){return e>t?1:e=0&&!0!==s&&d.widgets.splice(i,1),n&&n.remove&&(d.debug&&console.log((s?"Refreshing":"Removing")+' "'+o[a]+'" widget'),n.remove(e,d,d.widgetOptions,s),d.widgetInit[o[a]]=!1);d.$table.triggerHandler("widgetRemoveEnd",e)},refreshWidgets:function(e,o,s){var a,n,i=(e=t(e)[0]).config.widgets,l=r.widgets,d=l.length,c=[],g=function(e){t(e).triggerHandler("refreshComplete")};for(a=0;a'),o=l.$table.width(),n=(a=l.$tbodies.find("tr:first").children(":visible")).length,i=0;i").css("width",s));l.$table.prepend(d)}},getData:function(e,r,o){var s,a,n="",i=t(e);return i.length?(s=!!t.metadata&&i.metadata(),a=" "+(i.attr("class")||""),void 0!==i.data(o)||void 0!==i.data(o.toLowerCase())?n+=i.data(o)||i.data(o.toLowerCase()):s&&void 0!==s[o]?n+=s[o]:r&&void 0!==r[o]?n+=r[o]:" "!==a&&a.match(" "+o+"-")&&(n=a.match(new RegExp("\\s"+o+"-([\\w-]+)"))[1]||""),t.trim(n)):""},getColumnData:function(e,r,o,s,a){if("object"!=typeof r||null===r)return r;var n,i=(e=t(e)[0]).config,l=a||i.$headers,d=i.$headerIndexed&&i.$headerIndexed[o]||l.filter('[data-column="'+o+'"]:last');if(void 0!==r[o])return s?r[o]:r[l.index(d)];for(n in r)if("string"==typeof n&&d.filter(n).add(d.find(n)).length)return r[n]},isProcessing:function(e,o,s){var a=(e=t(e))[0].config,n=s||e.find("."+r.css.header);o?(void 0!==s&&a.sortList.length>0&&(n=n.filter(function(){return!this.sortDisabled&&r.isValueInArray(parseFloat(t(this).attr("data-column")),a.sortList)>=0})),e.add(n).addClass(r.css.processing+" "+a.cssProcessing)):e.add(n).removeClass(r.css.processing+" "+a.cssProcessing)},processTbody:function(e,r,o){if(e=t(e)[0],o)return e.isProcessing=!0,r.before(''),t.fn.detach?r.detach():r.remove();var s=t(e).find("colgroup.tablesorter-savemyplace");r.insertAfter(s),s.remove(),e.isProcessing=!1},clearTableBody:function(e){t(e)[0].config.$tbodies.children().detach()},characterEquivalents:{a:"áàâãäąå",A:"ÁÀÂÃÄĄÅ",c:"çćč",C:"ÇĆČ",e:"éèêëěę",E:"ÉÈÊËĚĘ",i:"íìİîïı",I:"ÍÌİÎÏ",o:"óòôõöō",O:"ÓÒÔÕÖŌ",ss:"ß",SS:"ẞ",u:"úùûüů",U:"ÚÙÛÜŮ"},replaceAccents:function(e){var t,o="[",s=r.characterEquivalents;if(!r.characterRegex){r.characterRegexArray={};for(t in s)"string"==typeof t&&(o+=s[t],r.characterRegexArray[t]=new RegExp("["+s[t]+"]","g"));r.characterRegex=new RegExp(o+"]")}if(r.characterRegex.test(e))for(t in s)"string"==typeof t&&(e=e.replace(r.characterRegexArray[t],t));return e},validateOptions:function(e){var o,s,a,n,i="headers sortForce sortList sortAppend widgets".split(" "),l=e.originalSettings;if(l){e.debug&&(n=new Date);for(o in l)if("undefined"===(a=typeof r.defaults[o]))console.warn('Tablesorter Warning! "table.config.'+o+'" option not recognized');else if("object"===a)for(s in l[o])a=r.defaults[o]&&typeof r.defaults[o][s],t.inArray(o,i)<0&&"undefined"===a&&console.warn('Tablesorter Warning! "table.config.'+o+"."+s+'" option not recognized');e.debug&&console.log("validate options time:"+r.benchmark(n))}},restoreHeaders:function(e){var o,s,a=t(e)[0].config,n=a.$table.find(a.selectorHeaders),i=n.length;for(o=0;o tr").children("th, td");!1===o&&t.inArray("uitheme",i.widgets)>=0&&(n.triggerHandler("applyWidgetId",["uitheme"]),n.triggerHandler("applyWidgetId",["zebra"])),d.find("tr").not(c).remove(),a="sortReset update updateRows updateAll updateHeaders updateCell addRows updateComplete sorton appendCache updateCache applyWidgetId applyWidgets refreshWidgets removeWidget destroy mouseup mouseleave "+"keypress sortBegin sortEnd resetToLoadState ".split(" ").join(i.namespace+" "),n.removeData("tablesorter").unbind(a.replace(r.regex.spaces," ")),i.$headers.add(g).removeClass([r.css.header,i.cssHeader,i.cssAsc,i.cssDesc,r.css.sortAsc,r.css.sortDesc,r.css.sortNone].join(" ")).removeAttr("data-column").removeAttr("aria-label").attr("aria-disabled","true"),c.find(i.selectorSort).unbind("mousedown mouseup keypress ".split(" ").join(i.namespace+" ").replace(r.regex.spaces," ")),r.restoreHeaders(e),n.toggleClass(r.css.table+" "+i.tableClass+" tablesorter-"+i.theme,!1===o),n.removeClass(i.namespace.slice(1)),e.hasInitialized=!1,delete e.config.cache,"function"==typeof s&&s(e),l&&console.log("tablesorter has been removed")}}};t.fn.tablesorter=function(e){return this.each(function(){var o=this,s=t.extend(!0,{},r.defaults,e,r.instanceMethods);s.originalSettings=e,!o.hasInitialized&&r.buildTable&&"TABLE"!==this.nodeName?r.buildTable(o,s):r.setup(o,s)})},window.console&&window.console.log||(r.logs=[],console={},console.log=console.warn=console.error=console.table=function(){var e=arguments.length>1?arguments:arguments[0];r.logs[r.logs.length]={date:Date.now(),log:e}}),r.addParser({id:"no-parser",is:function(){return!1},format:function(){return""},type:"text"}),r.addParser({id:"text",is:function(){return!0},format:function(e,o){var s=o.config;return e&&(e=t.trim(s.ignoreCase?e.toLocaleLowerCase():e),e=s.sortLocaleCompare?r.replaceAccents(e):e),e},type:"text"}),r.regex.nondigit=/[^\w,. \-()]/g,r.addParser({id:"digit",is:function(e){return r.isDigit(e)},format:function(e,o){var s=r.formatFloat((e||"").replace(r.regex.nondigit,""),o);return e&&"number"==typeof s?s:e?t.trim(e&&o.config.ignoreCase?e.toLocaleLowerCase():e):e},type:"numeric"}),r.regex.currencyReplace=/[+\-,. ]/g,r.regex.currencyTest=/^\(?\d+[\u00a3$\u20ac\u00a4\u00a5\u00a2?.]|[\u00a3$\u20ac\u00a4\u00a5\u00a2?.]\d+\)?$/,r.addParser({id:"currency",is:function(e){return e=(e||"").replace(r.regex.currencyReplace,""),r.regex.currencyTest.test(e)},format:function(e,o){var s=r.formatFloat((e||"").replace(r.regex.nondigit,""),o);return e&&"number"==typeof s?s:e?t.trim(e&&o.config.ignoreCase?e.toLocaleLowerCase():e):e},type:"numeric"}),r.regex.urlProtocolTest=/^(https?|ftp|file):\/\//,r.regex.urlProtocolReplace=/(https?|ftp|file):\/\/(www\.)?/,r.addParser({id:"url",is:function(e){return r.regex.urlProtocolTest.test(e)},format:function(e){return e?t.trim(e.replace(r.regex.urlProtocolReplace,"")):e},type:"text"}),r.regex.dash=/-/g,r.regex.isoDate=/^\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}/,r.addParser({id:"isoDate",is:function(e){return r.regex.isoDate.test(e)},format:function(e,t){var o=e?new Date(e.replace(r.regex.dash,"/")):e;return o instanceof Date&&isFinite(o)?o.getTime():e},type:"numeric"}),r.regex.percent=/%/g,r.regex.percentTest=/(\d\s*?%|%\s*?\d)/,r.addParser({id:"percent",is:function(e){return r.regex.percentTest.test(e)&&e.length<15},format:function(e,t){return e?r.formatFloat(e.replace(r.regex.percent,""),t):e},type:"numeric"}),r.addParser({id:"image",is:function(e,t,r,o){return o.find("img").length>0},format:function(e,r,o){return t(o).find("img").attr(r.config.imgAttr||"alt")||e},parsed:!0,type:"text"}),r.regex.dateReplace=/(\S)([AP]M)$/i,r.regex.usLongDateTest1=/^[A-Z]{3,10}\.?\s+\d{1,2},?\s+(\d{4})(\s+\d{1,2}:\d{2}(:\d{2})?(\s+[AP]M)?)?$/i,r.regex.usLongDateTest2=/^\d{1,2}\s+[A-Z]{3,10}\s+\d{4}/i,r.addParser({id:"usLongDate",is:function(e){return r.regex.usLongDateTest1.test(e)||r.regex.usLongDateTest2.test(e)},format:function(e,t){var o=e?new Date(e.replace(r.regex.dateReplace,"$1 $2")):e;return o instanceof Date&&isFinite(o)?o.getTime():e},type:"numeric"}),r.regex.shortDateTest=/(^\d{1,2}[\/\s]\d{1,2}[\/\s]\d{4})|(^\d{4}[\/\s]\d{1,2}[\/\s]\d{1,2})/,r.regex.shortDateReplace=/[\-.,]/g,r.regex.shortDateXXY=/(\d{1,2})[\/\s](\d{1,2})[\/\s](\d{4})/,r.regex.shortDateYMD=/(\d{4})[\/\s](\d{1,2})[\/\s](\d{1,2})/,r.convertFormat=function(e,t){e=(e||"").replace(r.regex.spaces," ").replace(r.regex.shortDateReplace,"/"),"mmddyyyy"===t?e=e.replace(r.regex.shortDateXXY,"$3/$1/$2"):"ddmmyyyy"===t?e=e.replace(r.regex.shortDateXXY,"$3/$2/$1"):"yyyymmdd"===t&&(e=e.replace(r.regex.shortDateYMD,"$1/$2/$3"));var o=new Date(e);return o instanceof Date&&isFinite(o)?o.getTime():""},r.addParser({id:"shortDate",is:function(e){return e=(e||"").replace(r.regex.spaces," ").replace(r.regex.shortDateReplace,"/"),r.regex.shortDateTest.test(e)},format:function(e,t,o,s){if(e){var a=t.config,n=a.$headerIndexed[s],i=n.length&&n.data("dateFormat")||r.getData(n,r.getColumnData(t,a.headers,s),"dateFormat")||a.dateFormat;return n.length&&n.data("dateFormat",i),r.convertFormat(e,i)||e}return e},type:"numeric"}),r.regex.timeTest=/^(0?[1-9]|1[0-2]):([0-5]\d)(\s[AP]M)$|^((?:[01]\d|[2][0-4]):[0-5]\d)$/i,r.regex.timeMatch=/(0?[1-9]|1[0-2]):([0-5]\d)(\s[AP]M)|((?:[01]\d|[2][0-4]):[0-5]\d)/i,r.addParser({id:"time",is:function(e){return r.regex.timeTest.test(e)},format:function(e,t){var o,s=(e||"").match(r.regex.timeMatch),a=new Date(e),n=e&&(null!==s?s[0]:"00:00 AM"),i=n?new Date("2000/01/01 "+n.replace(r.regex.dateReplace,"$1 $2")):n;return i instanceof Date&&isFinite(i)?(o=a instanceof Date&&isFinite(a)?a.getTime():0,o?parseFloat(i.getTime()+"."+a.getTime()):i.getTime()):e},type:"numeric"}),r.addParser({id:"metadata",is:function(){return!1},format:function(e,r,o){var s=r.config,a=s.parserMetadataName?s.parserMetadataName:"sortValue";return t(o).metadata()[a]},type:"numeric"}),r.addWidget({id:"zebra",priority:90,format:function(e,r,o){var s,a,n,i,l,d,c,g=new RegExp(r.cssChildRow,"i"),p=r.$tbodies.add(t(r.namespace+"_extra_table").children("tbody:not(."+r.cssInfoBlock+")"));for(l=0;l 0 57 | else 0 58 | ) 59 | 60 | if fraction_categorical >= categorical_threshold: 61 | categorical_type = CategoricalTypes.CATEGORICAL 62 | else: 63 | categorical_type = CategoricalTypes.CONTINUOUS 64 | 65 | return categorical_type 66 | 67 | 68 | class CallbackManager(object): 69 | """For registering and triggering callbacks between classes""" 70 | 71 | def __init__(self): 72 | self.callbacks = [] 73 | 74 | def send_callbacks(self, *args, **kwargs): 75 | for callback in self.callbacks: 76 | callback(*args, **kwargs) 77 | 78 | def register_callback(self, callback): 79 | self.callbacks.append(callback) 80 | 81 | 82 | class StepWidgetControllerBase(object): 83 | """Widget controls to create a cleaning step""" 84 | 85 | __metaclass__ = ABCMeta 86 | 87 | def __init__(self): 88 | self.update_step_callback = CallbackManager() 89 | self.submit_step_callback = CallbackManager() 90 | 91 | self.tab_title = "A title for the tab widget page" 92 | # this should be placed into the ALLOWED_TRANSFORMATIONS dict 93 | # for controls that go into the column widgets 94 | self.transform_type = "A unique string or an enum class" 95 | 96 | def load_data(self, column, numerical_data): 97 | self.column = column 98 | self.colname = column.name 99 | self.numerical_data = numerical_data 100 | 101 | def create_widgets(self): 102 | """Create your control widgets""" 103 | self.submit_button = ipywidgets.Button(description="Add to Pipeline") 104 | self.submit_button.on_click( 105 | lambda _: self.submit_step_callback.send_callbacks() 106 | ) 107 | 108 | def reset_controls(self): 109 | """Reset the controls to their base state""" 110 | self.submit_button.description = "Add to Pipeline" 111 | 112 | @abstractmethod 113 | def update_step(self): 114 | """Create a pipeline step as the controls are changed""" 115 | self.update_step_callback.send_callbacks() 116 | 117 | def _update_step(self, _): 118 | """For use as a widget observer""" 119 | return self.update_step() 120 | 121 | @abstractmethod 122 | def render_widget(self, step=None): 123 | """Return the overall parent widget for your controls in the state 124 | required to display the input step""" 125 | if step: 126 | self.submit_button.description = "Replace Current Step" 127 | 128 | 129 | class NullReplaceWidgetController(StepWidgetControllerBase): 130 | """Widget controls to create a null replacement step""" 131 | 132 | def __init__(self): 133 | super(NullReplaceWidgetController, self).__init__() 134 | self.tab_title = "Nulls" 135 | self.transform_type = NullRemovalMethod 136 | 137 | def create_widgets(self): 138 | super(NullReplaceWidgetController, self).create_widgets() 139 | 140 | self.null_percent_bar = ipywidgets.FloatProgress( 141 | value=0, 142 | min=0, 143 | max=100, 144 | description="Missing:", 145 | disabled=False, 146 | continuous_update=False, 147 | readout=True, 148 | readout_format=".2g", 149 | layout=ipywidgets.Layout(width="400px"), 150 | bar_style="warning", 151 | ) 152 | 153 | self.null_replace_selector = ipywidgets.Dropdown( 154 | options=[], 155 | description="Replacement Method: ", 156 | layout=ipywidgets.Layout(width="400px"), 157 | style={"description_width": "initial"}, 158 | ) 159 | self.null_replace_selector.observe(self._update_step, names="value") 160 | 161 | self.null_text = ipywidgets.Label() 162 | 163 | self.null_removal_controls = ipywidgets.VBox( 164 | [ 165 | self.null_text, 166 | self.null_percent_bar, 167 | self.null_replace_selector, 168 | self.submit_button, 169 | ], 170 | layout=ipywidgets.Layout(width="100%"), 171 | ) 172 | self.null_removal_controls.layout.align_items = "center" 173 | 174 | def reset_controls(self, categorical_type): 175 | super(NullReplaceWidgetController, self).reset_controls() 176 | 177 | self.null_replace_selector.unobserve(self._update_step, names="value") 178 | 179 | self.null_text.value = "{0} of {1} ({2:.0f}%) selected".format( 180 | self.column.isnull().sum(), 181 | len(self.column), 182 | (100 * self.column.isnull().sum() / len(self.column)) 183 | if len(self.column) > 0 184 | else 0, 185 | ) 186 | 187 | self.null_percent_bar.bar_style = "warning" 188 | self.null_percent_bar.value = ( 189 | (100 * self.column.isnull().sum() / len(self.column)) 190 | if len(self.column) > 0 191 | else 0 192 | ) 193 | 194 | allowed_transforms = { 195 | x.value: x 196 | for x in ALLOWED_TRANSFORMATIONS[categorical_type] 197 | if type(x) is self.transform_type 198 | } 199 | 200 | self.null_replace_selector.options = allowed_transforms 201 | 202 | if len(allowed_transforms) > 0: 203 | self.null_replace_selector.value = self.transform_type.NONE 204 | 205 | self.submit_button.disabled = True 206 | self.null_replace_selector.observe(self._update_step, names="value") 207 | 208 | def update_step(self): 209 | 210 | if self.null_replace_selector.value == self.transform_type.NONE: 211 | self.submit_button.disabled = True 212 | self.null_percent_bar.bar_style = "warning" 213 | else: 214 | self.submit_button.disabled = False 215 | self.null_percent_bar.bar_style = "success" 216 | 217 | step = NullRemovalStep( 218 | replacement_method=self.null_replace_selector.value, 219 | colname=self.colname, 220 | ) 221 | 222 | self.update_step_callback.send_callbacks(step) 223 | 224 | def render_widget(self, step=None): 225 | super(NullReplaceWidgetController, self).render_widget(step) 226 | if step: 227 | self.null_replace_selector.value = step.replacement_method 228 | return self.null_removal_controls 229 | 230 | 231 | class OutlierReplaceWidgetController(StepWidgetControllerBase): 232 | """Widget controls to create an outlier replacement step""" 233 | 234 | def __init__(self): 235 | super(OutlierReplaceWidgetController, self).__init__() 236 | self.tab_title = "Outliers" 237 | self.transform_type = OutlierRemovalMethod 238 | 239 | def create_widgets(self): 240 | super(OutlierReplaceWidgetController, self).create_widgets() 241 | self.outlier_range_slider = ipywidgets.FloatRangeSlider( 242 | value=[0, 1], 243 | min=0, 244 | max=1, 245 | step=0.04, 246 | description="Range:", 247 | disabled=False, 248 | continuous_update=False, 249 | readout=True, 250 | readout_format=".2g", 251 | layout=ipywidgets.Layout(width="400px"), 252 | style={"handle_color": "lightblue"}, 253 | ) 254 | 255 | self.outlier_replace_selector = ipywidgets.Dropdown( 256 | options=[], 257 | description="Replacement Method: ", 258 | layout=ipywidgets.Layout(width="400px"), 259 | style={"description_width": "initial"}, 260 | ) 261 | 262 | self.outlier_range_slider.observe(self._update_step, names="value") 263 | self.outlier_replace_selector.observe(self._update_step, names="value") 264 | self.cut_text = ipywidgets.Label() 265 | 266 | self.outlier_removal_controls = ipywidgets.VBox( 267 | [ 268 | self.cut_text, 269 | self.outlier_range_slider, 270 | self.outlier_replace_selector, 271 | self.submit_button, 272 | ], 273 | layout=ipywidgets.Layout(width="100%"), 274 | ) 275 | self.outlier_removal_controls.layout.align_items = "center" 276 | 277 | def reset_controls(self, categorical_type): 278 | super(OutlierReplaceWidgetController, self).reset_controls() 279 | 280 | self.outlier_range_slider.unobserve(self._update_step, names="value") 281 | self.outlier_replace_selector.unobserve( 282 | self._update_step, names="value" 283 | ) 284 | 285 | self.cut_text.value = "{0} of {1} ({2:.0f}%) selected".format( 286 | 0, len(self.column), 0.0 287 | ) 288 | 289 | with self.outlier_range_slider.hold_trait_notifications(): 290 | self.outlier_range_slider.min = self.numerical_data.min() 291 | self.outlier_range_slider.max = self.numerical_data.max() 292 | 293 | self.outlier_range_slider.value = [ 294 | self.numerical_data.min(), 295 | self.numerical_data.max(), 296 | ] 297 | 298 | allowed_transforms = { 299 | x.value: x 300 | for x in ALLOWED_TRANSFORMATIONS[categorical_type] 301 | if type(x) is self.transform_type 302 | } 303 | 304 | self.outlier_replace_selector.options = allowed_transforms 305 | 306 | if len(allowed_transforms) > 0: 307 | self.outlier_replace_selector.value = self.transform_type.NONE 308 | self.submit_button.disabled = True 309 | 310 | self.outlier_range_slider.observe(self._update_step, names="value") 311 | self.outlier_replace_selector.observe(self._update_step, names="value") 312 | 313 | def update_step(self): 314 | 315 | if self.outlier_replace_selector.value == self.transform_type.NONE: 316 | self.submit_button.disabled = True 317 | else: 318 | self.submit_button.disabled = False 319 | 320 | num_values_cut = self.numerical_data[ 321 | (self.numerical_data < self.outlier_range_slider.value[0]) 322 | | (self.numerical_data > self.outlier_range_slider.value[1]) 323 | ].count() 324 | 325 | percent_values_cut = ( 326 | (100 * num_values_cut / len(self.column)) 327 | if len(self.column) > 0 328 | else 0 329 | ) 330 | 331 | self.cut_text.value = "{0} of {1} ({2:.0f}%) selected".format( 332 | num_values_cut, len(self.column), percent_values_cut 333 | ) 334 | 335 | step = OutlierRemovalStep( 336 | replacement_method=self.outlier_replace_selector.value, 337 | colname=self.colname, 338 | low_cut=self.outlier_range_slider.value[0], 339 | high_cut=self.outlier_range_slider.value[1], 340 | ) 341 | 342 | self.update_step_callback.send_callbacks(step) 343 | 344 | def render_widget(self, step=None): 345 | super(OutlierReplaceWidgetController, self).render_widget(step) 346 | if step: 347 | self.outlier_range_slider.value = [step.low_cut, step.high_cut] 348 | self.outlier_replace_selector.value = step.replacement_method 349 | return self.outlier_removal_controls 350 | 351 | 352 | class TypeConvertWidgetController(StepWidgetControllerBase): 353 | """Widget controls to create a mistyped values replacement step""" 354 | 355 | def __init__(self): 356 | super(TypeConvertWidgetController, self).__init__() 357 | self.transform_type = TypeConvertMethod 358 | self.tab_title = "Mismatched Types" 359 | 360 | def load_data(self, column, numerical_data): 361 | super(TypeConvertWidgetController, self).load_data( 362 | column, numerical_data 363 | ) 364 | self.type_count_dict = {float: 0, int: 0, str: 0} 365 | 366 | for (data_type, count) in ( 367 | self.column.dropna().apply(type).value_counts().iteritems() 368 | ): 369 | self.type_count_dict[data_type] = count 370 | 371 | def create_widgets(self): 372 | super(TypeConvertWidgetController, self).create_widgets() 373 | 374 | self.float_percent_bar = ipywidgets.FloatProgress( 375 | value=0, 376 | min=0, 377 | max=100, 378 | description="Floats:", 379 | orientation="horizontal", 380 | ) 381 | self.n_float = ipywidgets.Label() 382 | float_bar_widget = ipywidgets.HBox( 383 | [self.float_percent_bar, self.n_float] 384 | ) 385 | 386 | self.int_percent_bar = ipywidgets.FloatProgress( 387 | value=0, 388 | min=0, 389 | max=100, 390 | description="Ints:", 391 | orientation="horizontal", 392 | ) 393 | self.n_int = ipywidgets.Label() 394 | int_bar_widget = ipywidgets.HBox([self.int_percent_bar, self.n_int]) 395 | 396 | self.str_percent_bar = ipywidgets.FloatProgress( 397 | value=0, 398 | min=0, 399 | max=100, 400 | description="Strings:", 401 | orientation="horizontal", 402 | ) 403 | self.n_str = ipywidgets.Label() 404 | str_bar_widget = ipywidgets.HBox([self.str_percent_bar, self.n_str]) 405 | 406 | self.type_selector = ipywidgets.Dropdown( 407 | options={"int": int, "float": float, "string": str}, 408 | description="This column is of type:", 409 | layout=ipywidgets.Layout(width="300px"), 410 | style={"description_width": "initial"}, 411 | ) 412 | 413 | self.replace_selector = ipywidgets.Dropdown( 414 | description="For mismatched values:", 415 | layout=ipywidgets.Layout(width="300px"), 416 | style={"description_width": "initial"}, 417 | ) 418 | 419 | self.type_selector.observe(self._update_step, names="value") 420 | self.replace_selector.observe(self._update_step, names="value") 421 | 422 | self.widget = ipywidgets.VBox( 423 | [ 424 | float_bar_widget, 425 | int_bar_widget, 426 | str_bar_widget, 427 | ipywidgets.HBox( 428 | [ 429 | ipywidgets.VBox( 430 | [self.type_selector, self.replace_selector] 431 | ), 432 | self.submit_button, 433 | ] 434 | ), 435 | ] 436 | ) 437 | 438 | self.bar_widget_dict = { 439 | float: float_bar_widget, 440 | int: int_bar_widget, 441 | str: str_bar_widget, 442 | } 443 | 444 | def reset_controls(self, categorical_type): 445 | super(TypeConvertWidgetController, self).reset_controls() 446 | self.type_selector.unobserve(self._update_step, names="value") 447 | self.replace_selector.unobserve(self._update_step, names="value") 448 | 449 | allowed_transforms = { 450 | x.value: x 451 | for x in ALLOWED_TRANSFORMATIONS[categorical_type] 452 | if isinstance(x, self.transform_type) 453 | } 454 | 455 | self.replace_selector.options = allowed_transforms 456 | 457 | if len(allowed_transforms) > 0: 458 | self.replace_selector.value = self.transform_type.NONE 459 | self.submit_button.disabled = True 460 | 461 | counts = reversed( 462 | sorted(self.type_count_dict, key=self.type_count_dict.get) 463 | ) 464 | 465 | current_type = next(counts) 466 | 467 | while current_type not in self.type_selector.options.values() or ( 468 | current_type is str 469 | and categorical_type is CategoricalTypes.CONTINUOUS 470 | ): 471 | current_type = next(counts) 472 | 473 | self.type_selector.value = current_type 474 | 475 | for dtype, widget_box in self.bar_widget_dict.items(): 476 | widget_box.children[0].value = ( 477 | (100 * self.type_count_dict[dtype] / len(self.column)) 478 | if len(self.column) > 0 479 | else 0 480 | ) 481 | widget_box.children[0].bar_style = ( 482 | "success" if current_type is dtype else "warning" 483 | ) 484 | widget_box.children[1].value = "{0} of {1} ({2:.0f}%)".format( 485 | self.type_count_dict[dtype], 486 | len(self.column), 487 | widget_box.children[0].value, 488 | ) 489 | 490 | self.type_selector.observe(self._update_step, names="value") 491 | self.replace_selector.observe(self._update_step, names="value") 492 | 493 | def update_step(self): 494 | for dtype, widget_box in self.bar_widget_dict.items(): 495 | widget_box.children[0].bar_style = ( 496 | "success" if self.type_selector.value is dtype else "warning" 497 | ) 498 | if self.replace_selector.value == self.transform_type.NONE: 499 | self.submit_button.disabled = True 500 | else: 501 | self.submit_button.disabled = False 502 | 503 | step = TypeConversionStep( 504 | replacement_method=self.replace_selector.value, 505 | colname=self.colname, 506 | data_type=self.type_selector.value, 507 | ) 508 | self.update_step_callback.send_callbacks(step) 509 | 510 | def render_widget(self, step=None): 511 | super(TypeConvertWidgetController, self).render_widget(step) 512 | if step: 513 | self.type_selector.value = step.data_type 514 | self.replace_selector.value = step.replacement_method 515 | return self.widget 516 | 517 | 518 | class RbmWidgetController(StepWidgetControllerBase): 519 | """Widget controls to create an RBM imputation step""" 520 | 521 | def __init__(self): 522 | super(RbmWidgetController, self).__init__() 523 | self.transform_type = "RBM Imputation" 524 | 525 | def load_data(self, dataframe): 526 | self.dataframe = dataframe 527 | 528 | def create_widgets(self): 529 | 530 | self.submit_button = ipywidgets.Button(description="Add to Pipeline") 531 | self.submit_button.on_click( 532 | lambda _: self.submit_step_callback.send_callbacks(self.step) 533 | ) 534 | 535 | title = ipywidgets.Label( 536 | value="Impute missing data with " "a Restricted Boltzmann Machine" 537 | ) 538 | 539 | self.col_list = ipywidgets.SelectMultiple( 540 | options=[], description="On columns " 541 | ) 542 | 543 | self.col_list.observe( 544 | lambda _: self._reload_categorical_list_options( 545 | self.categorical_list.options, index=self.col_list.index 546 | ) 547 | ) 548 | 549 | self.categorical_list = ipywidgets.SelectMultiple( 550 | options=[], description=" as " 551 | ) 552 | 553 | self.categorical_list.observe( 554 | self._change_categorical_type, names="index" 555 | ) 556 | 557 | switch_categorical_type = ipywidgets.Button(description="<>") 558 | switch_categorical_type.on_click( 559 | lambda _: self._change_categorical_type( 560 | { 561 | "old": self.categorical_list.index, 562 | "new": self.categorical_list.index, 563 | } 564 | ) 565 | ) 566 | 567 | self.widget = ipywidgets.VBox( 568 | [ 569 | title, 570 | ipywidgets.HBox( 571 | [ 572 | self.col_list, 573 | self.categorical_list, 574 | switch_categorical_type, 575 | ] 576 | ), 577 | ipywidgets.VBox( 578 | [ 579 | self.submit_button, 580 | ipywidgets.Label( 581 | value="(Until you execute or export your pipeline, " 582 | "RBM imputed values are placeholders only.)" 583 | ), 584 | ] 585 | ), 586 | ] 587 | ) 588 | 589 | def _reload_categorical_list_options(self, options, index=()): 590 | self.categorical_list.unobserve( 591 | self._change_categorical_type, names="index" 592 | ) 593 | self.categorical_list.options = self._format_list(options) 594 | self.categorical_list.index = index 595 | self.categorical_list.observe( 596 | self._change_categorical_type, names="index" 597 | ) 598 | self.update_step() 599 | 600 | def _change_categorical_type(self, index): 601 | old_index = index["old"] 602 | 603 | options = list(self.categorical_list.options) 604 | indices_to_change = index["new"] 605 | 606 | for index in indices_to_change: 607 | if options[index].strip() == CategoricalTypes.CONTINUOUS.value: 608 | options[index] = CategoricalTypes.CATEGORICAL.value 609 | elif options[index].strip() == CategoricalTypes.CATEGORICAL.value: 610 | options[index] = CategoricalTypes.CONTINUOUS.value 611 | 612 | self._reload_categorical_list_options(options, index=old_index) 613 | 614 | def _format_list(self, input_list): 615 | # workaround - ensures unique values go into Select widget even though 616 | # we just want multiple instances of "categorical" and "continuous" 617 | output_list = [] 618 | for i, item in enumerate(input_list): 619 | output_list.append(item.strip() + " " * i) 620 | return output_list 621 | 622 | def reset_controls(self): 623 | super(RbmWidgetController, self).reset_controls() 624 | self.col_list.options = self.dataframe.columns.tolist() 625 | categorical_list = [] 626 | 627 | for col in self.col_list.options: 628 | categorical_list.append(is_categorical(self.dataframe[col]).value) 629 | 630 | self.col_list.value = () 631 | self.col_list.rows = self.categorical_list.rows = len(categorical_list) 632 | 633 | self._reload_categorical_list_options(categorical_list) 634 | 635 | def update_step(self): 636 | 637 | numerical_columns = [] 638 | categorical_columns = [] 639 | 640 | for i in self.col_list.index: 641 | categorical_type = self.categorical_list.options[i].strip() 642 | if categorical_type == CategoricalTypes.CONTINUOUS.value: 643 | numerical_columns.append(self.col_list.options[i]) 644 | elif categorical_type == CategoricalTypes.CATEGORICAL.value: 645 | categorical_columns.append(self.col_list.options[i]) 646 | 647 | self.step = RbmStep( 648 | numerical_columns=numerical_columns, 649 | categorical_columns=categorical_columns, 650 | ) 651 | 652 | def render_widget(self, step=None): 653 | super(RbmWidgetController, self).render_widget(step) 654 | 655 | widget = self.widget 656 | 657 | if isinstance(step, RbmStep): 658 | self.col_list.value = ( 659 | step.numerical_columns + step.categorical_columns 660 | ) 661 | 662 | categorical_list = [] 663 | 664 | for col in self.col_list.options: 665 | if col in step.numerical_columns: 666 | categorical_list.append(CategoricalTypes.CONTINUOUS.value) 667 | elif col in step.categorical_columns: 668 | categorical_list.append(CategoricalTypes.CATEGORICAL.value) 669 | else: 670 | categorical_list.append( 671 | is_categorical(self.dataframe[col]).value 672 | ) 673 | 674 | self._reload_categorical_list_options( 675 | categorical_list, index=self.col_list.index 676 | ) 677 | self.step = step 678 | elif step: 679 | widget = render_inactive_widget(step) 680 | 681 | return widget 682 | 683 | 684 | def _noninteractive(func): 685 | """Ensure plots are created in non-interactive mode with seaborn style.""" 686 | 687 | @wraps(func) 688 | def noninteractive_wrapper(*args, **kwargs): 689 | mpl_interactivity = matplotlib.is_interactive() 690 | matplotlib.interactive(False) 691 | 692 | with pyplot.style.context("seaborn"): 693 | rval = func(*args, **kwargs) 694 | 695 | matplotlib.interactive(mpl_interactivity) 696 | return rval 697 | 698 | return noninteractive_wrapper 699 | 700 | 701 | class PlotWidgetController(object): 702 | """Widget controls to display and update plots for dataframe columns.""" 703 | 704 | gs_one_plot = matplotlib.gridspec.GridSpec(1, 1) 705 | gs_two_plots = matplotlib.gridspec.GridSpec( 706 | 2, 1, height_ratios=[1, 1], hspace=0.1 707 | ) 708 | 709 | CUT_LINE_COLOUR = "red" 710 | CUT_BINS_COLOUR = "orange" 711 | 712 | def __init__(self): 713 | self.output_widget = ipywidgets.Output( 714 | layout=ipywidgets.Layout(min_width="300px", height="160px") 715 | ) 716 | self.create_figure() 717 | 718 | def load_data(self, column, numerical_data): 719 | self.column = column 720 | self.colname = column.name 721 | self.numerical_data = numerical_data 722 | 723 | @_noninteractive 724 | def create_figure(self): 725 | self.fig = pyplot.figure() 726 | 727 | self.ax_main = self.fig.add_subplot(self.gs_two_plots[0]) 728 | 729 | self.ax_mod = self.fig.add_subplot(self.gs_two_plots[1]) 730 | 731 | pyplot.setp(self.ax_mod.get_xticklabels(), visible=False) 732 | pyplot.setp(self.ax_mod.get_yticklabels(), visible=False) 733 | 734 | self.ax_cut = self.ax_main.twinx() 735 | 736 | # enforces desired drawing order 737 | self.ax_mod.set_zorder(1) 738 | self.ax_main.set_zorder(2) 739 | self.ax_cut.set_zorder(3) 740 | 741 | self.ax_cut.get_xaxis().set_visible(False) 742 | self.ax_cut.get_yaxis().set_visible(False) 743 | 744 | self.ax_main.tick_params(axis="y", which="major", labelsize=12) 745 | self.ax_mod.tick_params(axis="y", which="major", labelsize=12) 746 | 747 | def display_figure(self): 748 | self.output_widget.clear_output(wait=True) 749 | 750 | # Magic numbers came from testing using categorical columns 751 | # with large numbers of categories 752 | fig_width = 0.1 * len(self.ax_main.get_xticks()) + 3.5 753 | fig_height = 2.4 754 | self.fig.set_size_inches(fig_width, fig_height) 755 | 756 | self.output_widget.layout.width = "{}px".format(fig_width * 70) 757 | self.output_widget.layout.height = "{}px".format(fig_height * 80) 758 | 759 | if self.categorical_type is CategoricalTypes.CONTINUOUS: 760 | self.ax_main.xaxis.set_major_locator( 761 | matplotlib.ticker.AutoLocator() 762 | ) 763 | 764 | with self.output_widget: 765 | display(self.fig) 766 | 767 | def reset_plots(self, categorical_type): 768 | self.categorical_type = categorical_type 769 | self.draw_main_plot() 770 | self.update_plots() 771 | 772 | @_noninteractive 773 | def draw_main_plot(self): 774 | self.ax_main.clear() 775 | self.ax_mod.clear() 776 | 777 | if self.categorical_type is CategoricalTypes.CATEGORICAL: 778 | col = self.column.dropna().value_counts() 779 | col.index = col.index.format() 780 | if len(col) > 0: 781 | col.sort_index().plot(kind="bar", ax=self.ax_main, alpha=0.4) 782 | else: 783 | hist_orig, self.bins = np.histogram(self.numerical_data) 784 | self.bin_width = self.bins[1] - self.bins[0] 785 | 786 | margin = (self.bins[-1] - self.bins[0]) * self.ax_main.margins()[0] 787 | 788 | self.ax_main.set_xlim( 789 | (self.bins[0] - margin, self.bins[-1] + margin) 790 | ) 791 | 792 | self.ax_main.bar( 793 | self.bins[:-1], 794 | hist_orig, 795 | width=self.bin_width, 796 | align="edge", 797 | alpha=0.4, 798 | ) 799 | 800 | self.ymax = self.ax_main.get_ylim()[1] 801 | self.low_cut_line, = self.ax_main.plot( 802 | [None, None], [self.ymax, 0], color=self.CUT_LINE_COLOUR 803 | ) 804 | 805 | self.high_cut_line, = self.ax_main.plot( 806 | [None, None], [self.ymax, 0], color=self.CUT_LINE_COLOUR 807 | ) 808 | 809 | def update_plots(self, step=None, col_mod=None): 810 | if isinstance(step, OutlierRemovalStep): 811 | self.low_cut_line.set_xdata([[step.low_cut, step.low_cut]]) 812 | self.high_cut_line.set_xdata([[step.high_cut, step.high_cut]]) 813 | self.draw_cut_plot(step.low_cut, step.high_cut) 814 | else: 815 | self.hide_cut_plot() 816 | 817 | self.draw_modified_plot( 818 | col_mod if col_mod is not None else self.column 819 | ) 820 | 821 | self.display_figure() 822 | 823 | @_noninteractive 824 | def draw_modified_plot(self, col_mod): 825 | self.ax_mod.clear() 826 | 827 | data_mod = col_mod.loc[ 828 | col_mod.apply(lambda x: isinstance(x, (int, float))) 829 | ] 830 | data_mod = data_mod.dropna() 831 | col_mod = col_mod.dropna().value_counts() 832 | 833 | if self.categorical_type is CategoricalTypes.CATEGORICAL and not self.column.dropna().value_counts().equals( 834 | col_mod 835 | ): 836 | self.ax_main.set_position( 837 | self.gs_two_plots[0].get_position(self.fig) 838 | ) 839 | self.ax_cut.set_position( 840 | self.gs_two_plots[0].get_position(self.fig) 841 | ) 842 | 843 | pyplot.setp(self.ax_main.get_xticklabels(), visible=False) 844 | pyplot.setp(self.ax_mod.get_xticklabels(), visible=True) 845 | pyplot.setp(self.ax_mod.get_yticklabels(), visible=True) 846 | 847 | col_orig = self.column.dropna().value_counts() 848 | 849 | col_mod.index = col_mod.index.format() 850 | col_orig.index = col_orig.index.format() 851 | 852 | col_delta = col_mod.sub(col_orig, fill_value=0) 853 | col_delta = col_delta[col_delta > 0] 854 | 855 | col_mod = col_mod.sub(col_delta, fill_value=0) 856 | 857 | col_mod = pd.concat([col_mod, col_delta], axis=1) 858 | 859 | col_mod.sort_index().plot( 860 | kind="bar", 861 | ax=self.ax_mod, 862 | alpha=0.4, 863 | stacked=True, 864 | legend=False, 865 | ) 866 | 867 | elif ( 868 | self.categorical_type is not CategoricalTypes.CATEGORICAL 869 | and not data_mod.equals(self.numerical_data) 870 | ): 871 | self.ax_main.set_position( 872 | self.gs_two_plots[0].get_position(self.fig) 873 | ) 874 | self.ax_cut.set_position( 875 | self.gs_two_plots[0].get_position(self.fig) 876 | ) 877 | 878 | pyplot.setp(self.ax_main.get_xticklabels(), visible=False) 879 | pyplot.setp(self.ax_mod.get_xticklabels(), visible=True) 880 | pyplot.setp(self.ax_mod.get_yticklabels(), visible=True) 881 | 882 | hist_mod, _ = np.histogram(data_mod, self.bins) 883 | hist_orig, _ = np.histogram(self.numerical_data, self.bins) 884 | 885 | hist_delta = hist_mod - hist_orig 886 | hist_delta[hist_delta < 0] = 0 887 | 888 | self.ax_mod.bar( 889 | self.bins[:-1], 890 | hist_mod - hist_delta, 891 | width=self.bin_width, 892 | align="edge", 893 | alpha=0.4, 894 | ) 895 | self.ax_mod.bar( 896 | self.bins[:-1], 897 | hist_delta, 898 | width=self.bin_width, 899 | color="g", 900 | bottom=hist_mod - hist_delta, 901 | align="edge", 902 | alpha=0.4, 903 | ) 904 | else: 905 | self.ax_main.set_position( 906 | self.gs_one_plot[0].get_position(self.fig) 907 | ) 908 | self.ax_cut.set_position( 909 | self.gs_one_plot[0].get_position(self.fig) 910 | ) 911 | pyplot.setp(self.ax_main.get_xticklabels(), visible=True) 912 | pyplot.setp(self.ax_mod.get_xticklabels(), visible=False) 913 | pyplot.setp(self.ax_mod.get_yticklabels(), visible=False) 914 | 915 | @_noninteractive 916 | def draw_cut_plot(self, low_cut, high_cut): 917 | self.ax_cut.set_visible(True) 918 | 919 | ticks = self.ax_cut.get_xticks() 920 | self.ax_cut.clear() 921 | self.ax_cut.set_xticks(ticks) 922 | 923 | cut_data = self.numerical_data.loc[ 924 | self.numerical_data.apply(lambda x: x < low_cut or x > high_cut) 925 | ] 926 | 927 | hist_cut, _ = np.histogram(cut_data, self.bins) 928 | 929 | self.ax_cut.bar( 930 | self.bins[:-1], 931 | hist_cut, 932 | width=self.bin_width, 933 | align="edge", 934 | color=self.CUT_BINS_COLOUR, 935 | alpha=0.4, 936 | ) 937 | 938 | self.ax_cut.set_ylim(self.ax_main.get_ylim()) 939 | 940 | def hide_cut_plot(self): 941 | self.low_cut_line.set_xdata([[None, None]]) 942 | self.high_cut_line.set_xdata([[None, None]]) 943 | self.ax_cut.set_visible(False) 944 | 945 | def render_widget(self): 946 | if self.fig: 947 | pyplot.close(self.fig) 948 | self.create_figure() 949 | 950 | return self.output_widget 951 | 952 | 953 | class ColumnWidgetController(object): 954 | """Container widget for column-specific step creation control widgets""" 955 | 956 | def __init__(self): 957 | 958 | self.widget = None 959 | self.step_being_modified = None 960 | self.new_step_callback = CallbackManager() 961 | self.modify_step_callback = CallbackManager() 962 | self.active_callback = self.new_step_callback 963 | self.categorical_type = None 964 | 965 | self.plot_widget_controller = PlotWidgetController() 966 | 967 | def update_active_step(new_step): 968 | self.active_step = new_step 969 | col_mod = new_step.execute(self.dataframe)[self.colname] 970 | self.redraw_preview(col_mod) 971 | self.plot_widget_controller.update_plots(new_step, col_mod) 972 | 973 | self.step_creation_controls = [ 974 | NullReplaceWidgetController(), 975 | OutlierReplaceWidgetController(), 976 | TypeConvertWidgetController(), 977 | ] 978 | 979 | self.controls_dict = {} 980 | 981 | for controller in self.step_creation_controls: 982 | self.controls_dict[controller.transform_type] = controller 983 | controller.update_step_callback.register_callback( 984 | update_active_step 985 | ) 986 | 987 | self.create_widgets() 988 | 989 | def create_widgets(self): 990 | 991 | self.categorical_selector = ipywidgets.Dropdown( 992 | options={ 993 | cat_type.value: cat_type for cat_type in CategoricalTypes 994 | }, 995 | layout=ipywidgets.Layout(width="80%"), 996 | ) 997 | 998 | self.plot_widget_container = ipywidgets.VBox( 999 | [ 1000 | self.plot_widget_controller.render_widget(), 1001 | self.categorical_selector, 1002 | ], 1003 | layout=ipywidgets.Layout( 1004 | width="350px", 1005 | height="220px", 1006 | overflow_x="scroll", 1007 | overflow_y="auto", 1008 | ), 1009 | ) 1010 | 1011 | self.plot_widget_container.layout.align_items = "flex-start" 1012 | 1013 | self.categorical_selector.observe( 1014 | self.categorical_selector_onchange, names="value" 1015 | ) 1016 | 1017 | self.preview_widget = ipywidgets.HTML() 1018 | self.preview_widget_container = ipywidgets.VBox( 1019 | [ipywidgets.Label(value="Current Step"), self.preview_widget], 1020 | layout=ipywidgets.Layout(max_height="200px"), 1021 | ) 1022 | 1023 | self.tab_widget = ipywidgets.Tab( 1024 | layout=ipywidgets.Layout( 1025 | overflow_x="scroll", width="600px", height="90%" 1026 | ) 1027 | ) 1028 | 1029 | self.tab_widget.observe( 1030 | self.tab_widget_onchange, names="selected_index" 1031 | ) 1032 | 1033 | for controller in self.step_creation_controls: 1034 | controller.create_widgets() 1035 | 1036 | controller.submit_step_callback.register_callback( 1037 | lambda: self.active_callback.send_callbacks(self.active_step) 1038 | ) 1039 | 1040 | self.widget = ipywidgets.HBox( 1041 | [ 1042 | self.plot_widget_container, 1043 | self.tab_widget, 1044 | self.preview_widget_container, 1045 | ], 1046 | layout=ipywidgets.Layout( 1047 | display="flex", 1048 | align_items="stretch", 1049 | width="100%", 1050 | height="220px", 1051 | ), 1052 | ) 1053 | 1054 | def tab_widget_onchange(self, _): 1055 | index = self.tab_widget.selected_index 1056 | 1057 | for controller in self.controls_dict.values(): 1058 | if controller.tab_title == self.tab_widget.get_title(index): 1059 | controller.update_step() 1060 | 1061 | def categorical_selector_onchange(self, _): 1062 | self.categorical_type = self.categorical_selector.value 1063 | 1064 | self.active_step = NullRemovalStep( 1065 | replacement_method=NullRemovalMethod.NONE, colname=self.colname 1066 | ) 1067 | 1068 | self.reset_controls() 1069 | 1070 | def load_data(self, series, dataframe, step=None): 1071 | self.dataframe = dataframe 1072 | self.column = series 1073 | self.colname = series.name 1074 | 1075 | if not self.categorical_type: 1076 | self.categorical_type = is_categorical(series) 1077 | 1078 | self.numerical_data = series.loc[ 1079 | series.apply(lambda x: isinstance(x, (int, float))) 1080 | ] 1081 | 1082 | self.numerical_data = self.numerical_data.dropna() 1083 | 1084 | for controller in self.step_creation_controls: 1085 | controller.load_data( 1086 | column=self.column, numerical_data=self.numerical_data 1087 | ) 1088 | self.plot_widget_controller.load_data( 1089 | column=self.column, numerical_data=self.numerical_data 1090 | ) 1091 | 1092 | self.redraw_preview() 1093 | self.step_being_modified = step 1094 | 1095 | def redraw_preview(self, col_modified=None): 1096 | 1097 | if col_modified is not None: 1098 | col_mod = col_modified.reindex( 1099 | index=self.column.index, fill_value="
" 1100 | ) 1101 | else: 1102 | col_mod = self.column 1103 | 1104 | self.preview_widget.value = ( 1105 | "
This Step
" 1106 | + pd.concat( 1107 | [self.column.rename("before"), col_mod.rename("after")], axis=1 1108 | ) 1109 | .style.set_table_attributes('class="table"') 1110 | .render() 1111 | ) 1112 | 1113 | def render_widget(self): 1114 | self.reset_controls() 1115 | self.redraw_preview() 1116 | 1117 | if self.step_being_modified: 1118 | self.set_controls_for_step(self.step_being_modified) 1119 | 1120 | return self.widget 1121 | 1122 | def reset_controls(self): 1123 | self.tab_widget.unobserve( 1124 | self.tab_widget_onchange, names="selected_index" 1125 | ) 1126 | self.categorical_selector.unobserve( 1127 | self.categorical_selector_onchange, names="value" 1128 | ) 1129 | 1130 | self.active_callback = self.new_step_callback 1131 | 1132 | tab_children = [] 1133 | tab_titles = [] 1134 | 1135 | allowed_transforms = set( 1136 | transform if isinstance(transform, str) else type(transform) 1137 | for transform in ALLOWED_TRANSFORMATIONS[self.categorical_type] 1138 | ) 1139 | 1140 | for transform_type in sorted(allowed_transforms, key=str): 1141 | tab_children.append( 1142 | self.controls_dict[transform_type].render_widget() 1143 | ) 1144 | tab_titles.append(self.controls_dict[transform_type].tab_title) 1145 | 1146 | self.tab_widget.children = tuple(tab_children) 1147 | 1148 | for i in range(len(tab_children)): 1149 | self.tab_widget.set_title(i, tab_titles[i]) 1150 | 1151 | self.tab_widget.selected_index = 0 1152 | 1153 | self.active_step = NullRemovalStep( 1154 | replacement_method=NullRemovalMethod.NONE, colname=self.colname 1155 | ) 1156 | 1157 | for controller in self.step_creation_controls: 1158 | controller.reset_controls(categorical_type=self.categorical_type) 1159 | 1160 | self.categorical_selector.disabled = False 1161 | self.categorical_selector.value = self.categorical_type 1162 | 1163 | self.plot_widget_controller.reset_plots(self.categorical_type) 1164 | 1165 | self.tab_widget.observe( 1166 | self.tab_widget_onchange, names="selected_index" 1167 | ) 1168 | self.categorical_selector.observe( 1169 | self.categorical_selector_onchange, names="value" 1170 | ) 1171 | 1172 | def set_controls_for_step(self, step): 1173 | 1174 | if hasattr(step, "colname") and step.colname == self.colname: 1175 | 1176 | while ( 1177 | step.replacement_method 1178 | not in ALLOWED_TRANSFORMATIONS[self.categorical_type] 1179 | ): 1180 | self.categorical_selector.index = ( 1181 | self.categorical_selector.index + 1 1182 | ) % len(self.categorical_selector.options) 1183 | 1184 | self.tab_widget.children = [ 1185 | self.controls_dict[ 1186 | type(step.replacement_method) 1187 | ].render_widget(step) 1188 | ] 1189 | 1190 | self.active_callback = self.modify_step_callback 1191 | self.tab_widget.set_title(0, "Modifying Current Step") 1192 | self.tab_widget.selected_index = 0 1193 | else: 1194 | self.tab_widget.children = [render_inactive_widget(step)] 1195 | 1196 | self.tab_widget.set_title(0, str(self.colname)) 1197 | self.categorical_selector.disabled = True 1198 | 1199 | 1200 | class DataFrameWidgetController(object): 1201 | """Container widget for dataframe-wide controls and the pipeline""" 1202 | 1203 | def __init__(self, pipeline_widget, sampled_rows): 1204 | self.resample_callback = CallbackManager() 1205 | self.new_step_callback = CallbackManager() 1206 | self.modify_step_callback = CallbackManager() 1207 | 1208 | self.active_callback = self.new_step_callback 1209 | 1210 | self.rbm_widget_controller = RbmWidgetController() 1211 | self.rbm_widget_controller.create_widgets() 1212 | 1213 | def submit_rbm_step(*args, **kwargs): 1214 | self.active_callback.send_callbacks(*args, **kwargs) 1215 | 1216 | self.rbm_widget_controller.submit_step_callback.register_callback( 1217 | submit_rbm_step 1218 | ) 1219 | 1220 | self.pipeline_widget_container = ipywidgets.Accordion( 1221 | children=[pipeline_widget] 1222 | ) 1223 | self.pipeline_widget_container.set_title(0, "Pipeline") 1224 | self.pipeline_widget_container.selected_index = None 1225 | self.preview_widget = ipywidgets.Output( 1226 | layout=ipywidgets.Layout( 1227 | overflow_y="scroll", 1228 | overflow_x="scroll", 1229 | width="100%", 1230 | height="190px", 1231 | ) 1232 | ) 1233 | 1234 | self.rbm_widget_container = ipywidgets.Accordion( 1235 | children=[self.rbm_widget_controller.render_widget()] 1236 | ) 1237 | self.rbm_widget_container.set_title(0, "Restricted Boltzmann Machine") 1238 | self.rbm_widget_container.selected_index = None 1239 | 1240 | self.preview_widget_container = ipywidgets.Accordion( 1241 | children=[self.preview_widget] 1242 | ) 1243 | self.preview_widget_container.set_title(0, "DataFrame Preview") 1244 | self.preview_widget_container.selected_index = None 1245 | 1246 | child_widgets = [ 1247 | self.preview_widget_container, 1248 | self.rbm_widget_container, 1249 | self.pipeline_widget_container, 1250 | ipywidgets.Label( 1251 | "Click on a column name below to start adding steps." 1252 | ), 1253 | ] 1254 | 1255 | if sampled_rows: 1256 | sample_label = ipywidgets.Label( 1257 | value="Viewing {} sampled rows from your dataframe.".format( 1258 | sampled_rows 1259 | ) 1260 | ) 1261 | sample_btn = ipywidgets.Button(description="Resample") 1262 | sample_btn.on_click( 1263 | lambda _: self.resample_callback.send_callbacks() 1264 | ) 1265 | child_widgets = [ 1266 | ipywidgets.HBox([sample_label, sample_btn]) 1267 | ] + child_widgets 1268 | 1269 | self.widget = ipywidgets.VBox(child_widgets) 1270 | 1271 | def _redraw_preview(self, dataframe): 1272 | self.preview_widget.clear_output(wait=True) 1273 | with self.preview_widget: 1274 | display( 1275 | dataframe.style.set_caption( 1276 | "Preview up to the current pipeline step" 1277 | ) 1278 | ) 1279 | 1280 | def render_widget(self, dataframe, step=None): 1281 | self.dataframe = dataframe 1282 | self._redraw_preview(dataframe) 1283 | self.rbm_widget_controller.load_data(dataframe) 1284 | self.rbm_widget_controller.reset_controls() 1285 | self.rbm_widget_container.children = tuple( 1286 | [self.rbm_widget_controller.render_widget(step)] 1287 | ) 1288 | 1289 | # if we are currently modifying a non column-specific step 1290 | if step and not hasattr(step, "colname"): 1291 | self.active_callback = self.modify_step_callback 1292 | else: 1293 | self.active_callback = self.new_step_callback 1294 | 1295 | return self.widget 1296 | 1297 | def display_pipeline(self): 1298 | self.pipeline_widget_container.selected_index = 0 1299 | 1300 | 1301 | class PipelineWidgetController(object): 1302 | """Container widget for a view of the processing pipeline""" 1303 | 1304 | CAROUSEL_LAYOUT = ipywidgets.Layout( 1305 | overflow_x="scroll", 1306 | width="800px", 1307 | height="", 1308 | flex_direction="row", 1309 | display="flex", 1310 | ) 1311 | 1312 | def __init__(self, pipeline, name): 1313 | 1314 | self.pipeline = pipeline 1315 | self.name = name 1316 | self.pipeline_view = ipywidgets.Box( 1317 | children=[], layout=self.CAROUSEL_LAYOUT 1318 | ) 1319 | self.info_label = ipywidgets.Label(value="") 1320 | self.info_label.layout.height = "30px" 1321 | 1322 | self.add_button = ipywidgets.Button(description="+") 1323 | self.add_button.layout.visibility = "hidden" 1324 | self.add_button.on_click(lambda _: self._enter_add_mode()) 1325 | 1326 | self.add_mode_callback = CallbackManager() 1327 | self.edit_mode_callback = CallbackManager() 1328 | self.delete_step_callback = CallbackManager() 1329 | self.execute_callback = CallbackManager() 1330 | self.export_callback = CallbackManager() 1331 | 1332 | self.execute_button = ipywidgets.Button(description="Execute Pipeline") 1333 | self.execute_button.on_click(lambda _: self._execute_pipeline()) 1334 | 1335 | self.export_button = ipywidgets.Button(description="Export to Code") 1336 | self.export_button.on_click(lambda _: self._export_pipeline()) 1337 | 1338 | def render_widget(self, active_step=None): 1339 | 1340 | children = [] 1341 | self.pipeline_step_widgets = [] 1342 | self.display_message("Add a step to get started") 1343 | 1344 | for step in self.pipeline.steps: 1345 | pipeline_step_widget = PipelineStepWidgetController(step) 1346 | 1347 | pipeline_step_widget.modify_step_callback.register_callback( 1348 | self._enter_edit_mode 1349 | ) 1350 | 1351 | pipeline_step_widget.stop_modifying_callback.register_callback( 1352 | self._enter_add_mode 1353 | ) 1354 | 1355 | pipeline_step_widget.delete_step_callback.register_callback( 1356 | self._delete_step 1357 | ) 1358 | 1359 | self.pipeline_step_widgets.append(pipeline_step_widget) 1360 | children.append(pipeline_step_widget.widget) 1361 | 1362 | if children: 1363 | children.append( 1364 | ipywidgets.VBox( 1365 | [self.add_button, self.execute_button, self.export_button], 1366 | layout=ipywidgets.Layout(min_width="150px"), 1367 | ) 1368 | ) 1369 | self.display_message("") 1370 | 1371 | self.pipeline_view.children = tuple(children) 1372 | self.widget = ipywidgets.VBox([self.pipeline_view, self.info_label]) 1373 | self._enter_edit_mode(active_step) 1374 | 1375 | return self.widget 1376 | 1377 | def _enter_edit_mode(self, step): 1378 | if step: 1379 | self.add_button.layout.visibility = ( 1380 | None 1381 | ) # this means it's visible! 1382 | for pipeline_step_widget in self.pipeline_step_widgets: 1383 | if pipeline_step_widget.step is step: 1384 | pipeline_step_widget._set_active_style() 1385 | else: 1386 | pipeline_step_widget._set_inactive_style() 1387 | self.edit_mode_callback.send_callbacks(step) 1388 | message = "Modifying step" 1389 | if hasattr(step, "colname"): 1390 | message += " on column " + str(step.colname) 1391 | self.display_message(message) 1392 | else: 1393 | self.add_button.layout.visibility = "hidden" 1394 | 1395 | def _export_pipeline(self): 1396 | self.display_message("Exported to code cell.") 1397 | self.export_callback.send_callbacks() 1398 | 1399 | def _execute_pipeline(self): 1400 | self.display_message("Executing pipeline... ") 1401 | 1402 | self.execute_callback.send_callbacks() 1403 | 1404 | self.display_message( 1405 | 'Cleaned DataFrame output to "' 1406 | + self.name 1407 | + '_cleaned". ' 1408 | + "Reload DataCleaner to refresh list." 1409 | ) 1410 | 1411 | def _delete_step(self, step): 1412 | self.add_button.layout.visibility = "hidden" 1413 | self.delete_step_callback.send_callbacks(step) 1414 | 1415 | def _enter_add_mode(self): 1416 | self.add_button.layout.visibility = "hidden" 1417 | for pipeline_step_widget in self.pipeline_step_widgets: 1418 | pipeline_step_widget._set_inactive_style() 1419 | self.add_mode_callback.send_callbacks() 1420 | self.display_message("") 1421 | 1422 | def display_message(self, message): 1423 | self.info_label.value = message 1424 | 1425 | 1426 | class PipelineStepWidgetController(object): 1427 | """Container widget for a single step of the processing pipeline""" 1428 | 1429 | def __init__(self, step): 1430 | 1431 | select_box = ipywidgets.Select( 1432 | options=step.description.replace(", ", "\n").splitlines(), 1433 | rows=3, 1434 | disabled=False, 1435 | layout=ipywidgets.Layout(width="200px"), 1436 | ) 1437 | 1438 | self.modify_button = ipywidgets.ToggleButton( 1439 | layout=ipywidgets.Layout(height="25px", width="98%") 1440 | ) 1441 | 1442 | self.delete_button = ipywidgets.Button( 1443 | description="Delete Step", 1444 | layout=ipywidgets.Layout( 1445 | height="25px", width="98%", visibility="hidden" 1446 | ), 1447 | button_style="warning", 1448 | ) 1449 | 1450 | self.widget = ipywidgets.VBox( 1451 | [self.modify_button, select_box, self.delete_button], 1452 | layout=ipywidgets.Layout(min_width="200px"), 1453 | ) 1454 | 1455 | self.step = step 1456 | self.modify_step_callback = CallbackManager() 1457 | self.stop_modifying_callback = CallbackManager() 1458 | self.delete_step_callback = CallbackManager() 1459 | 1460 | self.modify_button.observe(self._modify_button_on_click, names="value") 1461 | self.delete_button.on_click( 1462 | lambda _: self.delete_step_callback.send_callbacks(self.step) 1463 | ) 1464 | 1465 | self._set_inactive_style() 1466 | 1467 | def _modify_button_on_click(self, value): 1468 | if value["new"] is True: 1469 | self.modify_step_callback.send_callbacks(self.step) 1470 | else: 1471 | self.stop_modifying_callback.send_callbacks() 1472 | 1473 | def _set_active_style(self): 1474 | self.modify_button.button_style = "primary" 1475 | self.modify_button.description = "Modifying" 1476 | self.modify_button.value = True 1477 | self.delete_button.layout.visibility = None # This means visible. 1478 | 1479 | def _set_inactive_style(self): 1480 | self.modify_button.button_style = "" 1481 | self.modify_button.description = "Modify" 1482 | 1483 | self.modify_button.unobserve( 1484 | self._modify_button_on_click, names="value" 1485 | ) 1486 | self.modify_button.value = False 1487 | self.modify_button.observe(self._modify_button_on_click, names="value") 1488 | 1489 | self.delete_button.layout.visibility = "hidden" 1490 | --------------------------------------------------------------------------------