8 |
--------------------------------------------------------------------------------
/jupyter-lite.json:
--------------------------------------------------------------------------------
1 | {
2 | "jupyter-lite-schema-version": 0,
3 | "jupyter-config-data": {
4 | "disabledExtensions": [
5 | "@jupyterlab/drawio-extension",
6 | "jupyterlab-kernel-spy",
7 | "jupyterlab-tour"
8 | ],
9 | "litePluginSettings": {
10 | "@jupyterlite/pyodide-kernel-extension:kernel": {
11 | "pyodideUrl": "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/pyodide.js"
12 | }
13 | }
14 | }
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | =====================================================
2 | Machine-learning on dirty data in Python: a tutorial
3 | =====================================================
4 |
5 | Often in data science, machine-learning applications spend significant
6 | effort preparing, tidying, and cleaning the data before the actual
7 | machine learning.
8 |
9 | Here we give a set of Python tutorials on how some of these operations
10 | can be simplified with adequate machine-learning tools.
11 |
12 | .. include:: gen_notes/index.rst
13 | :start-line: 2
14 | :end-before: .. rst-class:: sphx-glr-signature
15 |
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2021, dirty-data-science
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/_static/scrolltoc.js:
--------------------------------------------------------------------------------
1 | function updateSideBarPosition(sections) {
2 | var pos = $(window).scrollTop();
3 |
4 | // Highlight the current section
5 | var i = 0;
6 | var current_section = 0;
7 | $('a.internal').removeClass('active');
8 | $('ul.active').removeClass('active');
9 | $('li.preactive').removeClass('preactive');
10 | for(i in sections) {
11 | if(sections[i] > pos) {
12 | break
13 | }
14 | // Keep track of the last section that starts above the scroll position
15 | current_section = i
16 | if($('a.internal[href$="' + i + '"]').is(':visible')){
17 | current_section = i
18 | }
19 | }
20 | $('a.internal[href$="' + current_section + '"]').addClass('active');
21 | $('a.internal[href$="' + current_section + '"]').parent().parent().addClass('active')
22 | $('a.internal[href$="' + current_section + '"]').parent().parent().parent().addClass('preactive')
23 | $('a.internal[href$="' + current_section + '"]').parent().parent().parent().parent().parent().addClass('preactive')
24 | }
25 |
26 | $(function() {
27 | var sections = {};
28 | var url = document.URL.replace(/#.*$/, "");
29 |
30 | // Grab positions of our sections
31 | $('.headerlink').each(function(){
32 | sections[this.href.replace(url, '')] = $(this).offset().top - 150
33 | });
34 |
35 | updateSideBarPosition(sections);
36 | $(window).scroll(function(event) {
37 | updateSideBarPosition(sections)
38 | });
39 |
40 | $(window).resize(function(event) {
41 | updateSideBarPosition(sections)
42 | });
43 | });
44 |
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # vim
132 | *~
133 | *.swp
134 |
--------------------------------------------------------------------------------
/_static/piggy.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | PYTHON = python3
6 | SPHINXOPTS =
7 | SPHINXBUILD = $(PYTHON) -m sphinx
8 |
9 | ALLSPHINXOPTS = -d build/doctrees $(SPHINXOPTS) .
10 |
11 |
12 | .PHONY: all help clean cleandoctrees html html-noplot web pickle linkcheck test test-stop-when-failing
13 |
14 | all: html-noplot
15 |
16 | help:
17 | @echo "Please use \`make ' where is one of"
18 | @echo " html to make standalone HTML files"
19 | @echo " pickle to make pickle files (usable by e.g. sphinx-web)"
20 | @echo " htmlhelp to make HTML files and a HTML help project"
21 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
22 | @echo " pdf to make PDF from LaTeX, you can set PAPER=a4 or PAPER=letter"
23 | @echo " changes to make an overview over all changed/added/deprecated items"
24 | @echo " linkcheck to check all external links for integrity"
25 | @echo " install to upload to github the web pages"
26 | @echo " zip to create the zip file with examples and doc"
27 |
28 | clean:
29 | -rm -rf build/*
30 | -rm -rf intro/scipy/auto_examples/ intro/matplotlib/auto_examples/ intro/summary-exercises/auto_examples advanced/mathematical_optimization/auto_examples/ advanced/advanced_numpy/auto_examples/ advanced/image_processing/auto_examples advanced/scipy_sparse/auto_examples packages/3d_plotting/auto_examples packages/statistics/auto_examples/ packages/scikit-image/auto_examples/ packages/scikit-learn/auto_examples intro/numpy/auto_examples guide/auto_examples
31 |
32 | test:
33 | MATPLOTLIBRC=build_tools $(PYTHON) -m pytest --doctest-glob '*.rst' --ignore advanced/advanced_numpy/examples/myobject_test.py --ignore advanced/interfacing_with_c/numpy_c_api/test_cos_module_np.py --ignore advanced/interfacing_with_c/ctypes/cos_module.py --ignore advanced/interfacing_with_c/swig_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/cython_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/numpy_shared/test_cos_doubles.py
34 |
35 | test-stop-when-failing:
36 | MATPLOTLIBRC=build_tools $(PYTHON) -m pytest -x --doctest-glob '*.rst' --ignore advanced/advanced_numpy/examples/myobject_test.py --ignore advanced/interfacing_with_c/numpy_c_api/test_cos_module_np.py --ignore advanced/interfacing_with_c/ctypes/cos_module.py --ignore advanced/interfacing_with_c/swig_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/cython_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/numpy_shared/test_cos_doubles.py
37 |
38 | html-noplot:
39 | $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) build/html
40 | @echo
41 | @echo "Build finished. The HTML pages are in build/html."
42 |
43 | html:
44 | mkdir -p build/html build/doctrees
45 | # This line makes the build a bit more lengthy, but makes the
46 | # embedding of images more robust
47 | rm -rf build/html/_images
48 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html
49 | @echo
50 | @echo "Build finished. The HTML pages are in build/html."
51 |
52 | cleandoctrees:
53 | rm -rf build/doctrees
54 |
55 | pickle:
56 | mkdir -p build/pickle build/doctrees
57 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle
58 | @echo
59 | @echo "Build finished; now you can process the pickle files or run"
60 | @echo " sphinx-web build/pickle"
61 | @echo "to start the sphinx-web server."
62 |
63 | web: pickle
64 |
65 | linkcheck:
66 | mkdir -p build/linkcheck build/doctrees
67 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck
68 | @echo
69 | @echo "Link check complete; look for any errors in the above output " \
70 | "or in build/linkcheck/output.txt."
71 |
72 |
--------------------------------------------------------------------------------
/_static/custom.css:
--------------------------------------------------------------------------------
1 | /** Styling **************************************************************/
2 |
3 | /* Main page title */
4 | div.body h1 {
5 | text-align: center;
6 | font-size: 270%;
7 | color: #055100;
8 | margin-bottom: 1em;
9 | }
10 |
11 | /* Secondary sections title */
12 | div.body h2 {
13 | color: #055100;
14 | margin-top: 2.5em;
15 | }
16 |
17 | div.body h3 {
18 | margin-top: 2em;
19 | }
20 |
21 | div.body h4 {
22 | margin-top: 1.5em;
23 | }
24 |
25 | div.body h3, div.body h4 {
26 | color: #000;
27 | }
28 |
29 | /* More horizontal space in pre (to fit 80char lines) */
30 | div.body pre {
31 | padding: 7px 30px 7px 10px;
32 | }
33 |
34 | @media only screen and (min-width: 1200px) {
35 | div.body pre, .sphx-glr-script-out .highlight pre {
36 | margin-right: -30px;
37 | }
38 | }
39 |
40 | div.body dl.footnote {
41 | font-size: smaller;
42 | }
43 |
44 | /* Side hanging footnotes */
45 | @media (min-width: 1500px) {
46 | div.body dl.footnote {
47 | position: absolute;
48 | right: 1ex;
49 | margin-top: -10ex;
50 | width: 17em;
51 | }
52 | }
53 |
54 | @media (min-width: 1650px) {
55 | div.body dl.footnote {
56 | width: 20em;
57 | }
58 |
59 | }
60 |
61 | @media (min-width: 1750px) {
62 | div.body dl.footnote {
63 | width: 25em;
64 | }
65 |
66 | }
67 |
68 | /** Sphinx gallery *******************************************************/
69 |
70 | /* The download note on top of the examples */
71 | div.sphx-glr-download-link-note {
72 | right: 2pt;
73 | top: 10pt;
74 | max-width: 15ex;
75 | float: right;
76 | padding: 0px 1px 10px 10px;
77 | background-color: #F6F6F6;
78 | font-size: small;
79 | }
80 |
81 | /* Hide the download python button (the notebook is enough) */
82 | div.sphx-glr-download-python {
83 | display: none;
84 | }
85 |
86 | /* The download block at the bottom */
87 | div.sphx-glr-footer-example {
88 | background-color: #F6F6F6;
89 | border: solid 1px #CCC;
90 | padding: 0px 5px;
91 | }
92 |
93 | div.sphx-glr-download code.download {
94 | word-break: break-word;
95 | font-size: small;
96 | }
97 |
98 |
99 | /* Large screens */
100 | @media only screen and (min-width: 1400px) {
101 | /* The download note on top of the examples */
102 | div.sphx-glr-download-link-note {
103 | display: none;
104 | }
105 |
106 | /* The download block at the bottom */
107 | div.sphx-glr-footer-example {
108 | position: absolute;
109 | right: 10px;
110 | max-width: 200px;
111 | top: 50px;
112 | }
113 | }
114 |
115 | @media only screen and (min-width: 1550px) {
116 | div.sphx-glr-footer-example {
117 | max-width: 300px;
118 | }
119 | }
120 |
121 | @media only screen and (min-width: 1750px) {
122 | div.sphx-glr-footer-example {
123 | max-width: 400px;
124 | }
125 | }
126 |
127 | div.sphx-glr-download-link-note p.admonition-title {
128 | display: none;
129 | }
130 |
131 | /* For screens larger than 1200px: alabaster default settings are that
132 | * the body is 940px wide and the sidebar 220px */
133 | @media only screen and (min-width: 1200px) {
134 | div.sphx-glr-download-link-note {
135 | position: absolute; /* Overrides the float behavior */
136 | }
137 | }
138 |
139 | div.sphx-glr-script-out, p.sphx-glr-script-out {
140 | margin-left: -3.5ex;
141 | }
142 |
143 | /** Sidebar **************************************************************/
144 |
145 | /* Hide the title in the navigation bar in the index page */
146 | /*div.sphinxsidebarwrapper li a {
147 | display: none;
148 | }*/
149 |
150 | div.sphinxsidebarwrapper h3.toc-title {
151 | font-size: unset;
152 | font-style: italic;
153 | text-decoration: underline;
154 | margin-bottom: 0px;
155 | }
156 |
157 | div.sphinxsidebarwrapper ul:first-of-type {
158 | margin-top: 0px;
159 | }
160 |
161 | /* Undo for children */
162 | div.sphinxsidebarwrapper li li a {
163 | display: block;
164 | }
165 |
166 | div.sphinxsidebarwrapper li li {
167 | margin: 1ex 0ex;
168 | }
169 |
170 | /* Only one level of titles */
171 | div.sphinxsidebarwrapper li li ul {
172 | display: none;
173 | }
174 |
175 | /* Undo for other pages */
176 | div.sphinxsidebarwrapper li.toctree-l1 a {
177 | display: block;
178 | }
179 |
180 | /* Less info in the navigation sidebar */
181 | /*div.sphinxsidebarwrapper h3:first-of-type {
182 | display: none;
183 | }*/
184 |
185 | /* Undo for children of enclosing divs */
186 | div.sphinxsidebarwrapper div h3:first-of-type {
187 | display: block;
188 | }
189 |
190 | /* The section links */
191 | div.sphinxsidebarwrapper li a {
192 | display: block;
193 | margin-left: -2.5ex;
194 | padding-left: 2.5ex;
195 | }
196 |
197 | div.sphinxsidebarwrapper ul ul {
198 | list-style: circle;
199 | }
200 |
201 | div.sphinxsidebarwrapper li li a:hover {
202 | border-right: #076B00 solid 5px;
203 | background-color: #F6F6F6;
204 | }
205 |
206 | /* The "active" sections */
207 | div.sphinxsidebarwrapper li li.preactive li ul {
208 | display: none;
209 | }
210 |
211 | div.sphinxsidebarwrapper li li.preactive li ul.active {
212 | display: block;
213 | }
214 |
215 | div.sphinxsidebarwrapper li li.preactive > a:first-child {
216 | border-right: #C1CEC1 solid 5px;
217 | }
218 |
219 | div.sphinxsidebarwrapper li ul.active ul {
220 | display: block;
221 | font-size: smaller;
222 | }
223 |
224 | div.sphinxsidebarwrapper li li a.active {
225 | border-right: #055100 solid 5px;
226 | background-color: #F3F3F3;
227 | text-decoration: none;
228 | }
229 |
230 | div.sphinxsidebarwrapper > ul > li.preactive li ul {
231 | display: none;
232 | }
233 |
234 | div.sphinxsidebarwrapper > ul > li.preactive li ul.active {
235 | display: block;
236 | }
237 |
238 | div.sphinxsidebarwrapper li li.preactive ul {
239 | display: block;
240 | }
241 |
242 | div.sphinxsidebarwrapper li li a.active + ul {
243 | display: block;
244 | font-size: smaller;
245 | }
246 |
247 | div.sphinxsidebarwrapper li li ul.active {
248 | display: block;
249 | }
250 |
251 | div.sphinxsidebarwrapper li li ul {
252 | font-size: smaller;
253 | }
254 |
255 |
256 | /* Flush the sidebar more to the left */
257 | @media (min-width: 1300px) {
258 | div.sphinxsidebar {
259 | left: 5%;
260 | width: 300px;
261 | }
262 |
263 | div.sphinxsidebar p.logo {
264 | width: 220px;
265 | }
266 | }
267 |
268 | @media (min-width: 1500px) {
269 | div.sphinxsidebar {
270 | left: 10%;
271 | }
272 | }
273 |
274 | /*************************************************************************/
275 |
276 | /* My custom classes */
277 | p.right-align {
278 | float: right;
279 | }
280 |
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is execfile()d with the current directory set to its
4 | # containing dir.
5 | #
6 | # Note that not all possible configuration values are present in this
7 | # autogenerated file.
8 | #
9 | # All configuration values have a default; values that are commented out
10 | # serve to show the default.
11 |
12 | # If extensions (or modules to document with autodoc) are in another directory,
13 | # add these directories to sys.path here. If the directory is relative to the
14 | # documentation root, use os.path.abspath to make it absolute, like shown here.
15 | #
16 | import os
17 |
18 |
19 | # -- General configuration ------------------------------------------------
20 |
21 | # If your documentation needs a minimal Sphinx version, state it here.
22 | #
23 | # needs_sphinx = '1.0'
24 |
25 | # Add any Sphinx extension module names here, as strings. They can be
26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
27 | # ones.
28 | extensions = ['sphinx.ext.autodoc',
29 | 'sphinx.ext.doctest',
30 | 'sphinx.ext.intersphinx',
31 | 'sphinx.ext.mathjax',
32 | 'sphinx.ext.viewcode',
33 | 'sphinx.ext.githubpages',
34 | 'sphinx_gallery.gen_gallery',
35 | 'jupyterlite_sphinx',
36 | ]
37 |
38 | try:
39 | import sphinxext.opengraph
40 | extensions.append('sphinxext.opengraph')
41 | except ImportError:
42 | print("ERROR: sphinxext.opengraph import failed")
43 |
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ['_templates']
46 |
47 | # The suffix(es) of source filenames.
48 | # You can specify multiple suffix as a list of string:
49 | #
50 | # source_suffix = ['.rst', '.md']
51 | source_suffix = '.rst'
52 |
53 | # The master toctree document.
54 | master_doc = 'index'
55 |
56 | # General information about the project.
57 | project = u'Dirty data science'
58 | author = u'Gaël Varoquaux'
59 | copyright = u'2021, ' + author
60 |
61 | # The version info for the project you're documenting, acts as replacement for
62 | # |version| and |release|, also used in various other places throughout the
63 | # built documents.
64 | #
65 | version = '2021.1'
66 | # The full version, including alpha/beta/rc tags.
67 | release = version
68 |
69 | # The language for content autogenerated by Sphinx. Refer to documentation
70 | # for a list of supported languages.
71 | #
72 | # This is also used if you do content translation via gettext catalogs.
73 | # Usually you set "language" from the command line for these cases.
74 | language = None
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | # This patterns also effect to html_static_path and html_extra_path
79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
80 |
81 | # The name of the Pygments (syntax highlighting) style to use.
82 | pygments_style = 'sphinx'
83 |
84 | # If true, `todo` and `todoList` produce output, else they produce nothing.
85 | todo_include_todos = False
86 |
87 |
88 | # -- Options for HTML output ----------------------------------------------
89 |
90 | # The theme to use for HTML and HTML Help pages. See the documentation for
91 | # a list of builtin themes.
92 | #
93 | html_theme = 'alabaster'
94 | # Doc: https://alabaster.readthedocs.io/en/latest/customization.html
95 |
96 | html_sidebars = {
97 | '**': [
98 | 'about.html',
99 | #'globallinks.html',
100 | 'localtoc.html',
101 | 'relations.html',
102 | #'searchbox.html',
103 | ],
104 | 'index': [
105 | 'about.html',
106 | 'localtoc.html',
107 | 'relations.html',
108 | #'searchbox.html',
109 | ]
110 | }
111 |
112 | # Theme options are theme-specific and customize the look and feel of a theme
113 | # further. For a list of options available for each theme, see the
114 | # documentation.
115 | #
116 | html_theme_options = {
117 | 'logo': 'piggy.svg',
118 | 'github_user': 'dirty-data-science',
119 | 'github_repo': 'python',
120 | 'github_button': 'true',
121 | 'github_type': 'star',
122 | 'github_count': 'true',
123 | 'show_powered_by': 'false',
124 | 'logo_name': 'true',
125 | 'gray_1': "#030",
126 | 'gray_2': "#F1FFF1",
127 | 'link': "#076B00",
128 | # 'gray_3': "#090",
129 | 'fixed_sidebar': 'true',
130 | 'note_bg': "rgb(246, 248, 250);",
131 | #'topic_bg': "rgb(246, 248, 250);",
132 | }
133 |
134 | # Add any paths that contain custom static files (such as style sheets) here,
135 | # relative to this directory. They are copied after the builtin static files,
136 | # so a file named "default.css" will overwrite the builtin "default.css".
137 | html_static_path = ['_static']
138 |
139 |
140 | # Modify the title, so as to get good social-media links
141 | html_title = "— Dirty data science"
142 |
143 |
144 | # Configuration for intersphinx
145 | intersphinx_mapping = {
146 | 'python': ('https://docs.python.org/3/', None),
147 | 'numpy': ('https://docs.scipy.org/doc/numpy', None),
148 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None),
149 | 'matplotlib': ('https://matplotlib.org/', None),
150 | 'sklearn': ('https://scikit-learn.org/stable', None),
151 | 'skimage': ('http://scikit-image.org/docs/stable/', None),
152 | 'mayavi': ('http://docs.enthought.com/mayavi/mayavi/', None),
153 | 'statsmodels': ('http://www.statsmodels.org/stable/', None),
154 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None),
155 | 'seaborn': ('http://seaborn.pydata.org/', None),
156 | 'skrub': ('https://skrub-data.org/stable/', None),
157 | }
158 |
159 |
160 | # -- sphinx-gallery configuration -----------------------------------------
161 | from sphinx_gallery.sorting import FileNameSortKey
162 | sphinx_gallery_conf = {
163 | 'filename_pattern': '',
164 | 'backreferences_dir': os.path.join('generated'),
165 | 'reference_url': {
166 | # 'dirty_cat': 'https://dirty-cat.github.io/stable/',
167 | 'numpy': 'http://docs.scipy.org/doc/numpy',
168 | # 'scipy': 'http://docs.scipy.org/doc/scipy/reference',
169 | # 'pandas': 'http://pandas.pydata.org/pandas-docs/stable',
170 | # 'seaborn': 'http://seaborn.pydata.org/',
171 | 'matplotlib': 'http://matplotlib.org/stable',
172 | 'sklearn': 'http://scikit-learn.org/stable',
173 | # #'scikit-image': 'http://scikit-image.org/docs/stable/',
174 | # #'mayavi': 'http://docs.enthought.com/mayavi/mayavi/',
175 | #'statsmodels': 'http://www.statsmodels.org/stable/',
176 | },
177 | 'examples_dirs':'notes',
178 | 'gallery_dirs':'gen_notes',
179 | 'within_subsection_order': FileNameSortKey,
180 | 'download_all_examples': False,
181 | 'binder': {
182 | 'org': 'dirty-data-science',
183 | 'repo': 'python',
184 | 'binderhub_url': 'https://mybinder.org',
185 | 'branch': 'gh-pages',
186 | 'dependencies': ['requirements.txt',],
187 | 'notebooks_dir': 'notes'
188 | },
189 | 'jupyterlite': {
190 | 'use_jupyter_lab': False,
191 | },
192 | "inspect_global_variables": False,
193 | }
194 |
195 | # -- sphinxext.opengraph configuration -------------------------------------
196 | ogp_site_url = "https://dirtydata.science/python"
197 | ogp_image = "https://dirtydata.science/python/_static/piggy.svg"
198 | ogp_use_first_image = True
199 | ogp_site_name = "Dirty Data Science"
200 |
201 |
202 | # -- The javascript to highlight the toc as we scroll ----------------------
203 | html_js_files = ['scrolltoc.js']
204 |
--------------------------------------------------------------------------------
/notes/02_dirty_categories.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================================
3 | Dirty categories: learning with non-normalized strings
4 | ========================================================
5 |
6 | Including strings that represent categories often calls for much data
7 | preparation. In particular, categories may appear with many morphological
8 | variants, when they have been manually input, or assembled from diverse
9 | sources.
10 |
11 | Including such a column in a learning pipeline as a standard categorical
12 | column leads to categories with very high cardinalities and can lose
13 | information on which categories are similar.
14 |
15 | Here we look at a dataset on wages [#]_ where the column *Employee
16 | Position Title* contains dirty categories.
17 |
18 | .. [#] https://catalog.data.gov/dataset/employee-salaries-2016
19 |
20 | We compare different categorical encodings for the dirty column to
21 | predict the *Current Annual Salary*,
22 | using gradient boosted trees. For this purpose, we use the skrub
23 | library ( https://skrub-data.org ).
24 |
25 | """
26 |
27 | # %%
28 | #
29 | # .. |SV| replace::
30 | # :class:`~skrub.TableVectorizer`
31 | #
32 | # .. |tabular_learner| replace::
33 | # :func:`~skrub.tabular_learner`
34 | #
35 | # .. |OneHotEncoder| replace::
36 | # :class:`~sklearn.preprocessing.OneHotEncoder`
37 | #
38 | # .. |RandomForestRegressor| replace::
39 | # :class:`~sklearn.ensemble.RandomForestRegressor`
40 | #
41 | # .. |SE| replace:: :class:`~skrub.SimilarityEncoder`
42 | #
43 | # .. |GapEncoder| replace:: :class:`~skrub.GapEncoder`
44 | #
45 | # .. |permutation importances| replace::
46 | # :func:`~sklearn.inspection.permutation_importance`
47 | #
48 | #
49 | # The data
50 | # ========
51 | #
52 | # Data Importing and preprocessing
53 | # --------------------------------
54 | #
55 | # We first download the dataset:
56 | from skrub.datasets import fetch_employee_salaries
57 | employee_salaries = fetch_employee_salaries()
58 | print(employee_salaries.description)
59 |
60 | # %%
61 | # Then we load it:
62 | import pandas as pd
63 | df = employee_salaries.X.copy()
64 | df
65 |
66 | # %%
67 | # Recover the target
68 |
69 | y = employee_salaries.y
70 |
71 | # %%
72 | #
73 | # A simple default as a learner
74 | # ===============================
75 | #
76 | # The function |tabular_learner| is a simple way of creating a default
77 | # learner for tabular data:
78 | from skrub import tabular_learner
79 | model = tabular_learner("regressor")
80 |
81 | # %%
82 | # We can quickly compute its cross-validation score using the
83 | # corresponding scikit-learn utility
84 | from sklearn.model_selection import cross_validate
85 | import numpy as np
86 |
87 | results = cross_validate(model, df, y)
88 | print(f"Prediction score: {np.mean(results['test_score'])}")
89 | print(f"Training time: {np.mean(results['fit_time'])}")
90 |
91 | # %%
92 | # Under the hood, `model` is a pipeline:
93 | model
94 |
95 | # %%
96 | # We can see that it is made of a |SV| followed by a
97 | # HistGradientBoostingRegressor.
98 |
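# %%
# As a quick check (a minimal sketch, assuming ``model`` is a regular
# scikit-learn :class:`~sklearn.pipeline.Pipeline`), we can list its step
# names explicitly:
list(model.named_steps)
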
99 | # %%
100 | # Understanding the vectorizer + learner pipeline
101 | # ==================================================
102 | #
103 | # The number one difficulty is that our input is a complex and
104 | # heterogeneous dataframe:
105 | df
106 |
107 | # %%
108 | # The |SV| is a transformer that turns this dataframe into a
109 | # form suited for machine learning.
110 | #
111 | # Feeding its output to a powerful learner,
112 | # such as gradient boosted trees, gives **a machine-learning method that
113 | # can be readily applied to the dataframe**.
114 | from skrub import TableVectorizer
115 |
116 | # %%
117 | # Assembling the pipeline
118 | # ---------------------------
119 | #
120 |
121 | # %%
122 | # We use the |SV| with a HistGradientBoostingRegressor, which is a good
123 | # predictor for data with heterogeneous columns
124 | from sklearn.ensemble import HistGradientBoostingRegressor
125 |
126 | # %%
127 | # We then create a pipeline chaining our encoders to a learner
128 | from sklearn.pipeline import make_pipeline
129 |
130 | pipeline = make_pipeline(
131 | TableVectorizer(),
132 | HistGradientBoostingRegressor()
133 | )
134 | pipeline
135 |
136 | # %%
137 | # Note that it is almost the same model as above (can you spot the
138 | # differences?)
139 | #
140 | # Let's perform a cross-validation to see how well this model predicts
141 |
142 | results = cross_validate(pipeline, df, y)
143 | print(f"Prediction score: {np.mean(results['test_score'])}")
144 | print(f"Training time: {np.mean(results['fit_time'])}")
145 |
146 |
147 | # %%
148 | # The prediction performance here is pretty much as good as above,
149 | # but the code here is much simpler as it does not involve specifying
150 | # columns manually.
151 |
152 | # %%
153 | # Analyzing the features created
154 | # -------------------------------
155 | #
156 | # Let us perform the same workflow, but without the `Pipeline`, so we can
157 | # analyze its mechanisms along the way.
158 | tab_vec = TableVectorizer()
159 |
160 | # %%
161 | # We split the data between train and test, and transform them:
162 | from sklearn.model_selection import train_test_split
163 | df_train, df_test, y_train, y_test = train_test_split(
164 | df, y, test_size=0.15, random_state=42
165 | )
166 |
167 | X_train_enc = tab_vec.fit_transform(df_train, y_train)
168 | X_test_enc = tab_vec.transform(df_test)
169 |
170 | # %%
171 | # The encoded data, X_train_enc and X_test_enc are numerical arrays:
172 | X_train_enc
173 |
174 | # %%
175 | # They have more columns than the original dataframe, but not much more:
176 | X_train_enc.shape
177 |
178 | # %%
179 | # Inspecting the features created
180 | # .................................
181 | #
182 | # The |SV| assigns a transformer for each column. We can inspect this
183 | # choice:
184 | tab_vec.transformers_
185 |
186 | # %%
187 | # This is what is being passed to transform the different columns under the hood.
188 | # We can notice it classified the columns "gender" and "assignment_category"
189 | # as low cardinality string variables.
190 | # A |OneHotEncoder| will be applied to these columns.
191 | #
192 | # The vectorizer actually makes the difference between string variables
193 | # (data type ``object`` and ``string``) and categorical variables
194 | # (data type ``category``).
195 | #
196 | # Next, we can have a look at the encoded feature names.
197 | #
198 | # Before encoding:
199 | df.columns.to_list()
200 |
201 | # %%
202 | # After encoding (we only display the first 8 feature names):
203 | feature_names = tab_vec.get_feature_names_out()
204 | feature_names[:8]
205 |
206 | # %%
207 | # As we can see, the low-cardinality columns were one-hot encoded, while
208 | # the high-cardinality column "division" was encoded with the
209 | # |GapEncoder| (the default choice, see |SV|'s docstring), which derives
210 | # readable feature names from the words found in the categories.
211 | #
212 | # In total, we have a reasonable number of encoded columns:
213 | len(feature_names)
214 |
215 |
216 | # %%
217 | # Feature importance in the statistical model
218 | # ---------------------------------------------
219 | #
220 | # Here we consider interpretability and plot the feature importances of a
221 | # regressor. We can do this because the |GapEncoder| leads to
222 | # interpretable features even with messy categories.
223 | #
224 | # .. topic:: Note:
225 | #
226 | #    To minimize compute time, we use the impurity-based feature importances
227 | #    computed by the |RandomForestRegressor|; |permutation importances| are
228 | #    preferable (less subject to biases); a short sketch is given at the end of this section.
229 | #
230 | # First, let's train the |RandomForestRegressor|,
231 |
232 | from sklearn.ensemble import RandomForestRegressor
233 | regressor = RandomForestRegressor()
234 | regressor.fit(X_train_enc, y_train)
235 |
236 |
237 | # %%
238 | # Retrieving the feature importances
239 | importances = regressor.feature_importances_
240 | std = np.std(
241 | [
242 | tree.feature_importances_
243 | for tree in regressor.estimators_
244 | ],
245 | axis=0
246 | )
247 | indices = np.argsort(importances)[::-1]
248 |
249 | # %%
250 | # Plotting the results:
251 |
252 | import matplotlib.pyplot as plt
253 | plt.figure(figsize=(12, 9))
254 | plt.title("Feature importances")
255 | n = 20
256 | n_indices = indices[:n]
257 | labels = np.array(feature_names)[n_indices]
258 | plt.barh(range(n), importances[n_indices], color="b", yerr=std[n_indices])
259 | plt.yticks(range(n), labels, size=15)
260 | plt.tight_layout(pad=1)
261 | plt.show()
262 |
263 | # %%
264 | # We can deduce from this data that the three factors that define the
265 | # most the salary are: being hired for a long time, being a manager, and
266 | # having a permanent, full-time job :).
267 |
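# %%
# As an optional, more costly check (a minimal sketch: ``n_repeats=2`` is kept
# deliberately small here only to limit computation time), the less biased
# |permutation importances| can be computed on the held-out test data:
from sklearn.inspection import permutation_importance

# Permute each encoded feature a couple of times and measure the drop in score
permutation = permutation_importance(regressor, X_test_enc, y_test,
                                     n_repeats=2)
permutation.importances_mean[:8]
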
268 |
269 | # %%
270 | #
271 | # Exploring different machine-learning pipelines to encode the data
272 | # ======================================================================
273 | #
274 | # The learning pipeline
275 | # ----------------------------
276 | #
277 | # To build a learning pipeline, we need to assemble encoders for each
278 | # column, and apply a supervised learning model on top.
279 |
280 | # %%
281 | # Encoding the table
282 | # ........................
283 | #
284 | # The TableVectorizer applies different transformations to the different
285 | # columns to turn them into numerical values suitable for learning
286 |
287 | from skrub import TableVectorizer
288 | encoder = TableVectorizer()
289 |
290 | # %%
291 | # Pipelining an encoder with a learner
292 | # ....................................
293 | #
294 | # Here again we use a pipeline with HistGradientBoostingRegressor
295 | from sklearn.ensemble import HistGradientBoostingRegressor
296 | pipeline = make_pipeline(encoder, HistGradientBoostingRegressor())
297 |
298 | # %%
299 | # The pipeline can be readily applied to the dataframe for prediction
300 | pipeline.fit(df, y)
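
# %%
# For instance (a quick sanity check on the training data itself, shown only
# for illustration), predictions can now be obtained directly from the
# dataframe:
pipeline.predict(df)[:5]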
301 | 
302 | # %%
303 | # The categorical encoders
304 | # ........................
305 | #
306 | # An encoder is needed to turn a categorical column into a numerical representation.
307 | from sklearn.preprocessing import OneHotEncoder
308 |
309 | one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
310 |
311 | # %%
312 | # Dirty-category encoding
313 | # -------------------------
314 | #
315 | # The one-hot encoder is actually not well suited to the 'Employee
316 | # Position Title' column, as this column contains 400 different entries.
317 | #
318 | # We will now experiment with different encoders for dirty columns
319 | from skrub import SimilarityEncoder, MinHashEncoder,\
320 | GapEncoder
321 | from sklearn.preprocessing import TargetEncoder
322 |
323 | similarity = SimilarityEncoder()
324 | target = TargetEncoder()
325 | minhash = MinHashEncoder(n_components=100)
326 | gap = GapEncoder(n_components=100)
327 |
328 | encoders = {
329 | 'one-hot': one_hot,
330 | 'similarity': similarity,
331 | 'target': target,
332 | 'minhash': minhash,
333 | 'gap': gap}
334 |
335 | # %%
336 | # We now loop over the different encoding methods,
337 | # instantiating a new pipeline each time, fitting it,
338 | # and storing the returned cross-validation scores:
339 |
340 | all_scores = dict()
341 |
342 | for name, method in encoders.items():
343 | encoder = TableVectorizer(high_cardinality=method)
344 |
345 | pipeline = make_pipeline(encoder, HistGradientBoostingRegressor())
346 | scores = cross_validate(pipeline, df, y)
347 | print('{} encoding'.format(name))
348 | print('r2 score: mean: {:.3f}; std: {:.3f}'.format(
349 | np.mean(scores['test_score']), np.std(scores['test_score'])))
350 | print('time: {:.3f}\n'.format(
351 | np.mean(scores['fit_time'])))
352 | all_scores[name] = scores['test_score']
353 |
354 | # %%
355 | # Note that the fitting time also varies a lot across encoders, not only
356 | # the prediction score.
357 |
358 | # %%
359 | # Plotting the results
360 | # .....................
361 | # Finally, we plot the scores on a boxplot:
362 |
363 | import seaborn
364 | import matplotlib.pyplot as plt
365 | plt.figure(figsize=(4, 3))
366 | ax = seaborn.boxplot(data=pd.DataFrame(all_scores), orient='h')
367 | plt.ylabel('Encoding', size=20)
368 | plt.xlabel('Prediction accuracy ', size=20)
369 | plt.yticks(size=20)
370 | plt.tight_layout()
371 |
372 | # %%
373 | # The clear trend is that encoders that use the string form
374 | # of the category (similarity, minhash, and gap) perform better than
375 | # those that discard it.
376 | #
377 | # SimilarityEncoder is the best performer, but it is less scalable on big
378 | # data than MinHashEncoder and GapEncoder. The most scalable encoder is
379 | # the MinHashEncoder. GapEncoder, on the other hand, has the benefit that
380 | # it provides interpretable features, as shown above
381 | #
382 | # |
383 | #
384 | #
385 | # .. topic:: The TableVectorizer automates preprocessing
386 | #
387 | # As this notebook demonstrates, many preprocessing steps can be
388 | # automated by the |SV|, and the resulting pipeline can still be
389 | # inspected, even with non-normalized entries.
390 | #
391 |
--------------------------------------------------------------------------------
/notes/01_missing_values.py:
--------------------------------------------------------------------------------
1 | """
2 | =========================================
3 | Machine learning with missing values
4 | =========================================
5 |
6 | Here we use simulated data to understand the fundamentals of statistical
7 | learning with missing values.
8 |
9 | This notebook reveals why a HistGradientBoostingRegressor (
10 | :class:`sklearn.ensemble.HistGradientBoostingRegressor` ) is a good choice to
11 | predict with missing values.
12 |
13 | We use simulations to control the missing-value mechanism, and inspect
14 | its impact on predictive models. In particular, standard imputation
15 | procedures can reconstruct missing values without distortion only if the
16 | data is *missing at random*.
17 |
18 | A good introduction to the mathematics behind this notebook can be found in
19 | https://arxiv.org/abs/1902.06931
20 |
21 | .. topic:: **Missing values in categorical data**
22 |
23 | If a categorical column has missing values, the simplest approach is
24 | to create a specific category "missing" and assign missing values to
25 |     this new category, to represent missingness in the classifier (a short
26 |     sketch is given right after this introduction). Indeed, as we will see,
27 |     imputation is not crucial for prediction. In the following we focus on
28 |     continuous columns, where the discrete nature of a missing value poses more problems.
29 |
30 | """
31 |
32 |
33 | # %%
34 | # The fully-observed data: a toy regression problem
35 | # ==================================================
36 | #
37 | # We consider a simple regression problem where X (the data) is bivariate
38 | # gaussian, and y (the prediction target) is a linear function of the first
39 | # coordinate, with noise.
40 | #
41 | # The data-generating mechanism
42 | # ------------------------------
43 |
44 | import numpy as np
45 |
46 | def generate_without_missing_values(n_samples, rng=42):
47 | mean = [0, 0]
48 | cov = [[1, 0.9], [0.9, 1]]
49 | if not isinstance(rng, np.random.RandomState):
50 | rng = np.random.RandomState(rng)
51 | X = rng.multivariate_normal(mean, cov, size=n_samples)
52 |
53 | epsilon = 0.1 * rng.randn(n_samples)
54 | y = X[:, 0] + epsilon
55 |
56 | return X, y
57 |
58 | # %%
59 | # A quick plot reveals what the data looks like
60 |
61 | import matplotlib.pyplot as plt
62 | plt.rcParams['figure.figsize'] = (5, 4) # Smaller default figure size
63 |
64 | plt.figure()
65 | X_full, y_full = generate_without_missing_values(1000)
66 | plt.scatter(X_full[:, 0], X_full[:, 1], c=y_full)
67 | plt.colorbar(label='y')
68 |
69 | # %%
70 | # Missing completely at random settings
71 | # ======================================
72 | #
73 | # We now consider missing completely at random settings (a special case
74 | # of missing at random): the missingness is completely independent from
75 | # the values.
76 | #
77 | # The missing-values mechanism
78 | # -----------------------------
79 |
80 | def generate_mcar(n_samples, missing_rate=.5, rng=42):
81 | X, y = generate_without_missing_values(n_samples, rng=rng)
82 | if not isinstance(rng, np.random.RandomState):
83 | rng = np.random.RandomState(rng)
84 |
85 | M = rng.binomial(1, missing_rate, (n_samples, 2))
86 | np.putmask(X, M, np.nan)
87 |
88 | return X, y
89 |
90 | # %%
91 | # A quick plot to look at the data
92 | X, y = generate_mcar(500)
93 |
94 | plt.figure()
95 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', label='All data')
96 | plt.scatter(X[:, 0], X[:, 1], c=y, label='Fully observed')
97 | plt.colorbar(label='y')
98 | plt.legend()
99 |
100 | # %%
101 | # We can see that the distribution of the fully-observed data is the same
102 | # as that of the original data.
103 | #
104 | # Conditional Imputation with the IterativeImputer
105 | # ------------------------------------------------
106 | #
107 | # As the data is MAR (missing at random), an imputer can use the
108 | # conditional dependencies between the observed and the missing values to
109 | # impute the missing values.
110 | #
111 | # We'll use the IterativeImputer, a good imputer, but it needs to be enabled
112 | from sklearn.experimental import enable_iterative_imputer
113 | from sklearn import impute
114 | iterative_imputer = impute.IterativeImputer()
115 |
116 | # %%
117 | # Let us try the imputer on the small data used to visualize
118 | #
119 | # **The imputation is learned by fitting the imputer on the data with
120 | # missing values**
121 | iterative_imputer.fit(X)
122 |
123 | # %%
124 | # **The data are imputed with the transform method**
125 | X_imputed = iterative_imputer.transform(X)
126 |
127 | # %%
128 | # We can display the imputed data in the same way as our previous visualization
129 | plt.figure()
130 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5',
131 | label='All data', alpha=.5)
132 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X',
133 | label='Imputed')
134 | plt.colorbar(label='y')
135 | plt.legend()
136 |
137 | # %%
138 | # We can see that the imputer did a fairly good job of recovering the
139 | # data distribution
140 | #
141 | # Supervised learning: imputation and a linear model
142 | # -----------------------------------------------------------
143 | #
144 | # Given that the relationship between the fully-observed X and y is a
145 | # linear relationship, it seems natural to use a linear model for
146 | # prediction. It must be adapted to missing values using imputation.
147 | #
148 | # To use it in supervised setting, we will pipeline it with a linear
149 | # model, using a ridge, which is a good default linear model
150 | from sklearn.pipeline import make_pipeline
151 | from sklearn.linear_model import RidgeCV
152 |
153 | iterative_and_ridge = make_pipeline(impute.IterativeImputer(), RidgeCV())
154 |
155 | # %%
156 | # We can evaluate the model performance in a cross-validation loop
157 | # (for better evaluation accuracy, we slightly increase the number of
158 | # folds to 10):
159 | from sklearn import model_selection
160 | scores_iterative_and_ridge = model_selection.cross_val_score(
161 | iterative_and_ridge, X, y, cv=10)
162 |
163 | scores_iterative_and_ridge
164 |
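# %%
# As a compact summary (just a reading convenience; the metric is
# scikit-learn's default R2 score for regressors), we can print the mean and
# standard deviation across folds:
print(f"{scores_iterative_and_ridge.mean():.3f} +/- "
      f"{scores_iterative_and_ridge.std():.3f}")
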
165 | # %%
166 | # **Computational cost**: One drawback of the IterativeImputer to keep in
167 | # mind is that its computational cost can become prohibitive on large
168 | # datasets (it scales poorly with dataset size).
169 |
170 | # %%
171 | # Mean imputation: SimpleImputer
172 | # -------------------------------
173 | #
174 | # We can try a simple imputer: imputation by the mean
175 | mean_imputer = impute.SimpleImputer()
176 |
177 | # %%
178 | # A quick visualization reveals a larger distortion of the distribution
179 | X_imputed = mean_imputer.fit_transform(X)
180 | plt.figure()
181 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5',
182 | label='All data', alpha=.5)
183 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X',
184 | label='Imputed')
185 | plt.colorbar(label='y')
186 |
187 | # %%
188 | # Evaluating in prediction pipeline
189 | mean_and_ridge = make_pipeline(impute.SimpleImputer(), RidgeCV())
190 | scores_mean_and_ridge = model_selection.cross_val_score(
191 | mean_and_ridge, X, y, cv=10)
192 |
193 | scores_mean_and_ridge
194 |
195 | # %%
196 | # Supervised learning without imputation
197 | # ----------------------------------------
198 | #
199 | # The HistGradientBoosting models are based on trees, which can
200 | # natively handle missing values, without requiring any imputation
201 | # of the input data
202 | from sklearn.ensemble import HistGradientBoostingRegressor
203 | score_hist_gradient_boosting = model_selection.cross_val_score(
204 | HistGradientBoostingRegressor(), X, y, cv=10)
205 |
206 | score_hist_gradient_boosting
207 |
208 | # %%
209 | # Recap: which pipeline predicts well on our small data?
210 | # -------------------------------------------------------
211 | #
212 | # Let's plot the scores to see things better
213 | import pandas as pd
214 | import seaborn as sns
215 |
216 | scores = pd.DataFrame({'Mean imputation + Ridge': scores_mean_and_ridge,
217 | 'IterativeImputer + Ridge': scores_iterative_and_ridge,
218 | 'HistGradientBoostingRegressor': score_hist_gradient_boosting,
219 | })
220 |
221 | sns.boxplot(data=scores, orient='h')
222 | plt.title('Prediction accuracy\n linear and small data\n'
223 | 'Missing Completely at Random')
224 | plt.tight_layout()
225 |
226 |
227 | # %%
228 | # Not much difference with the more sophisticated imputer. A more thorough
229 | # analysis would be necessary, with more cross-validation runs.
230 | #
231 | # Prediction performance with large datasets
232 | # -------------------------------------------
233 | #
234 | # Let us compare models in regimes where there is plenty of data
235 |
236 | X, y = generate_mcar(n_samples=20000)
237 |
238 | # %%
239 | # Iterative imputation and linear model
240 | scores_iterative_and_ridge= model_selection.cross_val_score(
241 | iterative_and_ridge, X, y, cv=10)
242 |
243 | # %%
244 | # Mean imputation and linear model
245 | scores_mean_and_ridge = model_selection.cross_val_score(
246 | mean_and_ridge, X, y, cv=10)
247 |
248 | # %%
249 | # And now the HistGradientBoostingRegressor, which does not need
250 | # imputation
251 | score_hist_gradient_boosting = model_selection.cross_val_score(
252 | HistGradientBoostingRegressor(), X, y, cv=10)
253 |
254 | # %%
255 | # We plot the results
256 | scores = pd.DataFrame({'Mean imputation + Ridge': scores_mean_and_ridge,
257 | 'IterativeImputer + Ridge': scores_iterative_and_ridge,
258 | 'HistGradientBoostingRegressor': score_hist_gradient_boosting,
259 | })
260 |
261 | sns.boxplot(data=scores, orient='h')
262 | plt.title('Prediction accuracy\n linear and large data\n'
263 | 'Missing Completely at Random')
264 | plt.tight_layout()
265 |
266 |
267 | # %%
268 | #
269 | # **When there is a reasonable amount of data, the
270 | # HistGradientBoostingRegressor is the best strategy** even for a linear
271 | # data-generating mechanism, in MAR settings, which are settings
272 | # favorable to imputation + linear model [#]_.
273 | #
274 | # .. [#] Even in the case of a linear data-generating mechanism, the
275 | #        optimal predictor on data imputed by a constant
276 | #        is a piecewise affine function with 2^d regions (
277 | #        http://proceedings.mlr.press/v108/morvan20a.html ). The
278 | #        larger the dimensionality (number of features), the harder an
279 | #        imperfect imputation is to approximate with a simple model.
280 | #
281 | # |
282 |
283 | # %%
284 | # Missing not at random: censoring
285 | # ======================================
286 | #
287 | # We now consider missing not at random settings, in particular
288 | # self-masking or censoring, where large values are more likely to be
289 | # missing.
290 | #
291 | # The missing-values mechanism
292 | # -----------------------------
293 |
294 | def generate_censored(n_samples, missing_rate=.4, rng=42):
295 | X, y = generate_without_missing_values(n_samples, rng=rng)
296 | if not isinstance(rng, np.random.RandomState):
297 | rng = np.random.RandomState(rng)
298 |
299 | B = rng.binomial(1, 2 * missing_rate, (n_samples, 2))
300 | M = (X > 0.5) * B
301 |
302 | np.putmask(X, M, np.nan)
303 |
304 | return X, y
305 |
306 | # %%
307 | # A quick plot to look at the data
308 | X, y = generate_censored(500, missing_rate=.4)
309 |
310 | plt.figure()
311 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5',
312 | label='All data')
313 | plt.scatter(X[:, 0], X[:, 1], c=y, label='Fully observed')
314 | plt.colorbar(label='y')
315 | plt.legend()
316 |
317 | # %%
318 | # Here the fully-observed data does not reflect the distribution
319 | # of all the data well at all.
320 |
321 | # %%
322 | # Imputation fails to recover the distribution
323 | # --------------------------------------------------------
324 | #
325 | # With MNAR data, off-the-shelf imputation methods do not recover the
326 | # initial distribution:
327 |
328 | iterative_imputer = impute.IterativeImputer()
329 | X_imputed = iterative_imputer.fit_transform(X)
330 |
331 | plt.figure()
332 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5',
333 | label='All data', alpha=.5)
334 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X',
335 | label='Imputed')
336 | plt.colorbar(label='y')
337 | plt.legend()
338 |
339 | # %%
340 | # Recovering the initial data distribution would need much more mass on
341 | # the right and the top of the figure. The imputed data is shifted to
342 | # lower values than the original data.
343 | #
344 | # Note also that as imputed values typically have lower X values than
345 | # their fully-observed counterparts, the association between X and y is
346 | # also distorted. This is visible as the imputed values appear as lighter
347 | # diagonal lines.
348 | #
349 | # An important consequence is that **the link between imputed X and y is no
350 | # longer linear**, although the original data-generating mechanism is
351 | # linear [#]_. For this reason, **it is often a good idea to use non-linear
352 | # learners in the presence of missing values**.
353 | #
354 | # .. [#] As mentioned above, even in the case of a linear
355 | # data-generating mechanism, imperfect imputation leads to complex
356 | # functions to link to y (
357 | # http://proceedings.mlr.press/v108/morvan20a.html )
358 |
359 | # %%
360 | # Predictive pipelines
361 | # -----------------------------
362 | #
363 | # Let us now evaluate predictive pipelines
364 | scores = dict()
365 |
366 | # Iterative imputation and linear model
367 | scores['IterativeImputer + Ridge'] = model_selection.cross_val_score(
368 | iterative_and_ridge, X, y, cv=10)
369 |
370 | # Mean imputation and linear model
371 | scores['Mean imputation + Ridge'] = model_selection.cross_val_score(
372 | mean_and_ridge, X, y, cv=10)
373 |
374 | # IterativeImputer and non-linear model
375 | iterative_and_gb = make_pipeline(impute.IterativeImputer(),
376 | HistGradientBoostingRegressor())
377 | scores['IterativeImputer\n+ HistGradientBoostingRegressor'] = model_selection.cross_val_score(
378 | iterative_and_gb, X, y, cv=10)
379 |
380 | # Mean imputation and non-linear model
381 | mean_and_gb = make_pipeline(impute.SimpleImputer(),
382 | HistGradientBoostingRegressor())
383 | scores['Mean imputation\n+ HistGradientBoostingRegressor'] = model_selection.cross_val_score(
384 | mean_and_gb, X, y, cv=10)
385 |
386 | # And now the HistGradientBoostingRegressor, without imputation
387 | scores['HistGradientBoostingRegressor'] = model_selection.cross_val_score(
388 | HistGradientBoostingRegressor(), X, y, cv=10)
389 |
390 | # We plot the results
391 | sns.boxplot(data=pd.DataFrame(scores), orient='h')
392 | plt.title('Prediction accuracy\n linear and small data\n'
393 | 'Missing not at Random')
394 | plt.tight_layout()
395 |
396 |
397 | # %%
398 | # We can see that the imputation is not the most important step of the
399 | # pipeline [#]_, rather **what is important is to use a powerful model**.
400 | # Here there is information in missingness (if a value is missing, it is
401 | # large), information that a model can use to predict better.
402 | #
403 | # .. [#] Note that there are fewer missing values in the example here
404 | # compared to the section above on MCAR, hence the absolute prediction
405 | # accuracies are not comparable.
406 |
407 | # %%
408 | # .. topic:: Prediction with missing values
409 | #
410 | # The data above are very simple: linear data-generating mechanism,
411 | # Gaussian, and low dimensional. Yet, they show the importance of using
412 | # non-linear models, in particular the HistGradientBoostingRegressor
413 | # which natively deals with missing values.
414 |
415 |
416 | # %%
417 | # Using a predictor for the fully-observed case
418 | # ==============================================
419 | #
420 | # Let us go back to the "easy" case of the missing completely at random
421 | # settings with plenty of data
422 | n_samples = 20000
423 |
424 | X, y = generate_mcar(n_samples, missing_rate=.5)
425 |
426 | # %%
427 | # Suppose we have been able to train a predictive model that works on
428 | # fully-observed data:
429 |
430 | X_full, y_full = generate_without_missing_values(n_samples)
431 | full_data_predictor = HistGradientBoostingRegressor()
432 | full_data_predictor.fit(X_full, y_full)
433 |
434 | model_selection.cross_val_score(full_data_predictor, X_full, y_full)
435 |
436 | # %%
437 | # The cross validation reveals that the predictor achieves an excellent
438 | # explained variance; it is a near-perfect predictor on fully observed
439 | # data
440 |
441 | # %%
442 | # Now we turn to data with missing values. Given that our data is MAR
443 | # (missing at random), we will use imputation to build a completed dataset
444 | # that looks like the fully-observed data
445 |
446 | iterative_imputer = impute.IterativeImputer()
447 | X_imputed = iterative_imputer.fit_transform(X)
448 |
449 | # %%
450 | # The full data predictor can be used on the imputed data
451 | from sklearn import metrics
452 | metrics.r2_score(y, full_data_predictor.predict(X_imputed))
453 |
454 | # %%
455 | # This prediction is not as good as on the full data, but this is
456 | # expected, as missing values lead to a loss of information. We can
457 | # compare it to a model trained to predict on data with missing values
458 |
459 | X_train, y_train = generate_mcar(n_samples, missing_rate=.5)
460 | na_predictor = HistGradientBoostingRegressor()
461 | na_predictor.fit(X_train, y_train)
462 |
463 | metrics.r2_score(y, na_predictor.predict(X))
464 |
465 | # %%
466 | # Applying a model valid on the full data to imputed data works almost
467 | # as well as a model trained for missing values. The small loss in
468 | # performance is because the imputation is imperfect.
469 |
470 | # %%
471 | # When the data generation is non-linear
472 | # ---------------------------------------
473 | #
474 | # We now modify the example above a bit to consider the situation where y
475 | # is a non-linear function of X
476 |
477 | X, y = generate_mcar(n_samples, missing_rate=.5)
478 | y = y ** 2
479 |
480 | # Train a predictive model that works on fully-observed data:
481 | X_full, y_full = generate_without_missing_values(n_samples)
482 | y_full = y_full ** 2
483 | full_data_predictor = HistGradientBoostingRegressor()
484 | full_data_predictor.fit(X_full, y_full)
485 |
486 | model_selection.cross_val_score(full_data_predictor, X_full, y_full)
487 |
488 | # %%
489 | # Once again, we have a near-perfect predictor on fully-observed data
490 | #
491 | # On data with missing values:
492 |
493 | iterative_imputer = impute.IterativeImputer()
494 | X_imputed = iterative_imputer.fit_transform(X)
495 |
496 | from sklearn import metrics
497 | metrics.r2_score(y, full_data_predictor.predict(X_imputed))
498 |
499 | # %%
500 | # The full-data predictor performs much worse here
501 | #
502 | # Now we use a model trained to predict on data with missing values
503 |
504 | X_train, y_train = generate_mcar(n_samples, missing_rate=.5)
505 | y_train = y_train ** 2
506 | na_predictor = HistGradientBoostingRegressor()
507 | na_predictor.fit(X_train, y_train)
508 |
509 | metrics.r2_score(y, na_predictor.predict(X))
510 |
511 | # %%
512 | # The model trained on data with missing values works significantly
513 | # better than the one that was optimal for the fully-observed data.
514 | #
515 | # **Only for a linear mechanism is the model on full data also optimal for
516 | # perfectly imputed data**. When the function linking X to y has
517 | # curvature, this curvature turns uncertainty resulting from missingness
518 | # into bias [#]_.
519 | #
520 | # .. [#] The detailed mathematical analysis of prediction after
521 | # imputation can be found here: https://arxiv.org/abs/2106.00311
522 | #
523 | # |
524 | #
525 | # ________
526 |
527 |
--------------------------------------------------------------------------------