├── .gitignore ├── LICENSE ├── Makefile ├── README.rst ├── _static ├── custom.css ├── piggy.svg └── scrolltoc.js ├── _templates └── globallinks.html ├── conf.py ├── index.rst ├── jupyter-lite.json ├── notes ├── 01_missing_values.py ├── 02_dirty_categories.py └── README.rst ├── publish.sh └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vim 132 | *~ 133 | *.swp 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2021, dirty-data-science 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | PYTHON = python3 6 | SPHINXOPTS = 7 | SPHINXBUILD = $(PYTHON) -m sphinx 8 | 9 | ALLSPHINXOPTS = -d build/doctrees $(SPHINXOPTS) . 10 | 11 | 12 | .PHONY: help clean html web pickle htmlhelp latex changes linkcheck zip 13 | 14 | all: html-noplot 15 | 16 | help: 17 | @echo "Please use \`make ' where is one of" 18 | @echo " html to make standalone HTML files" 19 | @echo " pickle to make pickle files (usable by e.g. sphinx-web)" 20 | @echo " htmlhelp to make HTML files and a HTML help project" 21 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 22 | @echo " pdf to make PDF from LaTeX, you can set PAPER=a4 or PAPER=letter" 23 | @echo " changes to make an overview over all changed/added/deprecated items" 24 | @echo " linkcheck to check all external links for integrity" 25 | @echo " install to upload to github the web pages" 26 | @echo " zip to create the zip file with examples and doc" 27 | 28 | clean: 29 | -rm -rf build/* 30 | -rm -rf intro/scipy/auto_examples/ intro/matplotlib/auto_examples/ intro/summary-exercises/auto_examples advanced/mathematical_optimization/auto_examples/ advanced/advanced_numpy/auto_examples/ advanced/image_processing/auto_examples advanced/scipy_sparse/auto_examples packages/3d_plotting/auto_examples packages/statistics/auto_examples/ packages/scikit-image/auto_examples/ packages/scikit-learn/auto_examples intro/numpy/auto_examples guide/auto_examples 31 | 32 | test: 33 | MATPLOTLIBRC=build_tools $(PYTHON) -m pytest --doctest-glob '*.rst' --ignore advanced/advanced_numpy/examples/myobject_test.py --ignore advanced/interfacing_with_c/numpy_c_api/test_cos_module_np.py --ignore advanced/interfacing_with_c/ctypes/cos_module.py --ignore advanced/interfacing_with_c/swig_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/cython_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/numpy_shared/test_cos_doubles.py 34 | 35 | test-stop-when-failing: 36 | MATPLOTLIBRC=build_tools $(PYTHON) -m pytest -x --doctest-glob '*.rst' --ignore advanced/advanced_numpy/examples/myobject_test.py --ignore 
advanced/interfacing_with_c/numpy_c_api/test_cos_module_np.py --ignore advanced/interfacing_with_c/ctypes/cos_module.py --ignore advanced/interfacing_with_c/swig_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/cython_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/cos_doubles.py --ignore advanced/interfacing_with_c/ctypes_numpy/test_cos_doubles.py --ignore advanced/interfacing_with_c/numpy_shared/test_cos_doubles.py 37 | 38 | html-noplot: 39 | $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) build/html 40 | @echo 41 | @echo "Build finished. The HTML pages are in build/html." 42 | 43 | html: 44 | mkdir -p build/html build/doctrees 45 | # This line makes the build a bit more lengthy, and the 46 | # the embedding of images more robust 47 | rm -rf build/html/_images 48 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html 49 | @echo 50 | @echo "Build finished. The HTML pages are in build/html." 51 | 52 | cleandoctrees: 53 | rm -rf build/doctrees 54 | 55 | pickle: 56 | mkdir -p build/pickle build/doctrees 57 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle 58 | @echo 59 | @echo "Build finished; now you can process the pickle files or run" 60 | @echo " sphinx-web build/pickle" 61 | @echo "to start the sphinx-web server." 62 | 63 | web: pickle 64 | 65 | linkcheck: 66 | mkdir -p build/linkcheck build/doctrees 67 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck 68 | @echo 69 | @echo "Link check complete; look for any errors in the above output " \ 70 | "or in build/linkcheck/output.txt." 71 | 72 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Machine-learning on dirty-data in Python: a tutorial 2 | ----------------------------------------------------- 3 | 4 | -------------------------------------------------------------------------------- /_static/custom.css: -------------------------------------------------------------------------------- 1 | /** Styling **************************************************************/ 2 | 3 | /* Main page title */ 4 | div.body h1 { 5 | text-align: center; 6 | font-size: 270%; 7 | color: #055100; 8 | margin-bottom: 1em; 9 | } 10 | 11 | /* Secondary sections title */ 12 | div.body h2 { 13 | color: #055100; 14 | margin-top: 2.5em; 15 | } 16 | 17 | div.body h3 { 18 | margin-top: 2em; 19 | } 20 | 21 | div.body h4 { 22 | margin-top: 1.5em; 23 | } 24 | 25 | div.body h3, div.body h4 { 26 | color: #000; 27 | } 28 | 29 | /* More horizontal space in pre (to fit 80char lines) */ 30 | div.body pre { 31 | padding: 7px 30px 7px 10px; 32 | } 33 | 34 | @media only screen and (min-width: 1200px) { 35 | div.body pre, .sphx-glr-script-out .highlight pre { 36 | margin-right: -30px; 37 | } 38 | } 39 | 40 | div.body dl.footnote { 41 | font-size: smaller; 42 | } 43 | 44 | /* Side hanging footnotes */ 45 | @media (min-width: 1500px) { 46 | div.body dl.footnote { 47 | position: absolute; 48 | right: 1ex; 49 | margin-top: -10ex; 50 | width: 17em; 51 | } 52 | } 53 | 54 | @media (min-width: 1650px) { 55 | div.body dl.footnote { 56 | width: 20em; 57 | } 58 | 59 | } 60 | 61 | @media (min-width: 1750px) { 62 | div.body dl.footnote { 63 | width: 25em; 64 | } 65 | 66 | } 67 | 68 | /** Sphinx gallery *******************************************************/ 69 | 70 | /* The download note on top of the examples */ 71 | div.sphx-glr-download-link-note { 72 | right: 2pt; 73 | top: 10pt; 74 | max-width: 
15ex; 75 | float: right; 76 | padding: 0px 1px 10px 10px; 77 | background-color: #F6F6F6; 78 | font-size: small; 79 | } 80 | 81 | /* Hide the download python button (the notebook is enough) */ 82 | div.sphx-glr-download-python { 83 | display: none; 84 | } 85 | 86 | /* The download block at the bottom */ 87 | div.sphx-glr-footer-example { 88 | background-color: #F6F6F6; 89 | border: solid 1px #CCC; 90 | padding: 0px 5px; 91 | } 92 | 93 | div.sphx-glr-download code.download { 94 | word-break: break-word; 95 | font-size: small; 96 | } 97 | 98 | 99 | /* Large screens */ 100 | @media only screen and (min-width: 1400px) { 101 | /* The download note on top of the examples */ 102 | div.sphx-glr-download-link-note { 103 | display: none; 104 | } 105 | 106 | /* The download block at the bottom */ 107 | div.sphx-glr-footer-example { 108 | position: absolute; 109 | right: 10px; 110 | max-width: 200px; 111 | top: 50px; 112 | } 113 | } 114 | 115 | @media only screen and (min-width: 1550px) { 116 | div.sphx-glr-footer-example { 117 | max-width: 300px; 118 | } 119 | } 120 | 121 | @media only screen and (min-width: 1750px) { 122 | div.sphx-glr-footer-example { 123 | max-width: 400px; 124 | } 125 | } 126 | 127 | div.sphx-glr-download-link-note p.admonition-title { 128 | display: none; 129 | } 130 | 131 | /* For screens larger than 1200px: alabaster default settings are that 132 | * the body is 940px wide and the sidebar 220px */ 133 | @media only screen and (min-width: 1200px) { 134 | div.sphx-glr-download-link-note { 135 | position: absolute; /* Overrides the float behavior */ 136 | } 137 | } 138 | 139 | div.sphx-glr-script-out, p.sphx-glr-script-out { 140 | margin-left: -3.5ex; 141 | } 142 | 143 | /** Sidebar **************************************************************/ 144 | 145 | /* Hide the title in the navigation bar in the index page */ 146 | /*div.sphinxsidebarwrapper li a { 147 | display: none; 148 | }*/ 149 | 150 | div.sphinxsidebarwrapper h3.toc-title { 151 | font-size: unset; 152 | font-style: italic; 153 | text-decoration: underline; 154 | margin-bottom: 0px; 155 | } 156 | 157 | div.sphinxsidebarwrapper ul:first-of-type { 158 | margin-top: 0px; 159 | } 160 | 161 | /* Undo for children */ 162 | div.sphinxsidebarwrapper li li a { 163 | display: block; 164 | } 165 | 166 | div.sphinxsidebarwrapper li li { 167 | margin: 1ex 0ex; 168 | } 169 | 170 | /* Online one level of titles */ 171 | div.sphinxsidebarwrapper li li ul { 172 | display: none; 173 | } 174 | 175 | /* Undo for other pages */ 176 | div.sphinxsidebarwrapper li.toctree-l1 a { 177 | display: block; 178 | } 179 | 180 | /* Less info in the navigation sidebar */ 181 | /*div.sphinxsidebarwrapper h3:first-of-type { 182 | display: none; 183 | }*/ 184 | 185 | /* Undo for childs of enclosing divs */ 186 | div.sphinxsidebarwrapper div h3:first-of-type { 187 | display: block; 188 | } 189 | 190 | /* The section links */ 191 | div.sphinxsidebarwrapper li a { 192 | display: block; 193 | margin-left: -2.5ex; 194 | padding-left: 2.5ex; 195 | } 196 | 197 | div.sphinxsidebarwrapper ul ul { 198 | list-style: circle; 199 | } 200 | 201 | div.sphinxsidebarwrapper li li a:hover { 202 | border-right: #076B00 solid 5px; 203 | background-color: #F6F6F6; 204 | } 205 | 206 | /* The "active" sections */ 207 | div.sphinxsidebarwrapper li li.preactive li ul { 208 | display: none; 209 | } 210 | 211 | div.sphinxsidebarwrapper li li.preactive li ul.active { 212 | display: block; 213 | } 214 | 215 | div.sphinxsidebarwrapper li li.preactive > a:first-child { 216 | 
border-right: #C1CEC1 solid 5px; 217 | } 218 | 219 | div.sphinxsidebarwrapper li ul.active ul { 220 | display: block; 221 | font-size: smaller; 222 | } 223 | 224 | div.sphinxsidebarwrapper li li a.active { 225 | border-right: #055100 solid 5px; 226 | background-color: #F3F3F3; 227 | text-decoration: none; 228 | } 229 | 230 | div.sphinxsidebarwrapper > ul > li.preactive li ul { 231 | display: none; 232 | } 233 | 234 | div.sphinxsidebarwrapper > ul > li.preactive li ul.active { 235 | display: block; 236 | } 237 | 238 | div.sphinxsidebarwrapper li li.preactive ul { 239 | display: block; 240 | } 241 | 242 | div.sphinxsidebarwrapper li li a.active + ul { 243 | display: block; 244 | font-size: smaller; 245 | } 246 | 247 | div.sphinxsidebarwrapper li li ul.active { 248 | display: block; 249 | } 250 | 251 | div.sphinxsidebarwrapper li li ul { 252 | font-size: smaller; 253 | } 254 | 255 | 256 | /* Flush the sidebar more to the left */ 257 | @media (min-width: 1300px) { 258 | div.sphinxsidebar { 259 | left: 5%; 260 | width: 300px; 261 | } 262 | 263 | div.sphinxsidebar p.logo { 264 | width: 220px; 265 | } 266 | } 267 | 268 | @media (min-width: 1500px) { 269 | div.sphinxsidebar { 270 | left: 10%; 271 | } 272 | } 273 | 274 | /*************************************************************************/ 275 | 276 | /* My custom classes */ 277 | p.right-align { 278 | float: right; 279 | } 280 | -------------------------------------------------------------------------------- /_static/piggy.svg: -------------------------------------------------------------------------------- 1 | image/svg+xml 2 | -------------------------------------------------------------------------------- /_static/scrolltoc.js: -------------------------------------------------------------------------------- 1 | function updateSideBarPosition(sections) { 2 | var pos = $(window).scrollTop(); 3 | 4 | // Highlight the current section 5 | i = 0; 6 | current_section = 0; 7 | $('a.internal').removeClass('active'); 8 | $('ul.active').removeClass('active'); 9 | $('li.preactive').removeClass('preactive'); 10 | for(i in sections) { 11 | if(sections[i] > pos) { 12 | break 13 | } 14 | console.log(i); 15 | current_section = i 16 | if($('a.internal[href$="' + i + '"]').is(':visible')){ 17 | current_section = i 18 | } 19 | } 20 | $('a.internal[href$="' + current_section + '"]').addClass('active'); 21 | $('a.internal[href$="' + current_section + '"]').parent().parent().addClass('active') 22 | $('a.internal[href$="' + current_section + '"]').parent().parent().parent().addClass('preactive') 23 | $('a.internal[href$="' + current_section + '"]').parent().parent().parent().parent().parent().addClass('preactive') 24 | } 25 | 26 | $(function() { 27 | sections = {}; 28 | url = document.URL.replace(/#.*$/, ""); 29 | 30 | // Grab positions of our sections 31 | $('.headerlink').each(function(){ 32 | sections[this.href.replace(url, '')] = $(this).offset().top - 150 33 | }); 34 | 35 | updateSideBarPosition(sections); 36 | $(window).scroll(function(event) { 37 | updateSideBarPosition(sections) 38 | }); 39 | 40 | $(window).resize(function(event) { 41 | updateSideBarPosition(sections) 42 | }); 43 | }); 44 | 45 | -------------------------------------------------------------------------------- /_templates/globallinks.html: -------------------------------------------------------------------------------- 1 |

{{ _('Navigation') }} 2 |
3 | 8 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its 4 | # containing dir. 5 | # 6 | # Note that not all possible configuration values are present in this 7 | # autogenerated file. 8 | # 9 | # All configuration values have a default; values that are commented out 10 | # serve to show the default. 11 | 12 | # If extensions (or modules to document with autodoc) are in another directory, 13 | # add these directories to sys.path here. If the directory is relative to the 14 | # documentation root, use os.path.abspath to make it absolute, like shown here. 15 | # 16 | import os 17 | 18 | 19 | # -- General configuration ------------------------------------------------ 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | # 23 | # needs_sphinx = '1.0' 24 | 25 | # Add any Sphinx extension module names here, as strings. They can be 26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 27 | # ones. 28 | extensions = ['sphinx.ext.autodoc', 29 | 'sphinx.ext.doctest', 30 | 'sphinx.ext.intersphinx', 31 | 'sphinx.ext.mathjax', 32 | 'sphinx.ext.viewcode', 33 | 'sphinx.ext.githubpages', 34 | 'sphinx_gallery.gen_gallery', 35 | 'jupyterlite_sphinx', 36 | ] 37 | 38 | try: 39 | import sphinxext.opengraph 40 | extensions.append('sphinxext.opengraph') 41 | except ImportError: 42 | print("ERROR: sphinxext.opengraph import failed") 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 57 | project = u'Dirty data science' 58 | author = u'Gaël Varoquaux' 59 | copyright = u'2021, ' + author 60 | 61 | # The version info for the project you're documenting, acts as replacement for 62 | # |version| and |release|, also used in various other places throughout the 63 | # built documents. 64 | # 65 | version = '2021.1' 66 | # The full version, including alpha/beta/rc tags. 67 | release = version 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 85 | todo_include_todos = False 86 | 87 | 88 | # -- Options for HTML output ---------------------------------------------- 89 | 90 | # The theme to use for HTML and HTML Help pages. See the documentation for 91 | # a list of builtin themes. 
92 | # 93 | html_theme = 'alabaster' 94 | # Doc: https://alabaster.readthedocs.io/en/latest/customization.html 95 | 96 | html_sidebars = { 97 | '**': [ 98 | 'about.html', 99 | #'globallinks.html', 100 | 'localtoc.html', 101 | 'relations.html', 102 | #'searchbox.html', 103 | ], 104 | 'index': [ 105 | 'about.html', 106 | 'localtoc.html', 107 | 'relations.html', 108 | #'searchbox.html', 109 | ] 110 | } 111 | 112 | # Theme options are theme-specific and customize the look and feel of a theme 113 | # further. For a list of options available for each theme, see the 114 | # documentation. 115 | # 116 | html_theme_options = { 117 | 'logo': 'piggy.svg', 118 | 'github_user': 'dirty-data-science', 119 | 'github_repo': 'python', 120 | 'github_button': 'true', 121 | 'github_type': 'star', 122 | 'github_count': 'true', 123 | 'show_powered_by': 'false', 124 | 'logo_name': 'true', 125 | 'gray_1': "#030", 126 | 'gray_2': "#F1FFF1", 127 | 'link': "#076B00", 128 | # 'gray_3': "#090", 129 | 'fixed_sidebar': 'true', 130 | 'note_bg': "rgb(246, 248, 250);", 131 | #'topic_bg': "rgb(246, 248, 250);", 132 | } 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 137 | html_static_path = ['_static'] 138 | 139 | 140 | # Modify the title, so as to get good social-media links 141 | html_title = "— Dirty data science" 142 | 143 | 144 | # Configuration for intersphinx 145 | intersphinx_mapping = { 146 | 'python': ('https://docs.python.org/3/', None), 147 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 148 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 149 | 'matplotlib': ('https://matplotlib.org/', None), 150 | 'sklearn': ('https://scikit-learn.org/stable', None), 151 | 'skimage': ('http://scikit-image.org/docs/stable/', None), 152 | 'mayavi': ('http://docs.enthought.com/mayavi/mayavi/', None), 153 | 'statsmodels': ('http://www.statsmodels.org/stable/', None), 154 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', None), 155 | 'seaborn': ('http://seaborn.pydata.org/', None), 156 | 'skrub': ('https://skrub-data.org/stable/', None), 157 | } 158 | 159 | 160 | # -- sphinx-gallery configuration ----------------------------------------- 161 | from sphinx_gallery.sorting import FileNameSortKey 162 | sphinx_gallery_conf = { 163 | 'filename_pattern': '', 164 | 'backreferences_dir': os.path.join('generated'), 165 | 'reference_url': { 166 | # 'dirty_cat': 'https://dirty-cat.github.io/stable/', 167 | 'numpy': 'http://docs.scipy.org/doc/numpy', 168 | # 'scipy': 'http://docs.scipy.org/doc/scipy/reference', 169 | # 'pandas': 'http://pandas.pydata.org/pandas-docs/stable', 170 | # 'seaborn': 'http://seaborn.pydata.org/', 171 | 'matplotlib': 'http://matplotlib.org/stable', 172 | 'sklearn': 'http://scikit-learn.org/stable', 173 | # #'scikit-image': 'http://scikit-image.org/docs/stable/', 174 | # #'mayavi': 'http://docs.enthought.com/mayavi/mayavi/', 175 | #'statsmodels': 'http://www.statsmodels.org/stable/', 176 | }, 177 | 'examples_dirs':'notes', 178 | 'gallery_dirs':'gen_notes', 179 | 'within_subsection_order': FileNameSortKey, 180 | 'download_all_examples': False, 181 | 'binder': { 182 | 'org': 'dirty-data-science', 183 | 'repo': 'python', 184 | 'binderhub_url': 'https://mybinder.org', 185 | 'branch': 'gh-pages', 186 | 'dependencies': ['requirements.txt',], 187 | 'notebooks_dir': 'notes' 188 | }, 189 | 
'jupyterlite': { 190 | 'use_jupyter_lab': False, 191 | }, 192 | "inspect_global_variables": False, 193 | } 194 | 195 | # -- sphinxext.opengraph configuration ------------------------------------- 196 | ogp_site_url = "https://dirtydata.science/python" 197 | ogp_image = "https://dirtydata.science/python/_static/piggy.svg" 198 | ogp_use_first_image = True 199 | ogp_site_name = "Dirty Data Science" 200 | 201 | 202 | # -- The javascript to highlight the toc as we scroll ---------------------- 203 | html_js_files = ['scrolltoc.js'] 204 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | ===================================================== 2 | Machine-learning on dirty data in Python: a tutorial 3 | ===================================================== 4 | 5 | Often in data science, machine-learning applications spend a significant 6 | energy preparing, tidying, and cleaning the data before the machine 7 | learning. 8 | 9 | Here we give a set of Python tutorials on how some of these operations 10 | can be simplified with adequate machine-learning tools. 11 | 12 | .. include:: gen_notes/index.rst 13 | :start-line: 2 14 | :end-before: .. rst-class:: sphx-glr-signature 15 | 16 | -------------------------------------------------------------------------------- /jupyter-lite.json: -------------------------------------------------------------------------------- 1 | { 2 | "jupyter-lite-schema-version": 0, 3 | "jupyter-config-data": { 4 | "disabledExtensions": [ 5 | "@jupyterlab/drawio-extension", 6 | "jupyterlab-kernel-spy", 7 | "jupyterlab-tour" 8 | ], 9 | "litePluginSettings": { 10 | "@jupyterlite/pyodide-kernel-extension:kernel": { 11 | "pyodideUrl": "https://cdn.jsdelivr.net/pyodide/v0.23.1/full/pyodide.js" 12 | } 13 | } 14 | } 15 | } 16 | 17 | -------------------------------------------------------------------------------- /notes/01_missing_values.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================= 3 | Machine learning with missing values 4 | ========================================= 5 | 6 | Here we use simulated data to understanding the fundamentals of statistical 7 | learning with missing values. 8 | 9 | This notebook reveals why a HistGradientBoostingRegressor ( 10 | :class:`sklearn.ensemble.HistGradientBoostingRegressor` ) is a good choice to 11 | predict with missing values. 12 | 13 | We use simulations to control the missing-value mechanism, and inspect 14 | it's impact on predictive models. In particular, standard imputation 15 | procedures can reconstruct missing values without distortion only if the 16 | data is *missing at random*. 17 | 18 | A good introduction to the mathematics behind this notebook can be found in 19 | https://arxiv.org/abs/1902.06931 20 | 21 | .. topic:: **Missing values in categorical data** 22 | 23 | If a categorical column has missing values, the simplest approach is 24 | to create a specific category "missing" and assign missing values to 25 | this new category, to represent missingness in the classifier. 26 | Indeed, as we will see, imputation is not crucial for prediction. 27 | In the following we focus on continuous columns, where the discrete 28 | nature of a missing value poses more problems. 
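
For instance (a minimal sketch added as an aside, not part of the original
text), the "missing" category trick for a string column can be written with
scikit-learn's :class:`sklearn.impute.SimpleImputer`::

    from sklearn.impute import SimpleImputer
    # Replace NaNs in a string column by the explicit label "missing"
    cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')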
29 | 30 | """ 31 | 32 | 33 | # %% 34 | # The fully-observed data: a toy regression problem 35 | # ================================================== 36 | # 37 | # We consider a simple regression problem where X (the data) is bivariate 38 | # gaussian, and y (the prediction target) is a linear function of the first 39 | # coordinate, with noise. 40 | # 41 | # The data-generating mechanism 42 | # ------------------------------ 43 | 44 | import numpy as np 45 | 46 | def generate_without_missing_values(n_samples, rng=42): 47 | mean = [0, 0] 48 | cov = [[1, 0.9], [0.9, 1]] 49 | if not isinstance(rng, np.random.RandomState): 50 | rng = np.random.RandomState(rng) 51 | X = rng.multivariate_normal(mean, cov, size=n_samples) 52 | 53 | epsilon = 0.1 * rng.randn(n_samples) 54 | y = X[:, 0] + epsilon 55 | 56 | return X, y 57 | 58 | # %% 59 | # A quick plot reveals what the data looks like 60 | 61 | import matplotlib.pyplot as plt 62 | plt.rcParams['figure.figsize'] = (5, 4) # Smaller default figure size 63 | 64 | plt.figure() 65 | X_full, y_full = generate_without_missing_values(1000) 66 | plt.scatter(X_full[:, 0], X_full[:, 1], c=y_full) 67 | plt.colorbar(label='y') 68 | 69 | # %% 70 | # Missing completely at random settings 71 | # ====================================== 72 | # 73 | # We now consider missing completely at random settings (a special case 74 | # of missing at random): the missingness is completely independent from 75 | # the values. 76 | # 77 | # The missing-values mechanism 78 | # ----------------------------- 79 | 80 | def generate_mcar(n_samples, missing_rate=.5, rng=42): 81 | X, y = generate_without_missing_values(n_samples, rng=rng) 82 | if not isinstance(rng, np.random.RandomState): 83 | rng = np.random.RandomState(rng) 84 | 85 | M = rng.binomial(1, missing_rate, (n_samples, 2)) 86 | np.putmask(X, M, np.nan) 87 | 88 | return X, y 89 | 90 | # %% 91 | # A quick plot to look at the data 92 | X, y = generate_mcar(500) 93 | 94 | plt.figure() 95 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', label='All data') 96 | plt.colorbar(label='y') 97 | plt.scatter(X[:, 0], X[:, 1], c=y, label='Fully observed') 98 | plt.legend() 99 | 100 | # %% 101 | # We can see that the distribution of the fully-observed data is the same 102 | # than that of the original data 103 | # 104 | # Conditional Imputation with the IterativeImputer 105 | # ------------------------------------------------ 106 | # 107 | # As the data is MAR (missing at random), an imputer can use the 108 | # conditional dependencies between the observed and the missing values to 109 | # impute the missing values. 
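#
# As a quick sanity check (an added aside, not part of the original notebook),
# we can verify that roughly half of the entries are indeed missing, matching
# the ``missing_rate=.5`` default of ``generate_mcar``:
print("Fraction of missing entries per column:", np.isnan(X).mean(axis=0))

# %%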
110 | # 111 | # We'll use the IterativeImputer, a good imputer, but it needs to be enabled 112 | from sklearn.experimental import enable_iterative_imputer 113 | from sklearn import impute 114 | iterative_imputer = impute.IterativeImputer() 115 | 116 | # %% 117 | # Let us try the imputer on the small data used to visualize 118 | # 119 | # **The imputation is learned by fitting the imputer on the data with 120 | # missing values** 121 | iterative_imputer.fit(X) 122 | 123 | # %% 124 | # **The data are imputed with the transform method** 125 | X_imputed = iterative_imputer.transform(X) 126 | 127 | # %% 128 | # We can display the imputed data as our previous visualization 129 | plt.figure() 130 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', 131 | label='All data', alpha=.5) 132 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X', 133 | label='Imputed') 134 | plt.colorbar(label='y') 135 | plt.legend() 136 | 137 | # %% 138 | # We can see that the imputer did a fairly good job of recovering the 139 | # data distribution 140 | # 141 | # Supervised learning: imputation and a linear model 142 | # ----------------------------------------------------------- 143 | # 144 | # Given that the relationship between the fully-observed X and y is a 145 | # linear relationship, it seems natural to use a linear model for 146 | # prediction. It must be adapted to missing values using imputation. 147 | # 148 | # To use it in supervised setting, we will pipeline it with a linear 149 | # model, using a ridge, which is a good default linear model 150 | from sklearn.pipeline import make_pipeline 151 | from sklearn.linear_model import RidgeCV 152 | 153 | iterative_and_ridge = make_pipeline(impute.IterativeImputer(), RidgeCV()) 154 | 155 | # %% 156 | # We can evaluate the model performance in a cross-validation loop 157 | # (for better evaluation accuracy, we increase slightly the number of 158 | # folds to 10) 159 | from sklearn import model_selection 160 | scores_iterative_and_ridge = model_selection.cross_val_score( 161 | iterative_and_ridge, X, y, cv=10) 162 | 163 | scores_iterative_and_ridge 164 | 165 | # %% 166 | # **Computational cost**: One drawback of the IterativeImputer to keep in 167 | # mind is that its computational cost can become prohibitive of large 168 | # datasets (it has a bad computation scalability). 
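
# %%
# To get a rough feel for this cost (an added aside; timings are indicative
# only and depend on the machine), we can time the imputer's fit on
# increasing sample sizes:
from time import perf_counter

for n in (500, 2000, 8000):
    X_n, _ = generate_mcar(n)
    tic = perf_counter()
    impute.IterativeImputer().fit(X_n)
    print(f"n_samples={n}: fit in {perf_counter() - tic:.2f}s")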
169 | 170 | # %% 171 | # Mean imputation: SimpleImputer 172 | # ------------------------------- 173 | # 174 | # We can try a simple imputer: imputation by the mean 175 | mean_imputer = impute.SimpleImputer() 176 | 177 | # %% 178 | # A quick visualization reveals a larger disortion of the distribution 179 | X_imputed = mean_imputer.fit_transform(X) 180 | plt.figure() 181 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', 182 | label='All data', alpha=.5) 183 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X', 184 | label='Imputed') 185 | plt.colorbar(label='y') 186 | 187 | # %% 188 | # Evaluating in prediction pipeline 189 | mean_and_ridge = make_pipeline(impute.SimpleImputer(), RidgeCV()) 190 | scores_mean_and_ridge = model_selection.cross_val_score( 191 | mean_and_ridge, X, y, cv=10) 192 | 193 | scores_mean_and_ridge 194 | 195 | # %% 196 | # Supervised learning without imputation 197 | # ---------------------------------------- 198 | # 199 | # The HistGradientBoosting models are based on trees, which can be 200 | # adapted to model directly missing values 201 | from sklearn.experimental import enable_hist_gradient_boosting 202 | from sklearn.ensemble import HistGradientBoostingRegressor 203 | score_hist_gradient_boosting = model_selection.cross_val_score( 204 | HistGradientBoostingRegressor(), X, y, cv=10) 205 | 206 | score_hist_gradient_boosting 207 | 208 | # %% 209 | # Recap: which pipeline predicts well on our small data? 210 | # ------------------------------------------------------- 211 | # 212 | # Let's plot the scores to see things better 213 | import pandas as pd 214 | import seaborn as sns 215 | 216 | scores = pd.DataFrame({'Mean imputation + Ridge': scores_mean_and_ridge, 217 | 'IterativeImputer + Ridge': scores_iterative_and_ridge, 218 | 'HistGradientBoostingRegressor': score_hist_gradient_boosting, 219 | }) 220 | 221 | sns.boxplot(data=scores, orient='h') 222 | plt.title('Prediction accuracy\n linear and small data\n' 223 | 'Missing Completely at Random') 224 | plt.tight_layout() 225 | 226 | 227 | # %% 228 | # Not much difference with the more sophisticated imputer. A more thorough 229 | # analysis would be necessary, with more cross-validation runs. 
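
# %%
# (An added aside) The same comparison summarized numerically, as the mean and
# standard deviation of the cross-validated scores (R2, the default metric of
# ``cross_val_score`` for regressors):
print(scores.agg(['mean', 'std']).round(3).T)

# %%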
230 | # 231 | # Prediction performance with large datasets 232 | # ------------------------------------------- 233 | # 234 | # Let us compare models in regimes where there is plenty of data 235 | 236 | X, y = generate_mcar(n_samples=20000) 237 | 238 | # %% 239 | # Iterative imputation and linear model 240 | scores_iterative_and_ridge = model_selection.cross_val_score( 241 | iterative_and_ridge, X, y, cv=10) 242 | 243 | # %% 244 | # Mean imputation and linear model 245 | scores_mean_and_ridge = model_selection.cross_val_score( 246 | mean_and_ridge, X, y, cv=10) 247 | 248 | # %% 249 | # And now the HistGradientBoostingRegressor, which does not need 250 | # imputation 251 | score_hist_gradient_boosting = model_selection.cross_val_score( 252 | HistGradientBoostingRegressor(), X, y, cv=10) 253 | 254 | # %% 255 | # We plot the results 256 | scores = pd.DataFrame({'Mean imputation + Ridge': scores_mean_and_ridge, 257 | 'IterativeImputer + Ridge': scores_iterative_and_ridge, 258 | 'HistGradientBoostingRegressor': score_hist_gradient_boosting, 259 | }) 260 | 261 | sns.boxplot(data=scores, orient='h') 262 | plt.title('Prediction accuracy\n linear and large data\n' 263 | 'Missing Completely at Random') 264 | plt.tight_layout() 265 | 266 | 267 | # %% 268 | # 269 | # **When there is a reasonable amount of data, the 270 | # HistGradientBoostingRegressor is the best strategy** even for a linear 271 | # data-generating mechanism, in MAR settings, which are settings 272 | # favorable to imputation + linear model [#]_. 273 | # 274 | # .. [#] Even in the case of a linear data-generating mechanism, the 275 | # optimal prediction on data imputed by a constant 276 | # is a piecewise affine function with 2^d regions ( 277 | # http://proceedings.mlr.press/v108/morvan20a.html ). The 278 | # larger the dimensionality (number of features), the harder an 279 | # imperfect imputation is to approximate with a simple model. 280 | # 281 | # | 282 | 283 | # %% 284 | # Missing not at random: censoring 285 | # ====================================== 286 | # 287 | # We now consider missing not at random settings, in particular 288 | # self-masking or censoring, where large values are more likely to be 289 | # missing.
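# In formulas (an added aside): writing :math:`M` for the missingness mask and
# :math:`X = (X_{obs}, X_{mis})` for the complete data, MCAR means
# :math:`P(M \mid X) = P(M)`, MAR means :math:`P(M \mid X) = P(M \mid X_{obs})`,
# while under MNAR :math:`P(M \mid X)` may depend on the unobserved values
# :math:`X_{mis}` themselves, as in the censoring mechanism below.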
290 | # 291 | # The missing-values mechanism 292 | # ----------------------------- 293 | 294 | def generate_censored(n_samples, missing_rate=.4, rng=42): 295 | X, y = generate_without_missing_values(n_samples, rng=rng) 296 | if not isinstance(rng, np.random.RandomState): 297 | rng = np.random.RandomState(rng) 298 | 299 | B = rng.binomial(1, 2 * missing_rate, (n_samples, 2)) 300 | M = (X > 0.5) * B 301 | 302 | np.putmask(X, M, np.nan) 303 | 304 | return X, y 305 | 306 | # %% 307 | # A quick plot to look at the data 308 | X, y = generate_censored(500, missing_rate=.4) 309 | 310 | plt.figure() 311 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', 312 | label='All data') 313 | plt.colorbar(label='y') 314 | plt.scatter(X[:, 0], X[:, 1], c=y, label='Fully observed') 315 | plt.legend() 316 | 317 | # %% 318 | # Here the full-observed data does not reflect well at all the 319 | # distribution of all the data 320 | 321 | # %% 322 | # Imputation fails to recover the distribution 323 | # -------------------------------------------------------- 324 | # 325 | # With MNAR data, off-the-shelf imputation methods do not recover the 326 | # initial distribution: 327 | 328 | iterative_imputer = impute.IterativeImputer() 329 | X_imputed = iterative_imputer.fit_transform(X) 330 | 331 | plt.figure() 332 | plt.scatter(X_full[:, 0], X_full[:, 1], color='.8', ec='.5', 333 | label='All data', alpha=.5) 334 | plt.scatter(X_imputed[:, 0], X_imputed[:, 1], c=y, marker='X', 335 | label='Imputed') 336 | plt.colorbar(label='y') 337 | plt.legend() 338 | 339 | # %% 340 | # Recovering the initial data distribution would need much more mass on 341 | # the right and the top of the figure. The imputed data is shifted to 342 | # lower values than the original data. 343 | # 344 | # Note also that as imputed values typically have lower X values than 345 | # their full-observed counterparts, the association between X and y is 346 | # also distorted. This is visible as the imputed values appear as lighter 347 | # diagonal lines. 348 | # 349 | # An important consequence is that **the link between imputed X and y is no 350 | # longer linear**, although the original data-generating mechanism is 351 | # linear [#]_. For this reason, **it is often a good idea to use non-linear 352 | # learners in the presence of missing values**. 353 | # 354 | # .. 
[#] As mentionned above, even in the case of a linear 355 | # data-generating mechanism, imperfect imputation leads to complex 356 | # functions to link to y ( 357 | # http://proceedings.mlr.press/v108/morvan20a.html ) 358 | 359 | # %% 360 | # Predictive pipelines 361 | # ----------------------------- 362 | # 363 | # Let us now evaluate predictive pipelines 364 | scores = dict() 365 | 366 | # Iterative imputation and linear model 367 | scores['IterativeImputer + Ridge'] = model_selection.cross_val_score( 368 | iterative_and_ridge, X, y, cv=10) 369 | 370 | # Mean imputation and linear model 371 | scores['Mean imputation + Ridge'] = model_selection.cross_val_score( 372 | mean_and_ridge, X, y, cv=10) 373 | 374 | # IterativeImputer and non-linear model 375 | iterative_and_gb = make_pipeline(impute.IterativeImputer(), 376 | HistGradientBoostingRegressor()) 377 | scores['Mean imputation\n+ HistGradientBoostingRegressor'] = model_selection.cross_val_score( 378 | iterative_and_gb, X, y, cv=10) 379 | 380 | # Mean imputation and non-linear model 381 | mean_and_gb = make_pipeline(impute.SimpleImputer(), 382 | HistGradientBoostingRegressor()) 383 | scores['IterativeImputer\n+ HistGradientBoostingRegressor'] = model_selection.cross_val_score( 384 | mean_and_gb, X, y, cv=10) 385 | 386 | # And now the HistGradientBoostingRegressor, whithout imputation 387 | scores['HistGradientBoostingRegressor'] = model_selection.cross_val_score( 388 | HistGradientBoostingRegressor(), X, y, cv=10) 389 | 390 | # We plot the results 391 | sns.boxplot(data=pd.DataFrame(scores), orient='h') 392 | plt.title('Prediction accuracy\n linear and small data\n' 393 | 'Missing not at Random') 394 | plt.tight_layout() 395 | 396 | 397 | # %% 398 | # We can see that the imputation is not the most important step of the 399 | # pipeline [#]_, rather **what is important is to use a powerful model**. 400 | # Here there is information in missingness (if a value is missing, it is 401 | # large), information that a model can use to predict better. 402 | # 403 | # .. [#] Note that there are less missing values in the example here 404 | # compared to the section above on MCAR, hence the absolute prediction 405 | # accuracies are not comparable. 406 | 407 | # %% 408 | # .. topic:: Prediction with missing values 409 | # 410 | # The data above are very simple: linear data-generating mechanism, 411 | # Gaussian, and low dimensional. Yet, they show the importance of using 412 | # non-linear models, in particular the HistGradientBoostingRegressor 413 | # which natively deals with missing values. 414 | 415 | 416 | # %% 417 | # Using a predictor for the fully-observed case 418 | # ============================================== 419 | # 420 | # Let us go back to the "easy" case of the missing completely at random 421 | # settings with plenty of data 422 | n_samples = 20000 423 | 424 | X, y = generate_mcar(n_samples, missing_rate=.5) 425 | 426 | # %% 427 | # Suppose we have been able to train a predictive model that works on 428 | # fully-observed data: 429 | 430 | X_full, y_full = generate_without_missing_values(n_samples) 431 | full_data_predictor = HistGradientBoostingRegressor() 432 | full_data_predictor.fit(X_full, y_full) 433 | 434 | model_selection.cross_val_score(full_data_predictor, X_full, y_full) 435 | 436 | # %% 437 | # The cross validation reveals that the predictor achieves an excellent 438 | # explained variance; it is a near-perfect predictor on fully observed 439 | # data 440 | 441 | # %% 442 | # Now we turn to data with missing values. 
Given that our data is MAR 443 | # (missing at random), we will use imputation to build a completed data 444 | # that looks like the full-observed data 445 | 446 | iterative_imputer = impute.IterativeImputer() 447 | X_imputed = iterative_imputer.fit_transform(X) 448 | 449 | # %% 450 | # The full data predictor can be used on the imputed data 451 | from sklearn import metrics 452 | metrics.r2_score(y, full_data_predictor.predict(X_imputed)) 453 | 454 | # %% 455 | # This prediction is less good than on the full data, but this is 456 | # expected, as missing values lead to a loss of information. We can 457 | # compare it to a model trained to predict on data with missing values 458 | 459 | X_train, y_train = generate_mcar(n_samples, missing_rate=.5) 460 | na_predictor = HistGradientBoostingRegressor() 461 | na_predictor.fit(X_train, y_train) 462 | 463 | metrics.r2_score(y, na_predictor.predict(X)) 464 | 465 | # %% 466 | # Applying a model valid on the full data to imputed data work almost 467 | # as well as a model trained for missing values. The small loss in 468 | # performance is because the imputation is imperfect. 469 | 470 | # %% 471 | # When the data-generation is non linear 472 | # --------------------------------------- 473 | # 474 | # We now modify a bit the example above to consider the situation where y 475 | # is a non-linear function of X 476 | 477 | X, y = generate_mcar(n_samples, missing_rate=.5) 478 | y = y ** 2 479 | 480 | # Train a predictive model that works on fully-observed data: 481 | X_full, y_full = generate_without_missing_values(n_samples) 482 | y_full = y_full ** 2 483 | full_data_predictor = HistGradientBoostingRegressor() 484 | full_data_predictor.fit(X_full, y_full) 485 | 486 | model_selection.cross_val_score(full_data_predictor, X_full, y_full) 487 | 488 | # %% 489 | # Once again, we have a near-perfect predictor on fully-observed data 490 | # 491 | # On data with missing values: 492 | 493 | iterative_imputer = impute.IterativeImputer() 494 | X_imputed = iterative_imputer.fit_transform(X) 495 | 496 | from sklearn import metrics 497 | metrics.r2_score(y, full_data_predictor.predict(X_imputed)) 498 | 499 | # %% 500 | # The full-data predictor works much less well 501 | # 502 | # Now we use a model trained to predict on data with missing values 503 | 504 | X_train, y_train = generate_mcar(n_samples, missing_rate=.5) 505 | y_train = y_train ** 2 506 | na_predictor = HistGradientBoostingRegressor() 507 | na_predictor.fit(X_train, y_train) 508 | 509 | metrics.r2_score(y, na_predictor.predict(X)) 510 | 511 | # %% 512 | # The model trained on data with missing values works significantly 513 | # better than that was optimal for the fully-observed data. 514 | # 515 | # **Only for linear mechanism is the model on full data also optimal for 516 | # perfectly imputed data**. When the function linking X to y has 517 | # curvature, this curvature turns uncertainty resulting from missingness 518 | # into bias [#]_. 519 | # 520 | # .. 
[#] The detailed mathematical analysis of prediction after 521 | # imputation can be found here: https://arxiv.org/abs/2106.00311 522 | # 523 | # | 524 | # 525 | # ________ 526 | 527 | -------------------------------------------------------------------------------- /notes/02_dirty_categories.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================================== 3 | Dirty categories: learning with non-normalized strings 4 | ======================================================== 5 | 6 | Including strings that represent categories often calls for much data 7 | preparation. In particular, categories may appear with many morphological 8 | variants when they have been manually input or assembled from diverse 9 | sources. 10 | 11 | Including such a column in a learning pipeline as a standard categorical 12 | column leads to categories with very high cardinalities and can lose 13 | information on which categories are similar. 14 | 15 | Here we look at a dataset on wages [#]_ where the column *Employee 16 | Position Title* contains dirty categories. 17 | 18 | .. [#] https://catalog.data.gov/dataset/employee-salaries-2016 19 | 20 | We compare different categorical encodings for the dirty 21 | column to predict the *Current Annual Salary*, 22 | using gradient boosted trees. For this purpose, we use the skrub 23 | library ( https://skrub-data.org ). 24 | 25 | """ 26 | 27 | # %% 28 | # 29 | # .. |SV| replace:: 30 | # :class:`~skrub.TableVectorizer` 31 | # 32 | # .. |tabular_learner| replace:: 33 | # :func:`~skrub.tabular_learner` 34 | # 35 | # .. |OneHotEncoder| replace:: 36 | # :class:`~sklearn.preprocessing.OneHotEncoder` 37 | # 38 | # .. |RandomForestRegressor| replace:: 39 | # :class:`~sklearn.ensemble.RandomForestRegressor` 40 | # 41 | # .. |SE| replace:: :class:`~skrub.SimilarityEncoder` 42 | # 43 | # .. |GapEncoder| replace:: :class:`~skrub.GapEncoder` 44 | # 45 | # .. 
|permutation importances| replace:: 46 | # :func:`~sklearn.inspection.permutation_importance` 47 | # 48 | # 49 | # The data 50 | # ======== 51 | # 52 | # Data Importing and preprocessing 53 | # -------------------------------- 54 | # 55 | # We first download the dataset: 56 | from skrub.datasets import fetch_employee_salaries 57 | employee_salaries = fetch_employee_salaries() 58 | print(employee_salaries.description) 59 | 60 | # %% 61 | # Then we load it: 62 | import pandas as pd 63 | df = employee_salaries.X.copy() 64 | df 65 | 66 | # %% 67 | # Recover the target 68 | 69 | y = employee_salaries.y 70 | 71 | # %% 72 | # 73 | # A simple default as a learner 74 | # =============================== 75 | # 76 | # The function |tabular_learner| is a simple way of creating a default 77 | # learner for tabular_learner data: 78 | from skrub import tabular_learner 79 | model = tabular_learner("regressor") 80 | 81 | # %% 82 | # We can quickly compute its cross-validation score using the 83 | # corresponding scikit-learn utility 84 | from sklearn.model_selection import cross_validate 85 | import numpy as np 86 | 87 | results = cross_validate(model, df, y) 88 | print(f"Prediction score: {np.mean(results['test_score'])}") 89 | print(f"Training time: {np.mean(results['fit_time'])}") 90 | 91 | # %% 92 | # Below the hood, `model` is a pipeline: 93 | model 94 | 95 | # %% 96 | # We can see that it is made of first a |SV|, and an 97 | # HistGradientBoostingRegressor 98 | 99 | # %% 100 | # Understanding the vectorizer + learner pipeline 101 | # ======================================= 102 | # 103 | # The number one difficulty is that our input is a complex and 104 | # heterogeneous dataframe: 105 | df 106 | 107 | # %% 108 | # The |SV| is a transformer that turns this dataframe into a 109 | # form suited for machine learning. 110 | # 111 | # Feeding it output to a powerful learner, 112 | # such as gradient boosted trees, gives **a machine-learning method that 113 | # can be readily applied to the dataframe**. 114 | from skrub import TableVectorizer 115 | 116 | # %% 117 | # Assembling the pipeline 118 | # --------------------------- 119 | # 120 | 121 | # %% 122 | # We use the |SV| with a HistGradientBoostingRegressor, which is a good 123 | # predictor for data with heterogeneous columns 124 | from sklearn.ensemble import HistGradientBoostingRegressor 125 | 126 | # %% 127 | # We then create a pipeline chaining our encoders to a learner 128 | from sklearn.pipeline import make_pipeline 129 | 130 | pipeline = make_pipeline( 131 | TableVectorizer(), 132 | HistGradientBoostingRegressor() 133 | ) 134 | pipeline 135 | 136 | # %% 137 | # Note that it is almost the same model as above (can you spot the 138 | # differences) 139 | # 140 | # Let's perform a cross-validation to see how well this model predicts 141 | 142 | results = cross_validate(pipeline, df, y) 143 | print(f"Prediction score: {np.mean(results['test_score'])}") 144 | print(f"Training time: {np.mean(results['fit_time'])}") 145 | 146 | 147 | # %% 148 | # The prediction perform here is pretty much as good as above 149 | # but the code here is much simpler as it does not involve specifying 150 | # columns manually. 151 | 152 | # %% 153 | # Analyzing the features created 154 | # ------------------------------- 155 | # 156 | # Let us perform the same workflow, but without the `Pipeline`, so we can 157 | # analyze its mechanisms along the way. 
158 | tab_vec = TableVectorizer() 159 | 160 | # %% 161 | # We split the data between train and test, and transform them: 162 | from sklearn.model_selection import train_test_split 163 | df_train, df_test, y_train, y_test = train_test_split( 164 | df, y, test_size=0.15, random_state=42 165 | ) 166 | 167 | X_train_enc = tab_vec.fit_transform(df_train, y_train) 168 | X_test_enc = tab_vec.transform(df_test) 169 | 170 | # %% 171 | # The encoded data, X_train_enc and X_test_enc are numerical arrays: 172 | X_train_enc 173 | 174 | # %% 175 | # They have more columns than the original dataframe, but not much more: 176 | X_train_enc.shape 177 | 178 | # %% 179 | # Inspecting the features created 180 | # ................................. 181 | # 182 | # The |SV| assigns a transformer for each column. We can inspect this 183 | # choice: 184 | tab_vec.transformers_ 185 | 186 | # %% 187 | # This is what is being passed to transform the different columns under the hood. 188 | # We can notice it classified the columns "gender" and "assignment_category" 189 | # as low cardinality string variables. 190 | # A |OneHotEncoder| will be applied to these columns. 191 | # 192 | # The vectorizer actually makes the difference between string variables 193 | # (data type ``object`` and ``string``) and categorical variables 194 | # (data type ``category``). 195 | # 196 | # Next, we can have a look at the encoded feature names. 197 | # 198 | # Before encoding: 199 | df.columns.to_list() 200 | 201 | # %% 202 | # After encoding (we only plot the first 8 feature names): 203 | feature_names = tab_vec.get_feature_names_out() 204 | feature_names[:8] 205 | 206 | # %% 207 | # As we can see, it created a new column for each unique value. 208 | # This is because we used |SE| on the column "division", 209 | # which was classified as a high cardinality string variable. 210 | # (default values, see |SV|'s docstring). 211 | # 212 | # In total, we have reasonnable number of encoded columns. 213 | len(feature_names) 214 | 215 | 216 | # %% 217 | # Feature importance in the statistical model 218 | # --------------------------------------------- 219 | # 220 | # Here we consider interpretability, plot the feature importances of a 221 | # classifier. We can do this because the |GapEncoder| leads to 222 | # interpretable features even with messy categories 223 | # 224 | # .. 
topic:: Note: 225 | # 226 | # To minimize compute time, use the feature importances computed by the 227 | # |RandomForestRegressor|, but you should prefer |permutation importances| 228 | # instead (which are less subject to biases) 229 | # 230 | # First, let's train the |RandomForestRegressor|, 231 | 232 | from sklearn.ensemble import RandomForestRegressor 233 | regressor = RandomForestRegressor() 234 | regressor.fit(X_train_enc, y_train) 235 | 236 | 237 | # %% 238 | # Retrieving the feature importances 239 | importances = regressor.feature_importances_ 240 | std = np.std( 241 | [ 242 | tree.feature_importances_ 243 | for tree in regressor.estimators_ 244 | ], 245 | axis=0 246 | ) 247 | indices = np.argsort(importances)[::-1] 248 | 249 | # %% 250 | # Plotting the results: 251 | 252 | import matplotlib.pyplot as plt 253 | plt.figure(figsize=(12, 9)) 254 | plt.title("Feature importances") 255 | n = 20 256 | n_indices = indices[:n] 257 | labels = np.array(feature_names)[n_indices] 258 | plt.barh(range(n), importances[n_indices], color="b", yerr=std[n_indices]) 259 | plt.yticks(range(n), labels, size=15) 260 | plt.tight_layout(pad=1) 261 | plt.show() 262 | 263 | # %% 264 | # We can deduce from this data that the three factors that define the 265 | # most the salary are: being hired for a long time, being a manager, and 266 | # having a permanent, full-time job :). 267 | 268 | 269 | # %% 270 | # 271 | # Exploring different machine-learning pipeline to encode the data 272 | # ================================================================= 273 | # 274 | # The learning pipeline 275 | # ---------------------------- 276 | # 277 | # To build a learning pipeline, we need to assemble encoders for each 278 | # column, and apply a supervised learning model on top. 279 | 280 | # %% 281 | # Encoding the table 282 | # ........................ 283 | # 284 | # The TableVectorizer applies different transformations to the different 285 | # columns to turn them into numerical values suitable for learning 286 | 287 | from skrub import TableVectorizer 288 | encoder = TableVectorizer() 289 | 290 | # %% 291 | # Pipelining an encoder with a learner 292 | # .................................... 293 | # 294 | # Here again we use a pipeline with HistGradientBoostingRegressor 295 | from sklearn.ensemble import HistGradientBoostingRegressor 296 | pipeline = make_pipeline(encoder, HistGradientBoostingRegressor()) 297 | 298 | # %% 299 | # The pipeline can be readily applied to the dataframe for prediction 300 | pipeline.fit(df, y) 301 | 302 | # The categorical encoders 303 | # ........................ 304 | # 305 | # A encoder is needed to turn a categorical column into a numerical 306 | # representation 307 | from sklearn.preprocessing import OneHotEncoder 308 | 309 | one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 310 | 311 | # %% 312 | # Dirty-category encoding 313 | # ------------------------- 314 | # 315 | # The one-hot encoder is actually not well suited to the 'Employee 316 | # Position Title' column, as this columns contains 400 different entries. 
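#
# As a quick check (an added aside; the column name ``employee_position_title``
# is the one used by the skrub version of this dataset and should be treated
# as an assumption), we can count the unique entries directly:
print(df['employee_position_title'].nunique(), "unique position titles")

# %%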
317 | # 318 | # We will now experiments with different encoders for dirty columns 319 | from skrub import SimilarityEncoder, MinHashEncoder,\ 320 | GapEncoder 321 | from sklearn.preprocessing import TargetEncoder 322 | 323 | similarity = SimilarityEncoder() 324 | target = TargetEncoder() 325 | minhash = MinHashEncoder(n_components=100) 326 | gap = GapEncoder(n_components=100) 327 | 328 | encoders = { 329 | 'one-hot': one_hot, 330 | 'similarity': similarity, 331 | 'target': target, 332 | 'minhash': minhash, 333 | 'gap': gap} 334 | 335 | # %% 336 | # We now loop over the different encoding methods, 337 | # instantiate each time a new pipeline, fit it 338 | # and store the returned cross-validation score: 339 | 340 | all_scores = dict() 341 | 342 | for name, method in encoders.items(): 343 | encoder = TableVectorizer(high_cardinality=method) 344 | 345 | pipeline = make_pipeline(encoder, HistGradientBoostingRegressor()) 346 | scores = cross_validate(pipeline, df, y) 347 | print('{} encoding'.format(name)) 348 | print('r2 score: mean: {:.3f}; std: {:.3f}'.format( 349 | np.mean(scores['test_score']), np.std(scores['test_score']))) 350 | print('time: {:.3f}\n'.format( 351 | np.mean(scores['fit_time']))) 352 | all_scores[name] = scores['test_score'] 353 | 354 | # %% 355 | # Note that the time it takes to fit varies also a lot, and not only the 356 | # prediction score 357 | 358 | # %% 359 | # Plotting the results 360 | # ..................... 361 | # Finally, we plot the scores on a boxplot: 362 | 363 | import seaborn 364 | import matplotlib.pyplot as plt 365 | plt.figure(figsize=(4, 3)) 366 | ax = seaborn.boxplot(data=pd.DataFrame(all_scores), orient='h') 367 | plt.ylabel('Encoding', size=20) 368 | plt.xlabel('Prediction accuracy ', size=20) 369 | plt.yticks(size=20) 370 | plt.tight_layout() 371 | 372 | # %% 373 | # The clear trend is that encoders that use the string form 374 | # of the category (similarity, minhash, and gap) perform better than 375 | # those that discard it. 376 | # 377 | # SimilarityEncoder is the best performer, but it is less scalable on big 378 | # data than MinHashEncoder and GapEncoder. The most scalable encoder is 379 | # the MinHashEncoder. GapEncoder, on the other hand, has the benefit that 380 | # it provides interpretable features, as shown above 381 | # 382 | # | 383 | # 384 | # 385 | # .. topic:: The TableVectorizer automates preprocessing 386 | # 387 | # As this notebook demonstrates, many preprocessing steps can be 388 | # automated by the |SV|, and the resulting pipeline can still be 389 | # inspected, even with non-normalized entries. 
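
# %%
# As a closing aside (not part of the original notebook), the same encoder
# comparison in numbers: the mean cross-validated score per encoder, sorted
# from best to worst.
print(pd.DataFrame(all_scores).mean().sort_values(ascending=False))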
390 | # 391 | -------------------------------------------------------------------------------- /notes/README.rst: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Script to publish to private GH-pages 3 | make cleandoctrees html 4 | ghp-import --no-jekyll -r origin --push --force build/html 5 | 6 | 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pytest 4 | pytest-cov 5 | coverage 6 | matplotlib 7 | seaborn 8 | joblib 9 | pandas 10 | scikit-learn 11 | skrub 12 | 13 | jupyterlite-pyodide-kernel 14 | --------------------------------------------------------------------------------