├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── feature_request.md
├── .gitignore
├── .idea
│   ├── gamma-sklearndf.iml
│   └── sklearndf.iml
├── .pre-commit-config.yaml
├── LICENSE
├── README.rst
├── RELEASE_NOTES.rst
├── azure-pipelines.yml
├── condabuild
│   └── meta.yaml
├── config
│   ├── spelling.dic
│   └── test_config.yml
├── dev-setup.sh
├── environment.yml
├── make.py
├── mypy.ini
├── pypi_description.rst
├── pyproject.toml
├── sphinx
│   ├── .gitignore
│   ├── auxiliary
│   │   └── Titanic_getting_started_example.ipynb
│   ├── make.py
│   └── source
│       ├── _images
│       │   ├── gamma_sklearndf_logo.png
│       │   ├── sklearndf-class-hierarchy.graffle
│       │   │   └── data.plist
│       │   ├── sklearndf-class-hierarchy.svg
│       │   └── sklearndf_logo.png
│       ├── api_landing.rst
│       ├── conf.py
│       ├── contribution_guide.rst
│       ├── faqs.rst
│       ├── index.rst
│       ├── tutorial
│       │   └── sklearndf_tutorial.ipynb
│       └── tutorials.rst
├── src
│   └── sklearndf
│       ├── __init__.py
│       ├── _sklearn_version.py
│       ├── _sklearndf.py
│       ├── _util.py
│       ├── classification
│       │   ├── __init__.py
│       │   ├── _classification.py
│       │   ├── _classification_v0_22.py
│       │   ├── _classification_v0_23.py
│       │   ├── _classification_v1_0.py
│       │   ├── extra
│       │   │   ├── __init__.py
│       │   │   └── _extra.py
│       │   └── wrapper
│       │       ├── __init__.py
│       │       └── _wrapper.py
│       ├── clustering
│       │   ├── __init__.py
│       │   ├── _clustering.py
│       │   ├── _clustering_v1_1.py
│       │   ├── _clustering_v1_3.py
│       │   └── wrapper
│       │       ├── __init__.py
│       │       └── _wrapper.py
│       ├── pipeline
│       │   ├── __init__.py
│       │   ├── _learner_pipeline.py
│       │   ├── _pipeline.py
│       │   └── wrapper
│       │       ├── __init__.py
│       │       └── _wrapper.py
│       ├── py.typed
│       ├── regression
│       │   ├── __init__.py
│       │   ├── _regression.py
│       │   ├── _regression_v0_22.py
│       │   ├── _regression_v0_23.py
│       │   ├── _regression_v1_0.py
│       │   ├── extra
│       │   │   ├── __init__.py
│       │   │   └── _extra.py
│       │   └── wrapper
│       │       ├── __init__.py
│       │       └── _wrapper.py
│       ├── transformation
│       │   ├── __init__.py
│       │   ├── _transformation.py
│       │   ├── _transformation_v0_22.py
│       │   ├── _transformation_v0_24.py
│       │   ├── _transformation_v1_0.py
│       │   ├── _transformation_v1_1.py
│       │   ├── _transformation_v1_3.py
│       │   ├── extra
│       │   │   ├── __init__.py
│       │   │   ├── _extra.py
│       │   │   └── wrapper
│       │   │       ├── __init__.py
│       │   │       └── _wrapper.py
│       │   └── wrapper
│       │       ├── __init__.py
│       │       └── _wrapper.py
│       └── wrapper
│           ├── __init__.py
│           ├── _missing.py
│           ├── _wrapper.py
│           ├── numpy
│           │   ├── __init__.py
│           │   └── _numpy.py
│           └── stacking
│               ├── __init__.py
│               └── _stacking.py
├── test
│   └── test
│       ├── __init__.py
│       ├── conftest.py
│       ├── paths.py
│       ├── sklearndf
│       │   ├── __init__.py
│       │   ├── pipeline
│       │   │   ├── __init__.py
│       │   │   ├── test_classification_pipeline_df.py
│       │   │   ├── test_clustering_pipeline.py
│       │   │   ├── test_pipeline_df.py
│       │   │   └── test_regression_pipeline_df.py
│       │   ├── test_base.py
│       │   ├── test_classification.py
│       │   ├── test_clustering.py
│       │   ├── test_meta_estimators.py
│       │   ├── test_missing.py
│       │   ├── test_regression.py
│       │   ├── test_sklearn_coverage.py
│       │   └── transformation
│       │       ├── __init__.py
│       │       ├── test_extra.py
│       │       ├── test_imputers.py
│       │       ├── test_sparse.py
│       │       └── test_transformation.py
│       └── test_docs.py
├── tmp
│   └── README.md
└── tox.ini
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | db.sqlite3
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # mypy
105 | .mypy_cache/
106 |
107 | ### JetBrains template
108 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
109 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
110 |
111 | # User-specific stuff
112 | .idea/**/workspace.xml
113 | .idea/**/tasks.xml
114 | .idea/**/dictionaries
115 | .idea/**/shelf
116 |
117 | # Sensitive or high-churn files
118 | .idea/**/dataSources/
119 | .idea/**/dataSources.ids
120 | .idea/**/dataSources.local.xml
121 | .idea/**/sqlDataSources.xml
122 | .idea/**/dynamic.xml
123 | .idea/**/uiDesigner.xml
124 | .idea/**/dbnavigator.xml
125 |
126 | # Gradle
127 | .idea/**/gradle.xml
128 | .idea/**/libraries
129 |
130 | # CMake
131 | cmake-build-debug/
132 | cmake-build-release/
133 |
134 | # Mongo Explorer plugin
135 | .idea/**/mongoSettings.xml
136 |
137 | # File-based project format
138 | *.iws
139 |
140 | # IntelliJ
141 | out/
142 |
143 | # mpeltonen/sbt-idea plugin
144 | .idea_modules/
145 |
146 | # JIRA plugin
147 | atlassian-ide-plugin.xml
148 |
149 | # Cursive Clojure plugin
150 | .idea/replstate.xml
151 |
152 | # Crashlytics plugin (for Android Studio and IntelliJ)
153 | com_crashlytics_export_strings.xml
154 | crashlytics.properties
155 | crashlytics-build.properties
156 | fabric.properties
157 |
158 | # Editor-based Rest Client
159 | .idea/httpRequests
160 | ### TeX template
161 | ## Core latex/pdflatex auxiliary files:
162 | *.aux
163 | *.lof
164 | *.lot
165 | *.fls
166 | *.out
167 | *.toc
168 | *.fmt
169 | *.fot
170 | *.cb
171 | *.cb2
172 | .*.lb
173 |
174 | ## Intermediate documents:
175 | *.dvi
176 | *.xdv
177 | *-converted-to.*
178 | # these rules might exclude image files for figures etc.
179 | # *.ps
180 | # *.eps
181 | # *.pdf
182 |
183 | ## Generated if empty string is given at "Please type another file name for output:"
184 | .pdf
185 |
186 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
187 | *.bbl
188 | *.bcf
189 | *.blg
190 | *-blx.aux
191 | *-blx.bib
192 | *.run.xml
193 |
194 | ## Build tool auxiliary files:
195 | *.fdb_latexmk
196 | *.synctex
197 | *.synctex(busy)
198 | *.synctex.gz
199 | *.synctex.gz(busy)
200 | *.pdfsync
201 |
202 | ## Build tool directories for auxiliary files
203 | # latexrun
204 | latex.out/
205 |
206 | ## Auxiliary and intermediate files from other packages:
207 | # algorithms
208 | *.alg
209 | *.loa
210 |
211 | # achemso
212 | acs-*.bib
213 |
214 | # amsthm
215 | *.thm
216 |
217 | # beamer
218 | *.nav
219 | *.pre
220 | *.snm
221 | *.vrb
222 |
223 | # changes
224 | *.soc
225 |
226 | # cprotect
227 | *.cpt
228 |
229 | # elsarticle (documentclass of Elsevier journals)
230 | *.spl
231 |
232 | # endnotes
233 | *.ent
234 |
235 | # fixme
236 | *.lox
237 |
238 | # feynmf/feynmp
239 | *.mf
240 | *.mp
241 | *.t[1-9]
242 | *.t[1-9][0-9]
243 | *.tfm
244 |
245 | #(r)(e)ledmac/(r)(e)ledpar
246 | *.end
247 | *.?end
248 | *.[1-9]
249 | *.[1-9][0-9]
250 | *.[1-9][0-9][0-9]
251 | *.[1-9]R
252 | *.[1-9][0-9]R
253 | *.[1-9][0-9][0-9]R
254 | *.eledsec[1-9]
255 | *.eledsec[1-9]R
256 | *.eledsec[1-9][0-9]
257 | *.eledsec[1-9][0-9]R
258 | *.eledsec[1-9][0-9][0-9]
259 | *.eledsec[1-9][0-9][0-9]R
260 |
261 | # glossaries
262 | *.acn
263 | *.acr
264 | *.glg
265 | *.glo
266 | *.gls
267 | *.glsdefs
268 |
269 | # gnuplottex
270 | *-gnuplottex-*
271 |
272 | # gregoriotex
273 | *.gaux
274 | *.gtex
275 |
276 | # htlatex
277 | *.4ct
278 | *.4tc
279 | *.idv
280 | *.lg
281 | *.trc
282 | *.xref
283 |
284 | # hyperref
285 | *.brf
286 |
287 | # knitr
288 | *-concordance.tex
289 | # TODO Comment the next line if you want to keep your tikz graphics files
290 | *.tikz
291 | *-tikzDictionary
292 |
293 | # listings
294 | *.lol
295 |
296 | # makeidx
297 | *.idx
298 | *.ilg
299 | *.ind
300 | *.ist
301 |
302 | # minitoc
303 | *.maf
304 | *.mlf
305 | *.mlt
306 | *.mtc[0-9]*
307 | *.slf[0-9]*
308 | *.slt[0-9]*
309 | *.stc[0-9]*
310 |
311 | # minted
312 | _minted*
313 | *.pyg
314 |
315 | # morewrites
316 | *.mw
317 |
318 | # nomencl
319 | *.nlg
320 | *.nlo
321 | *.nls
322 |
323 | # pax
324 | *.pax
325 |
326 | # pdfpcnotes
327 | *.pdfpc
328 |
329 | # sagetex
330 | *.sagetex.sage
331 | *.sagetex.py
332 | *.sagetex.scmd
333 |
334 | # scrwfile
335 | *.wrt
336 |
337 | # sympy
338 | *.sout
339 | *.sympy
340 | sympy-plots-for-*.tex/
341 |
342 | # pdfcomment
343 | *.upa
344 | *.upb
345 |
346 | # pythontex
347 | *.pytxcode
348 | pythontex-files-*/
349 |
350 | # thmtools
351 | *.loe
352 |
353 | # TikZ & PGF
354 | *.dpth
355 | *.md5
356 | *.auxlock
357 |
358 | # todonotes
359 | *.tdo
360 |
361 | # easy-todo
362 | *.lod
363 |
364 | # xmpincl
365 | *.xmpi
366 |
367 | # xindy
368 | *.xdy
369 |
370 | # xypic precompiled matrices
371 | *.xyc
372 |
373 | # endfloat
374 | *.ttt
375 | *.fff
376 |
377 | # Latexian
378 | TSWLatexianTemp*
379 |
380 | ## Editors:
381 | # WinEdt
382 | *.bak
383 | *.sav
384 |
385 | # Texpad
386 | .texpadtmp
387 |
388 | # Kile
389 | *.backup
390 |
391 | # KBibTeX
392 | *~[0-9]*
393 |
394 | *.el
395 |
396 | # expex forward references with \gathertags
397 | *-tags.tex
398 |
399 | # standalone packages
400 | *.sta
401 |
402 | .DS_Store
403 |
404 | #
405 | # project specific
406 | #
407 | /tmp
408 | !/tmp/README.md
409 |
410 | # exclude docs while they are not yet stable
411 | /docs/**
412 | !/docs/README.md
413 |
414 | # exclude notebooks directory: this is generated during build
415 | /notebooks/
416 |
417 | # OmniGraffle previews
418 | **/*.graffle/preview.jpeg
419 |
--------------------------------------------------------------------------------
/.idea/gamma-sklearndf.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/sklearndf.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/PyCQA/isort
3 | rev: 5.12.0
4 | hooks:
5 | - id: isort
6 |
7 | - repo: https://github.com/psf/black
8 | rev: 23.10.1
9 | hooks:
10 | - id: black
11 | language: python_venv
12 | language_version: python39
13 |
14 | - repo: https://github.com/pycqa/flake8
15 | rev: 5.0.4
16 | hooks:
17 | - id: flake8
18 | name: flake8
19 | entry: flake8 --config tox.ini
20 | language: python_venv
21 | language_version: python39
22 | additional_dependencies:
23 | - flake8-comprehensions ~= 3.10
24 | types: [ python ]
25 |
26 | - repo: https://github.com/pre-commit/pre-commit-hooks
27 | rev: v4.3.0
28 | hooks:
29 | - id: check-added-large-files
30 | - id: check-json
31 | - id: check-xml
32 | - id: check-yaml
33 | language: python_venv
34 | exclude: condabuild/meta.yaml
35 |
36 | - repo: https://github.com/pre-commit/mirrors-mypy
37 | rev: v1.2.0
38 | hooks:
39 | - id: mypy
40 | files: src|sphinx|test
41 | language: python_venv
42 | language_version: python39
43 | additional_dependencies:
44 | - numpy~=1.24
45 | - gamma-pytools~=2.1
46 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2020-2021 Boston Consulting Group
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/RELEASE_NOTES.rst:
--------------------------------------------------------------------------------
1 | Release Notes
2 | =============
3 |
4 | .. |lightgbm| replace:: :external+lightgbm:doc:`lightgbm `
5 | .. |xgboost| replace:: :external+xgboost:doc:`xgboost `
6 | .. |mypy| replace:: :external+mypy:doc:`mypy `
7 | .. |nbsp| unicode:: 0xA0
8 | :trim:
9 |
10 | *sklearndf* 2.3
11 | ---------------
12 |
13 |
14 | 2.3.0
15 | ~~~~~
16 |
17 | *sklearndf* 2.3 adds support for
18 | `scikit-learn 1.3 `_
19 | and drops support for *scikit-learn* |nbsp| 0.24.
20 |
21 | - API: add DF wrapper class :class:`.HDBSCANDF` for native estimator
22 | :class:`~sklearn.cluster.HDBSCAN`
23 | - API: add DF wrapper class :class:`.TargetEncoderDF` for native estimator
24 | :class:`~sklearn.preprocessing.TargetEncoder`
25 |
26 |
27 | *sklearndf* 2.2
28 | ---------------
29 |
30 | *sklearndf* 2.2 adds support for
31 | `scikit-learn 1.2 `_, and enhances the EstimatorDF
32 | API.
33 |
34 |
35 | 2.2.1
36 | ~~~~~
37 |
38 | - VIZ: use *scikit-learn*'s native HTML representation of estimators, if available
39 |
40 |
41 | 2.2.0
42 | ~~~~~
43 |
44 | *sklearndf* 2.2 adds support for
45 | `scikit-learn 1.2 `_.
46 | It drops support for *scikit-learn* |nbsp| 0.23 and earlier due to incomplete
47 | support of sparse output (see below).
48 |
49 | - API: DF estimators now support native estimators using sparse matrices as input or
50 | output, and automatically convert them to or from sparse :class:`~pandas.DataFrame`
51 | objects
52 | - API: new property :attr:`.EstimatorDF.output_names_` to get the names of the output
53 | columns the estimator was fitted with
54 | - API: new method :attr:`.LearnerPipelineDF.preprocess` to apply the preprocessing step
55 | to a data frame
56 | - API: remove properties ``feature_names_out_`` and ``feature_names_original_`` from
57 | class :class:`.LearnerPipelineDF`
58 | - API: :class:`~pandas.Index` instances obtained from
59 | :attr:`.EstimatorDF.feature_names_in_` and :attr:`.TransformerDF.feature_names_out_`
60 | are now named ``"feature"`` instead of ``"feature_in"`` and ``"feature_out"``,
61 | respectively, and :class:`~pandas.Series` instances obtained from
62 | :attr:`.TransformerDF.feature_names_original_` are now named ``"feature_original"``
63 | instead of ``"feature_in"``, and their indices are now named ``"feature"`` instead
64 | of ``"feature_out"``; this is to separate the semantics of the originating property
65 | from the column index, which may be used in other contexts
66 |
67 |
68 |
69 | *sklearndf* 2.1
70 | ---------------
71 |
72 | *sklearndf* 2.1 adds support for
73 | `scikit-learn 1.1 `_.
74 |
75 |
76 | 2.1.1
77 | ~~~~~
78 |
79 | This is a maintenance release to catch up with *sklearndf* |nbsp| 2.0.2.
80 |
81 |
82 | 2.1.0
83 | ~~~~~
84 |
85 | - API: new clusterer :class:`.BisectingKMeansDF`
86 | - API: new transformer :class:`.MiniBatchNMFDF`
87 | - API: new transformer :class:`.RandomTreesEmbeddingDF`; note that class
88 | :class:`~sklearn.ensemble.RandomTreesEmbedding` existed previously in *scikit-learn*,
89 | but is based on :class:`~sklearn.base.TransformerMixin` only as of
90 | *scikit-learn* |nbsp| 1.1
91 | - API: support parameters ``max_categories`` and ``min_frequency`` of
92 | :class:`.OneHotEncoderDF`, introduced in *scikit-learn* |nbsp| 1.1
93 | - API: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF`
94 | - API: support ``"passthrough"`` as a transformer in :class:`.FeatureUnionDF`
95 | - API: remove ``GeneralizedLinearRegressorDF`` since the underlying native estimator is
96 | a base class and not intended to be used as a regressor of its own
97 |
98 |
99 | *sklearndf* 2.0
100 | ---------------
101 |
102 | *sklearndf* 2.0 adds support for
103 | `scikit-learn 1.0 `_,
104 | adds data frame support for clusterers along with additional API enhancements and
105 | improvements, and is now subject to static type checking with |mypy|.
106 |
107 |
108 | 2.0.2
109 | ~~~~~
110 |
111 | - BUILD: add support for :mod:`pandas` 2.0 and above
112 | - FIX: property :attr:`.PCADF.n_components_` now returns the value of
113 | :attr:`~sklearn.decomposition.PCA.n_components_`, not
114 | :attr:`~sklearn.decomposition.PCA.n_components`
115 | - FIX: detect missing and extra columns when validating data frames resulting from
116 | transforms, even when the total column count is correct
117 |
118 |
119 | 2.0.1
120 | ~~~~~
121 |
122 | - API: upon declaration of new wrapper classes, automatically validate that their
123 | associated native estimators are compatible with the wrapper class
124 | - API: new public constants ``DROP`` and ``PASSTHROUGH`` in
125 | :class:`.ColumnTransformerDF`
126 | - FIX: base :class:`.LGBMClassifierDF` and :class:`.XGBClassifierDF` on the
127 | correct wrapper class :class:`.ClassifierWrapperDF`
128 | - FIX: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF`
129 | - FIX: various minor tweaks and stability improvements
130 |
131 |
132 | 2.0.0
133 | ~~~~~
134 |
135 | - API: :class:`.ClassifierDF` and :class:`.RegressorDF` get a new base class
136 | :class:`.SupervisedLearnerDF`, which in turn is based on :class:`.LearnerDF`;
137 | :class:`.SupervisedLearnerDF` implements method :meth:`~.SupervisedLearnerDF.score`,
138 | which is no longer implemented by :class:`.LearnerDF`
139 | - API: new class :class:`.ClusterDF`, based on :class:`.LearnerDF`
140 | - API: class :class:`.EstimatorDF` now implements the
141 | :class:`~pytools.expression.HasExpressionRepr` mix-in, rendering estimator
142 | representations as :class:`~pytools.expression.Expression` objects to enable better
143 | formatting
144 | - API: added data frame support for method
145 | :meth:`~.PartialFitRegressorWrapperDF.partial_fit`
146 | - API: removed ``OutlierRemoverDF``
147 | - API: removed dependency on package |lightgbm|: :class:`.LGBMClassifierDF` and
148 | :class:`.LGBMRegressorDF` are still available if |lightgbm| is installed
149 | - API: added support for |xgboost|: :class:`.XGBClassifierDF` and
150 | :class:`.XGBRegressorDF` are available if |xgboost| is installed
151 | - API: DF wrapper classes are now created using proper class declarations to better
152 | conform with Python type conventions checked by |mypy|;
153 | see :mod:`sklearndf.wrapper` for details
154 | - API: remove functions ``make_df_estimator``, ``make_df_classifier``,
155 | ``make_df_regressor``, and ``make_df_transformer`` which are now obsolete
156 | - API: move some classes in :mod:`sklearndf.wrapper` to sub-packages
157 | :mod:`sklearndf.wrapper.stacking` and :mod:`sklearndf.wrapper.numpy` to improve
158 | package navigability and to achieve better de-coupling of the underlying code;
159 | this change also moves :class:`~.StackingClassifierWrapperDF` and
160 | :class:`~.StackingRegressorWrapperDF` to package :mod:`sklearndf.wrapper.stacking`
161 |
162 |
163 | *sklearndf* 1.2
164 | ---------------
165 |
166 | This release adds support for `scikit-learn 0.24 `_.
167 |
168 |
169 | 1.2.3
170 | ~~~~~
171 |
172 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.3.
173 |
174 |
175 | 1.2.2
176 | ~~~~~
177 |
178 | This release makes small API tweaks, and catches up with *sklearndf* |nbsp| 1.1.2.
179 |
180 | - API: make type hints more specific in signatures for
181 | :func:`.make_df_transformer`, :func:`.make_df_classifier`, and
182 | :func:`.make_df_regressor`
183 |
184 |
185 | 1.2.1
186 | ~~~~~
187 |
188 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.1.
189 |
190 |
191 | 1.2.0
192 | ~~~~~
193 |
194 | - API: add `DF` adaptations for classes introduced by *scikit-learn* |nbsp| 0.24:
195 | :class:`.PolynomialCountSketchDF` and :class:`.SequentialFeatureSelectorDF`
196 |
197 |
198 | *sklearndf* 1.1
199 | ---------------
200 |
201 | 1.1.3
202 | ~~~~~
203 |
204 | This release relaxes package dependencies to support all `numpy` 1.x versions from
205 | 1.16 onward.
206 |
207 |
208 | 1.1.2
209 | ~~~~~
210 |
211 | This release improves compatibility with `scikit-learn` and fixes bugs.
212 |
213 | - API: add full support for the
214 | `_estimator_type `__
215 | attribute
216 | - FIX: do not reset transformers when calling :meth:`.TransformerDF.inverse_transform`
217 | - FIX: accept `"passthrough"` as value for arg `remainder` of
218 | :class:`.ColumnTransformerDF`
219 |
220 |
221 | 1.1.1
222 | ~~~~~
223 |
224 | This release addresses compatibility issues with meta-estimators.
225 |
226 | - FIX: support complex DF estimators inside :class:`.StackingEstimatorDF`
227 | - FIX: raise an exception if a base estimator is not supported by one of `sklearndf`'s
228 | implementations for DF meta-estimators
229 |
230 |
231 | 1.1.0
232 | ~~~~~
233 |
234 | This release exposes the `wrapper` API used to generate augmented DF estimators from
235 | native `scikit-learn` estimators.
236 |
237 | - API: expose the :class:`.EstimatorWrapperDF` class hierarchy through the new
238 | :mod:`sklearndf.wrapper` package
239 | - API: create new `scikit-learn` wrapper classes with the new functions
240 | :func:`.make_df_estimator`, :func:`.make_df_classifier`, :func:`.make_df_regressor`,
241 | and :func:`.make_df_transformer`
242 |
243 |
244 | *sklearndf* 1.0
245 | ---------------
246 |
247 | 1.0.2
248 | ~~~~~
249 |
250 | This is a maintenance release focusing on enhancements to the CI/CD pipeline and bug
251 | fixes.
252 |
253 | - FIX: correctly mirror ``__init__`` signatures of native estimators to their
254 | corresponding DF estimators
255 | - FIX: do not mirror native estimator class attributes and protected members to
256 | DF estimators
257 | - FIX: support ``"passthrough"`` transformer in :class:`.ColumnTransformerDF`
258 | - FIX: support ``drop`` parameter in :class:`.OneHotEncoderDF`
259 | - BUILD: add support for `numpy` |nbsp| 1.20
260 | - BUILD: updates and changes to the CI/CD pipeline
261 |
262 |
263 | 1.0.1
264 | ~~~~~
265 |
266 | Initial release.
267 |
--------------------------------------------------------------------------------
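
As a concrete illustration of the 2.2.0 index renaming described above — a minimal
sketch, assuming a fitted transformer; StandardScalerDF is used here purely as an
example:

    import pandas as pd
    from sklearndf.transformation import StandardScalerDF

    scaler = StandardScalerDF().fit(
        pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
    )

    # index names as of sklearndf 2.2 (previously "feature_in"/"feature_out")
    print(scaler.feature_names_in_.name)              # "feature"
    print(scaler.feature_names_out_.name)             # "feature"
    print(scaler.feature_names_original_.name)        # "feature_original"
    print(scaler.feature_names_original_.index.name)  # "feature"
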
/condabuild/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 | name: sklearndf
3 | version: {{ environ.get('FACET_BUILD_SKLEARNDF_VERSION') }}
4 |
5 | source:
6 | git_url: ../
7 |
8 | build:
9 | noarch: python
10 | script: "flit install --deps none"
11 |
12 | requirements:
13 | host:
14 | - pip>=20.*
15 | - python {{ environ.get('FACET_V_PYTHON', '=3.8.*') }}
16 | - numpy {{ environ.get('FACET_V_NUMPY', '>=1.11.*') }}
17 | - flit>=3.0.*
18 | - packaging>=20
19 | run:
20 | - gamma-pytools {{ environ.get('FACET_V_GAMMA_PYTOOLS') }}
21 | - numpy {{ environ.get('FACET_V_NUMPY') }}
22 | - packaging {{ environ.get('FACET_V_PACKAGING') }}
23 | - pandas {{ environ.get('FACET_V_PANDAS') }}
24 | - python {{ environ.get('FACET_V_PYTHON') }}
25 | - scikit-learn {{ environ.get('FACET_V_SCIKIT_LEARN') }}
26 | - scipy {{ environ.get('FACET_V_SCIPY') }}
27 | test:
28 | imports:
29 | - sklearndf
30 | - sklearndf.classification
31 | - sklearndf.classification.extra
32 | - sklearndf.pipeline
33 | - sklearndf.regression
34 | - sklearndf.regression.extra
35 | - sklearndf.transformation
36 | - sklearndf.transformation.extra
37 | requires:
38 | - pytest ~= 7.1
39 | # we need pip to install arfs
40 | - pip # {{ '[False]' if not environ.get('FACET_V_ARFS') }}
41 | # optional libraries of sklearndf, needed for testing
42 | - boruta_py {{ environ.get('FACET_V_BORUTA', '[False]') }}
43 | - xgboost {{ environ.get('FACET_V_XGBOOST', '[False]') }}
44 | # we always need lightgbm for testing; version spec is optional
45 | - lightgbm {{ environ.get('FACET_V_LIGHTGBM', '') }}
46 | # additional requirements of gamma-pytools
47 | - joblib {{ environ.get('FACET_V_JOBLIB', '[False]') }}
48 | - matplotlib-base {{ environ.get('FACET_V_MATPLOTLIB', '[False]') }}
49 | - typing_inspect {{ environ.get('FACET_V_TYPING_INSPECT', '[False]') }}
50 | commands:
51 | - conda list
52 | - python -c 'import sklearndf;
53 | import os;
54 | assert sklearndf.__version__ == os.environ["PKG_VERSION"]'
55 | # optional PyPi package ARFS needed for testing
56 | {% if environ.get('FACET_V_ARFS') -%}
57 | - pip install 'arfs{{ environ.get("FACET_V_ARFS") }}'
58 | {%- endif %}
59 | # run the test suite
60 | - cd "${FACET_PATH}/sklearndf"
61 | - pytest -vs test
62 |
63 | about:
64 | home: https://github.com/BCG-X-Official/sklearndf
65 | license: Apache Software License v2.0
66 | license_file: LICENSE
67 | description: |
68 | sklearndf is an open source library designed to address a common need with
69 | scikit-learn: the outputs of transformers are numpy arrays, even when the input
70 | is a data frame. However, to inspect a model it is essential to keep track of
71 | the feature names.
72 | dev_url: https://github.com/BCG-X-Official/sklearndf
73 | doc_url: https://bcg-x-official.github.io/sklearndf/
74 | doc_source_url: https://github.com/BCG-X-Official/sklearndf/blob/develop/README.rst
--------------------------------------------------------------------------------
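
The recipe above pins every dependency through environment variables such as
FACET_V_NUMPY, falling back to a default where one is given. A minimal sketch of how
that Jinja templating resolves, emulated here with the jinja2 package (conda-build
supplies the `environ` object itself when rendering the recipe):

    import os

    from jinja2 import Template

    # two requirement lines lifted from the recipe above
    template = Template(
        "python {{ environ.get('FACET_V_PYTHON', '=3.8.*') }}\n"
        "numpy {{ environ.get('FACET_V_NUMPY', '>=1.11.*') }}\n"
    )

    # a set variable wins; an unset variable falls back to its default
    os.environ["FACET_V_PYTHON"] = "=3.9.*"
    print(template.render(environ=os.environ))
    # python =3.9.*
    # numpy >=1.11.*
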
/config/spelling.dic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/config/spelling.dic
--------------------------------------------------------------------------------
/config/test_config.yml:
--------------------------------------------------------------------------------
1 | - inputfile:
2 | delimiter: "|"
3 | header: infer
4 | date_column_name : Date
5 | yield_column_name : Yield
6 | decimal: ","
--------------------------------------------------------------------------------
/dev-setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # make "conda activate" available in non-interactive shells
3 | . "$(conda info --base)/etc/profile.d/conda.sh"
4 | conda env create -f environment.yml
5 | conda activate sklearndf-develop
6 | pre-commit install
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: sklearndf-develop
2 | channels:
3 | - conda-forge
4 | - bcg_gamma
5 | dependencies:
6 | # run
7 | - boruta_py ~= 0.3
8 | - gamma-pytools ~= 2.1
9 | - joblib ~= 1.2
10 | - lightgbm ~= 3.3
11 | - matplotlib ~= 3.7
12 | - numpy ~= 1.24
13 | - pandas ~= 2.0
14 | - pip ~= 23.3
15 | - python ~= 3.9
16 | - scikit-learn ~= 1.2.0
17 | - scipy ~= 1.11
18 | - xgboost ~= 1.7
19 | - pip:
20 | - arfs ~= 1.1
21 | # test
22 | - pytest ~= 7.2.1
23 | - pytest-cov ~= 2.12.1
24 | # sphinx
25 | - nbsphinx ~= 0.8.9
26 | - sphinx ~= 4.5.0
27 | - sphinx-autodoc-typehints ~= 1.19.2
28 | - pydata-sphinx-theme ~= 0.8.1
29 | # notebooks
30 | - ipywidgets ~= 8.1
31 | - jupyterlab ~= 3.6
32 | - openpyxl ~= 3.1
33 | - seaborn ~= 0.13
34 | - tableone ~= 0.7
35 |
--------------------------------------------------------------------------------
/make.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | call the Python make file for the common conda build process residing in 'pytools'
4 | """
5 |
6 | import os
7 | import sys
8 |
9 | SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
10 | PYTOOLS_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, "pytools"))
11 | sys.path.insert(0, PYTOOLS_DIR)
12 |
13 | # noinspection PyUnresolvedReferences
14 | from make import run_make
15 |
16 | run_make()
17 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | strict = True
3 | show_error_codes = True
4 |
5 | [mypy-arfs.*]
6 | ; TODO remove once PEP 561 is supported
7 | ignore_missing_imports = True
8 |
9 | [mypy-boruta.*]
10 | ; TODO remove once PEP 561 is supported
11 | ignore_missing_imports = True
12 |
13 | [mypy-lightgbm.*]
14 | ; TODO remove once PEP 561 is supported
15 | ignore_missing_imports = True
16 |
17 | [mypy-packaging.*]
18 | ; TODO remove once PEP 561 is supported
19 | ignore_missing_imports = True
20 |
21 | [mypy-pandas.*]
22 | ; TODO remove once PEP 561 is supported
23 | ignore_missing_imports = True
24 |
25 | [mypy-scipy.*]
26 | ; TODO remove once PEP 561 is supported
27 | ignore_missing_imports = True
28 |
29 | [mypy-sklearn.*]
30 | ; TODO remove once PEP 561 is supported
31 | ignore_missing_imports = True
32 |
33 | [mypy-xgboost.*]
34 | ; TODO remove once PEP 561 is supported
35 | ignore_missing_imports = True
36 |
--------------------------------------------------------------------------------
/pypi_description.rst:
--------------------------------------------------------------------------------
1 | *sklearndf* is an open source library designed to address a common need with
2 | `scikit-learn `__: the outputs of
3 | transformers are numpy arrays, even when the input is a
4 | data frame. However, to inspect a model it is essential to keep track of the
5 | feature names.
6 |
7 | To this end, *sklearndf* enhances scikit-learn's estimators as follows:
8 |
9 | - **Preserve data frame structure**:
10 | Return data frames as results of transformations, preserving feature names as the column index.
11 | - **Feature name tracing**:
12 | Add additional estimator properties to enable tracing a feature name back to its original input feature; this is especially useful for transformers that create new features (e.g., one-hot encode), and for pipelines that include such transformers.
13 | - **Easy use**:
14 | Simply append DF at the end of your usual scikit-learn class names to get enhanced data frame support!
15 |
16 | .. Begin-Badges
17 |
18 | |pypi| |conda| |python_versions| |code_style| |made_with_sphinx_doc| |License_badge|
19 |
20 | .. End-Badges
21 |
22 | License
23 | ---------------------------
24 |
25 | *sklearndf* is licensed under Apache 2.0 as described in the
26 | `LICENSE `_ file.
27 |
28 | .. Begin-Badges
29 |
30 | .. |conda| image:: https://anaconda.org/bcg_gamma/sklearndf/badges/version.svg
31 | :target: https://anaconda.org/BCG_Gamma/sklearndf
32 |
33 | .. |pypi| image:: https://badge.fury.io/py/sklearndf.svg
34 | :target: https://pypi.org/project/sklearndf/
35 |
36 | .. |python_versions| image:: https://img.shields.io/badge/python-3.7|3.8|3.9-blue.svg
37 | :target: https://www.python.org/downloads/release/python-380/
38 |
39 | .. |code_style| image:: https://img.shields.io/badge/code%20style-black-000000.svg
40 | :target: https://github.com/psf/black
41 |
42 | .. |made_with_sphinx_doc| image:: https://img.shields.io/badge/Made%20with-Sphinx-1f425f.svg
43 | :target: https://bcg-x-official.github.io/sklearndf/index.html
44 |
45 | .. |license_badge| image:: https://img.shields.io/badge/License-Apache%202.0-olivegreen.svg
46 | :target: https://opensource.org/licenses/Apache-2.0
47 |
48 | .. End-Badges
--------------------------------------------------------------------------------
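
A minimal sketch of the first two points above — SimpleImputerDF stands in for
sklearn.impute.SimpleImputer, and the transformation result is a data frame with the
feature names preserved:

    import pandas as pd
    from sklearndf.transformation import SimpleImputerDF

    df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [4.0, 5.0, None]})

    imputer = SimpleImputerDF(strategy="mean")
    transformed = imputer.fit_transform(df)

    print(type(transformed))          # <class 'pandas.core.frame.DataFrame'>
    print(list(transformed.columns))  # ['a', 'b'] -- column names preserved
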
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["flit_core >=2,<4"]
3 | build-backend = "flit_core.buildapi"
4 |
5 | [tool.flit.sdist]
6 | exclude = [".idea", "tmp", "dist", ".tox", ".pytest_cache"]
7 |
8 | [tool.flit.metadata]
9 | module = "sklearndf"
10 | author = "Boston Consulting Group (BCG)"
11 | home-page = "https://github.com/BCG-X-Official/sklearndf"
12 | description-file = "pypi_description.rst"
13 | dist-name = "sklearndf"
14 | license = "Apache Software License v2.0"
15 |
16 | requires = [
17 | "gamma-pytools ~=2.1",
18 | "numpy >=1.21,<2a", # cannot use ~= due to conda bug
19 | "packaging >=20",
20 | "pandas >=1",
21 | "scikit-learn >=1,<1.4a",
22 | "scipy ~=1.6",
23 | ]
24 |
25 | requires-python = ">=3.7,<4a"
26 |
27 | classifiers = [
28 | "Development Status :: 5 - Production/Stable",
29 | "Intended Audience :: Science/Research",
30 | "License :: OSI Approved :: Apache Software License",
31 | "Operating System :: MacOS",
32 | "Operating System :: Microsoft :: Windows",
33 | "Operating System :: POSIX :: Linux",
34 | "Operating System :: Unix",
35 | "Programming Language :: Python",
36 | "Programming Language :: Python :: 3",
37 | "Programming Language :: Python :: 3.7",
38 | "Programming Language :: Python :: 3.8",
39 | "Programming Language :: Python :: 3.9",
40 | "Topic :: Scientific/Engineering",
41 | ]
42 |
43 | [tool.flit.metadata.requires-extra]
44 | testing = [
45 | "pytest ~= 7.1",
46 | "pytest-cov ~= 2.12",
47 | # optional requirements for testing sklearndf
48 | "lightgbm ~= 3.0",
49 | "xgboost ~= 1.0",
50 | ]
51 | docs = [
52 | "sphinx ~= 4.5",
53 | "sphinx-autodoc-typehints ~= 1.19",
54 | "pydata-sphinx-theme ~= 0.8.1",
55 | "jinja2 ~= 2.11",
56 | "nbsphinx ~= 0.8.9",
57 | "jupyter == 1",
58 | "docutils ~= 0.17",
59 | "xlrd ~= 1.2",
60 | "m2r ~= 0.2"
61 | ]
62 |
63 | [tool.flit.metadata.urls]
64 | Documentation = "https://bcg-x-official.github.io/sklearndf/"
65 | Repository = "https://github.com/BCG-X-Official/sklearndf"
66 |
67 | [build]
68 | # comma-separated list of packages to be built from source in pip min builds
69 | no-binary.min = ["matplotlib"]
70 |
71 | [build.matrix.min]
72 | # direct requirements of sklearndf
73 | boruta = "~=0.3.0"
74 | gamma-pytools = "~=2.1.0"
75 | lightgbm = "~=3.0.0"
76 | numpy = "==1.21.6" # cannot use ~= due to conda bug
77 | packaging = "~=20.9"
78 | pandas = "~=1.1.5"
79 | python = ">=3.7.12,<3.8a" # cannot use ~= due to conda bug
80 | scipy = "~=1.6.3"
81 | scikit-learn = "~=1.0.2"
82 | xgboost = "~=1.0.2"
83 | # additional minimum requirements of gamma-pytools
84 | joblib = "~=0.14.1"
85 | matplotlib = "~=3.0.3"
86 | typing_inspect = "~=0.4.0"
87 |
88 | [build.matrix.max]
89 | # direct requirements of sklearndf
90 | arfs = "~=1.1"
91 | gamma-pytools = "~=2.1"
92 | lightgbm = "~=3.3"
93 | numpy = ">=1.24,<2a" # cannot use ~= due to conda bug
94 | packaging = ">=20"
95 | pandas = "~=2.0"
96 | python = ">=3.11,<3.12a" # cannot use ~= due to conda bug
97 | scikit-learn = "~=1.3.2"
98 | scipy = "~=1.11"
99 | xgboost = "~=1.5"
100 | # additional maximum requirements of gamma-pytools
101 | joblib = "~=1.1"
102 | matplotlib = "~=3.5"
103 | typing_inspect = "~=0.7"
104 |
105 | [tool.black]
106 | # quiet = "True"
107 | line-length = 88
108 | target_version = ['py36']
109 | include = '\.pyi?$'
110 | exclude = '''
111 | (
112 | /(
113 | \.eggs # exclude a few common directories in the
114 | | \.git # root of the project
115 | | \.hg
116 | | \.mypy_cache
117 | | \.tox
118 | | \.venv
119 | | data
120 | | docs
121 | | notebooks
122 | | sphinx
123 | )/
124 | )
125 | '''
126 |
--------------------------------------------------------------------------------
/sphinx/.gitignore:
--------------------------------------------------------------------------------
1 | base
2 | source/_generated
3 | source/apidoc
4 |
--------------------------------------------------------------------------------
/sphinx/make.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Make sphinx documentation using the pytools make utility
4 | """
5 | import os
6 | from urllib import request
7 |
8 | BRANCH = "2.1.x"
9 |
10 |
11 | if __name__ == "__main__":
12 | # noinspection PyUnusedLocal
13 | def run_make(branch: str, working_directory: str) -> None:
14 | """Stub, overwritten by bootstrap.py"""
15 |
16 | # run the common make file available in the pytools repo
17 | with request.urlopen(
18 | f"https://raw.githubusercontent.com/BCG-X-Official/pytools/{BRANCH}"
19 | f"/sphinx/base/bootstrap.py"
20 | ) as response:
21 | exec(response.read().decode("utf-8"), globals())
22 |
23 | run_make(branch=BRANCH, working_directory=os.path.dirname(__file__))
24 |
--------------------------------------------------------------------------------
/sphinx/source/_images/gamma_sklearndf_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/gamma_sklearndf_logo.png
--------------------------------------------------------------------------------
/sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist
--------------------------------------------------------------------------------
/sphinx/source/_images/sklearndf_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf_logo.png
--------------------------------------------------------------------------------
/sphinx/source/api_landing.rst:
--------------------------------------------------------------------------------
1 | Augmented scikit-learn classes are named after their native scikit-learn counterparts,
2 | with `DF` added as a suffix:
3 | :class:`.SimpleImputerDF` takes the place of :class:`~sklearn.impute.SimpleImputer`,
4 | :class:`.RandomForestRegressorDF` takes the place of
5 | :class:`~sklearn.ensemble.RandomForestRegressor`, and so on.
6 |
7 | For all methods expecting an `X` argument for a feature matrix and potentially a
8 | `y` argument for one or more targets, `sklearndf` estimators expect a pandas
9 | :class:`~pandas.DataFrame` for `X` and a pandas :class:`~pandas.Series` for a
10 | 1‑dimensional `y`, or a pandas :class:`~pandas.DataFrame` for `y` when fitting to
11 | multiple targets or outputs.
12 | This includes methods such as :meth:`~EstimatorDF.fit`,
13 | :meth:`~TransformerDF.transform`, and :meth:`~LearnerDF.predict`.
14 |
15 | All estimators enhanced by `sklearndf` also implement an additional attribute
16 | :attr:`~EstimatorDF.feature_names_in_`, keeping track of the column names of the data
17 | frame used to fit the estimator.
18 |
19 | `sklearndf` transformers also implement attributes
20 | :attr:`~TransformerDF.feature_names_out_` and
21 | :attr:`~TransformerDF.feature_names_original_`, keeping track of the feature names of
22 | the transformed outputs as well as mapping output features back to the input features.
23 | This enables tracing features back to the original inputs even across complex
24 | pipelines (see also :class:`.PipelineDF`).
25 |
26 | `sklearndf` classes implement a class hierarchy that follows the taxonomy of
27 | scikit-learn classes (but is only partially reflected via class inheritance in the
28 | original `scikit-learn` implementation):
29 |
30 | |
31 |
32 | .. image:: /_images/sklearndf-class-hierarchy.svg
33 | :alt: sklearndf class hierarchy
34 | :align: center
35 |
36 | |
37 |
38 | - all `sklearndf` transformers are subclasses of :class:`.TransformerDF`, which in turn
39 | provides the API for all common transformer methods, e.g.,
40 | :meth:`~TransformerDF.transform`
41 |
42 | - all `sklearndf` clusterers are subclasses of :class:`.ClusterDF`, which
43 | in turn provides the API for all common clustering methods, e.g.,
44 | :meth:`~ClusterDF.fit_predict`
45 |
46 | - all `sklearndf` regressors are subclasses of :class:`.RegressorDF`, which
47 | in turn provides the API for all common regressor methods, e.g.,
48 | :meth:`~LearnerDF.predict`
49 |
50 | - all `sklearndf` classifiers are subclasses of :class:`.ClassifierDF`, which
51 | in turn provides the API for all common classifier methods, e.g.,
52 | :meth:`~ClassifierDF.predict_proba`
53 |
54 | - all `sklearndf` regressors and classifiers are subclasses of
55 | :class:`.SupervisedLearnerDF`
56 |
57 | - all `sklearndf` regressors, classifiers and clusterers are subclasses of
58 | :class:`.LearnerDF`
59 |
60 | - all `sklearndf` estimators are subclasses of :class:`.EstimatorDF`
61 |
62 | `sklearndf` introduces additional pipeline classes :class:`.RegressorPipelineDF`,
63 | :class:`.ClassifierPipelineDF`, and :class:`.ClusterPipelineDF`, with an abstract base
64 | class :class:`.LearnerPipelineDF`, to allow for easier handling of common types of ML
65 | pipelines.
66 | These classes implement pipelines with two steps -- one preprocessing step, followed by
67 | a learner as the second and final step.
68 |
69 | `sklearndf` also provides data frame support for a selection of custom or 3rd-party
70 | estimators, most notably :class:`.BorutaDF`, :class:`.LGBMRegressorDF`,
71 | :class:`.LGBMClassifierDF`, :class:`.XGBRegressorDF`, and :class:`.XGBClassifierDF`.
72 |
73 | All `sklearndf` estimators are fully type hinted.
74 |
75 | Please see the :ref:`release notes` for recent API updates and bug fixes.
76 |
--------------------------------------------------------------------------------
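
A short sketch of the feature-name tracing described above, using OneHotEncoderDF
(the exact output names depend on the input data, so they are indicative only):

    import pandas as pd
    from sklearndf.transformation import OneHotEncoderDF

    df = pd.DataFrame(
        {"color": ["red", "blue", "green"], "size": ["S", "M", "L"]}
    )

    encoder = OneHotEncoderDF()
    encoder.fit(df)

    # names of the one-hot encoded output features
    print(list(encoder.feature_names_out_))
    # e.g. ['color_blue', 'color_green', 'color_red', 'size_L', 'size_M', 'size_S']

    # map each output feature back to the input feature it derives from
    print(encoder.feature_names_original_.to_dict())
    # e.g. {'color_blue': 'color', ..., 'size_S': 'size'}
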
/sphinx/source/conf.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration file for the Sphinx documentation builder.
3 |
4 | Receives the majority of the configuration from pytools conf_base.py
5 | """
6 |
7 | import os
8 | import sys
9 |
10 | _dir_base = os.path.join(os.path.dirname(os.path.dirname(__file__)), "base")
11 | sys.path.insert(0, _dir_base)
12 |
13 | from conf_base import set_config
14 |
15 | # ----- set custom configuration -----
16 |
17 | set_config(
18 | globals(),
19 | project="sklearndf",
20 | html_logo=os.path.join("_images", "gamma_sklearndf_logo.png"),
21 | intersphinx_mapping={
22 | "lightgbm": ("https://lightgbm.readthedocs.io/en/latest/", None),
23 | "pytools": ("https://bcg-x-official.github.io/pytools/", None),
24 | "sklearn": ("https://scikit-learn.org/stable", None),
25 | "xgboost": ("https://xgboost.readthedocs.io/en/latest/", None),
26 | },
27 | )
28 |
--------------------------------------------------------------------------------
/sphinx/source/faqs.rst:
--------------------------------------------------------------------------------
1 | .. _faqs:
2 |
3 | FAQ
4 | ===
5 |
6 | Below you can find answers to commonly asked questions, as well as guidance on
7 | how to cite *sklearndf*.
8 |
9 | Commonly asked questions
10 | ------------------------
11 |
12 | If you don't see your answer here, you can also try posting
13 | on `stackoverflow `_.
14 |
15 | 1. **What if I find a bug or have an idea for a new feature?**
16 |
17 | For bug reports or feature requests please use our
18 | `GitHub issue tracker `_.
19 | For any other enquiries please feel free to contact us at FacetTeam@bcg.com.
20 |
21 | 2. **How can I contribute?**
22 |
23 | We welcome contributors! If you have minor changes in mind that you would like
24 | to contribute, please feel free to create a pull request, and be sure to follow
25 | the developer guidelines. For large or extensive changes, please open an
26 | issue, or reach out to us at FacetTeam@bcg.com to discuss.
27 |
28 |
29 | Citation
30 | --------
31 | If you use *sklearndf* in your work please cite us as follows:
32 |
33 | Bibtex entry::
34 |
35 | @manual{sklearndf,
36 | title={sklearndf},
37 | author={FACET Team at BCG Gamma},
38 | year={2021},
39 | note={Python package version 1.1.1}
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/sphinx/source/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: /_images/sklearndf_logo.png
2 |
3 | |
4 |
5 | Table of contents
6 | -----------------
7 |
8 | .. toctree::
9 | :maxdepth: 1
10 | :titlesonly:
11 |
12 | Getting started <_generated/getting_started>
13 | API reference <apidoc/sklearndf>
14 | tutorials
15 | contribution_guide
16 | faqs
17 | _generated/release_notes
18 |
--------------------------------------------------------------------------------
/sphinx/source/tutorials.rst:
--------------------------------------------------------------------------------
1 | .. _tutorials:
2 |
3 | Tutorials
4 | =========
5 |
6 |
7 |
8 |
9 | Detailed *sklearndf* tutorial
10 | ------------------------------
11 |
12 | Start exploring the tutorial right away by clicking on the section links below, and
13 | start running the code for yourself by downloading the notebook
14 | :download:`here <tutorial/sklearndf_tutorial.ipynb>`.
15 |
16 | .. toctree::
17 | :maxdepth: 1
18 |
19 | tutorial/sklearndf_tutorial
20 |
21 |
--------------------------------------------------------------------------------
/src/sklearndf/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Data frame support and feature traceability for `scikit-learn`.
3 |
4 | `sklearndf` augments more than 160 `scikit-learn` estimators for
5 | native support of data frames, while leaving the original API intact.
6 | """
7 |
8 | from ._sklearn_version import *
9 | from ._sklearndf import *
10 |
11 | __version__ = "2.3.1"
12 |
--------------------------------------------------------------------------------
/src/sklearndf/_sklearn_version.py:
--------------------------------------------------------------------------------
1 | """
2 | Special constants for version checks for scikit-learn.
3 | """
4 |
5 | from packaging.version import Version
6 | from sklearn import __version__ as sklearn_version
7 |
8 | __all__ = [
9 | "__sklearn_version__",
10 | "__sklearn_1_1__",
11 | "__sklearn_1_2__",
12 | "__sklearn_1_3__",
13 | "__sklearn_1_4__",
14 | ]
15 |
16 | __sklearn_version__ = Version(sklearn_version)
17 | __sklearn_1_1__ = Version("1.1")
18 | __sklearn_1_2__ = Version("1.2")
19 | __sklearn_1_3__ = Version("1.3")
20 | __sklearn_1_4__ = Version("1.4")
21 |
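# Illustrative sketch (not part of the original module): these constants support
# version-gated imports, as used in sklearndf's subpackages, e.g.:
#
#     from sklearndf import __sklearn_1_3__, __sklearn_version__
#
#     if __sklearn_version__ >= __sklearn_1_3__:
#         from ._clustering_v1_3 import *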
--------------------------------------------------------------------------------
/src/sklearndf/_util.py:
--------------------------------------------------------------------------------
1 | """
2 | Auxiliary functions for internal use.
3 | """
4 |
5 | from typing import Any, List, Optional, Union, cast
6 |
7 | import numpy.typing as npt
8 | import pandas as pd
9 | from scipy import sparse
10 |
11 |
12 | def hstack_frames(
13 | frames: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]],
14 | *,
15 | prefixes: Optional[List[str]] = None,
16 | ) -> Optional[pd.DataFrame]:
17 | """
18 | If only data frames are passed, stack them horizontally.
19 |
20 | :param frames: a list of array-likes
21 | :param prefixes: an optional list of prefixes to use for the columns of each data
22 | frame in arg ``frames``; must have the same length as arg ``frames``
23 | :return: the stacked data frame if all elements of ``frames`` are data frames;
24 | ``None`` otherwise
25 | """
26 | if all(isinstance(frame, pd.DataFrame) for frame in frames):
27 | # all frames are data frames
28 | frames = cast(List[pd.DataFrame], frames)
29 | if prefixes is not None:
30 | assert len(prefixes) == len(
31 | frames
32 | ), "number of prefixes must match number of frames"
33 | frames = [
34 | frame.add_prefix(f"{prefix}__")
35 | for frame, prefix in zip(frames, prefixes)
36 | ]
37 | return pd.concat(frames, axis=1)
38 | else:
39 | return None
40 |
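# Illustrative sketch (not part of the original module): expected behaviour of
# ``hstack_frames`` for all-data-frame vs. mixed input.
#
#     df_a = pd.DataFrame({"x": [1, 2]})
#     df_b = pd.DataFrame({"y": [3, 4]})
#     hstack_frames([df_a, df_b], prefixes=["a", "b"])
#     # -> data frame with columns ["a__x", "b__y"]
#     hstack_frames([df_a.values, df_b])
#     # -> None, since not all elements are data frames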
41 |
42 | def is_sparse_frame(frame: pd.DataFrame) -> bool:
43 | """
44 | Check if a data frame contains sparse columns.
45 |
46 | :param frame: the data frame to check
47 | :return: ``True`` if the data frame contains sparse columns; ``False`` otherwise
48 | """
49 |
50 | return any(isinstance(dtype, pd.SparseDtype) for dtype in frame.dtypes)
51 |
52 |
53 | def sparse_frame_density(frame: pd.DataFrame) -> float:
54 | """
55 | Compute the density of a data frame.
56 |
57 | The density of a data frame is the average density of its columns.
58 | The density of a sparse column is the ratio of non-sparse points to total (dense)
59 | data points.
60 | The density of a dense column is 1.
61 |
62 | :param frame: a data frame
63 | :return: the density of the data frame
64 | """
65 |
66 | def _density(sr: pd.Series) -> float:
67 | if isinstance(sr.dtype, pd.SparseDtype):
68 | return cast(float, sr.sparse.density)
69 | else:
70 | return 1.0
71 |
72 | return sum(_density(sr) for _, sr in frame.items()) / len(frame.columns)
73 |
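# Illustrative sketch (not part of the original module): a frame with one dense
# column (density 1.0) and one 50% sparse column has overall density 0.75.
#
#     sparse_sr = pd.Series([0.0, 0.0, 5.0, 6.0]).astype(pd.SparseDtype(float, 0.0))
#     df = pd.DataFrame({"dense": [1.0, 2.0, 3.0, 4.0], "sparse": sparse_sr})
#     is_sparse_frame(df)        # -> True
#     sparse_frame_density(df)   # -> (1.0 + 0.5) / 2 == 0.75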
--------------------------------------------------------------------------------
/src/sklearndf/classification/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Extended versions of all `scikit-learn` classifiers with enhanced support for data
3 | frames.
4 | """
5 | from ._classification import *
6 | from ._classification_v0_22 import *
7 | from ._classification_v0_23 import *
8 | from ._classification_v1_0 import *
9 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/_classification.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.classification`
3 | """
4 | import logging
5 |
6 | from sklearn.calibration import CalibratedClassifierCV
7 | from sklearn.discriminant_analysis import (
8 | LinearDiscriminantAnalysis,
9 | QuadraticDiscriminantAnalysis,
10 | )
11 | from sklearn.dummy import DummyClassifier
12 | from sklearn.ensemble import (
13 | AdaBoostClassifier,
14 | BaggingClassifier,
15 | ExtraTreesClassifier,
16 | GradientBoostingClassifier,
17 | RandomForestClassifier,
18 | VotingClassifier,
19 | )
20 | from sklearn.gaussian_process import GaussianProcessClassifier
21 | from sklearn.linear_model import (
22 | LogisticRegression,
23 | LogisticRegressionCV,
24 | PassiveAggressiveClassifier,
25 | Perceptron,
26 | RidgeClassifier,
27 | RidgeClassifierCV,
28 | SGDClassifier,
29 | )
30 | from sklearn.multiclass import (
31 | OneVsOneClassifier,
32 | OneVsRestClassifier,
33 | OutputCodeClassifier,
34 | )
35 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
36 | from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
37 | from sklearn.neighbors import (
38 | KNeighborsClassifier,
39 | NearestCentroid,
40 | RadiusNeighborsClassifier,
41 | )
42 | from sklearn.neural_network import MLPClassifier
43 | from sklearn.semi_supervised import LabelPropagation, LabelSpreading
44 | from sklearn.svm import SVC, LinearSVC, NuSVC
45 | from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
46 |
47 | from pytools.api import AllTracker
48 |
49 | from ..wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF
50 | from .wrapper import (
51 | ClassifierChainWrapperDF,
52 | LinearDiscriminantAnalysisWrapperDF,
53 | MetaClassifierWrapperDF,
54 | MultiOutputClassifierWrapperDF,
55 | PartialFitClassifierWrapperDF,
56 | )
57 |
58 | log = logging.getLogger(__name__)
59 |
60 | __all__ = [
61 | "AdaBoostClassifierDF",
62 | "BaggingClassifierDF",
63 | "BernoulliNBDF",
64 | "CalibratedClassifierCVDF",
65 | "ClassifierChainDF",
66 | "ComplementNBDF",
67 | "DecisionTreeClassifierDF",
68 | "DummyClassifierDF",
69 | "ExtraTreeClassifierDF",
70 | "ExtraTreesClassifierDF",
71 | "GaussianNBDF",
72 | "GaussianProcessClassifierDF",
73 | "GradientBoostingClassifierDF",
74 | "KNeighborsClassifierDF",
75 | "LabelPropagationDF",
76 | "LabelSpreadingDF",
77 | "LinearDiscriminantAnalysisDF",
78 | "LinearSVCDF",
79 | "LogisticRegressionCVDF",
80 | "LogisticRegressionDF",
81 | "MLPClassifierDF",
82 | "MultinomialNBDF",
83 | "MultiOutputClassifierDF",
84 | "NearestCentroidDF",
85 | "NuSVCDF",
86 | "OneVsOneClassifierDF",
87 | "OneVsRestClassifierDF",
88 | "OutputCodeClassifierDF",
89 | "PassiveAggressiveClassifierDF",
90 | "PerceptronDF",
91 | "QuadraticDiscriminantAnalysisDF",
92 | "RadiusNeighborsClassifierDF",
93 | "RandomForestClassifierDF",
94 | "RidgeClassifierCVDF",
95 | "RidgeClassifierDF",
96 | "SGDClassifierDF",
97 | "SVCDF",
98 | "VotingClassifierDF",
99 | ]
100 |
101 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
102 |
103 |
104 | #
105 | # Ensure all symbols introduced below are included in __all__
106 | #
107 |
108 | __tracker = AllTracker(globals())
109 |
110 |
111 | #
112 | # Class definitions
113 | #
114 |
115 |
116 | #
117 | # Dummy
118 | #
119 |
120 |
121 | class DummyClassifierDF(ClassifierWrapperDF[DummyClassifier], native=DummyClassifier):
122 | """Stub for DF wrapper of class ``DummyClassifier``"""
123 |
124 |
125 | #
126 | # neighbors
127 | #
128 |
129 |
130 | class NearestCentroidDF(ClassifierWrapperDF[NearestCentroid], native=NearestCentroid):
131 | """Stub for DF wrapper of class ``NearestCentroid``"""
132 |
133 |
134 | class KNeighborsClassifierDF(
135 | ClassifierWrapperDF[KNeighborsClassifier], native=KNeighborsClassifier
136 | ):
137 | """Stub for DF wrapper of class ``KNeighborsClassifier``"""
138 |
139 |
140 | class RadiusNeighborsClassifierDF(
141 | ClassifierWrapperDF[RadiusNeighborsClassifier], native=RadiusNeighborsClassifier
142 | ):
143 | """Stub for DF wrapper of class ``RadiusNeighborsClassifier``"""
144 |
145 |
146 | #
147 | # voting
148 | #
149 |
150 |
151 | class VotingClassifierDF(
152 | MetaClassifierWrapperDF[VotingClassifier], native=VotingClassifier
153 | ):
154 | """Stub for DF wrapper of class ``VotingClassifier``"""
155 |
156 |
157 | #
158 | # ensemble
159 | #
160 |
161 |
162 | class RandomForestClassifierDF(
163 | ClassifierWrapperDF[RandomForestClassifier], native=RandomForestClassifier
164 | ):
165 | """Stub for DF wrapper of class ``RandomForestClassifier``"""
166 |
167 |
168 | class ExtraTreesClassifierDF(
169 | ClassifierWrapperDF[ExtraTreesClassifier], native=ExtraTreesClassifier
170 | ):
171 | """Stub for DF wrapper of class ``ExtraTreesClassifier``"""
172 |
173 |
174 | # noinspection PyAbstractClass
175 | class GradientBoostingClassifierDF(
176 | ClassifierWrapperDF[GradientBoostingClassifier], native=GradientBoostingClassifier
177 | ):
178 | """Stub for DF wrapper of class ``GradientBoostingClassifier``"""
179 |
180 |
181 | class AdaBoostClassifierDF(
182 | ClassifierWrapperDF[AdaBoostClassifier], native=AdaBoostClassifier
183 | ):
184 | """Stub for DF wrapper of class ``AdaBoostClassifier``"""
185 |
186 |
187 | class BaggingClassifierDF(
188 | ClassifierWrapperDF[BaggingClassifier], native=BaggingClassifier
189 | ):
190 | """Stub for DF wrapper of class ``BaggingClassifier``"""
191 |
192 |
193 | #
194 | # tree
195 | #
196 |
197 |
198 | class DecisionTreeClassifierDF(
199 | ClassifierWrapperDF[DecisionTreeClassifier], native=DecisionTreeClassifier
200 | ):
201 | """Stub for DF wrapper of class ``DecisionTreeClassifier``"""
202 |
203 |
204 | class ExtraTreeClassifierDF(
205 | ClassifierWrapperDF[ExtraTreeClassifier], native=ExtraTreeClassifier
206 | ):
207 | """Stub for DF wrapper of class ``ExtraTreeClassifier``"""
208 |
209 |
210 | #
211 | # discriminant analysis
212 | #
213 |
214 |
215 | class LinearDiscriminantAnalysisDF(
216 | LinearDiscriminantAnalysisWrapperDF, native=LinearDiscriminantAnalysis
217 | ):
218 | """Stub for DF wrapper of class ``LinearDiscriminantAnalysis``"""
219 |
220 |
221 | class QuadraticDiscriminantAnalysisDF(
222 | ClassifierWrapperDF[QuadraticDiscriminantAnalysis],
223 | native=QuadraticDiscriminantAnalysis,
224 | ):
225 | """Stub for DF wrapper of class ``QuadraticDiscriminantAnalysis``"""
226 |
227 |
228 | #
229 | # naive bayes
230 | #
231 |
232 |
233 | class GaussianNBDF(PartialFitClassifierWrapperDF[GaussianNB], native=GaussianNB):
234 | """Stub for DF wrapper of class ``GaussianNB``"""
235 |
236 |
237 | class MultinomialNBDF(
238 | PartialFitClassifierWrapperDF[MultinomialNB], native=MultinomialNB
239 | ):
240 | """Stub for DF wrapper of class ``MultinomialNB``"""
241 |
242 |
243 | class ComplementNBDF(PartialFitClassifierWrapperDF[ComplementNB], native=ComplementNB):
244 | """Stub for DF wrapper of class ``ComplementNB``"""
245 |
246 |
247 | class BernoulliNBDF(PartialFitClassifierWrapperDF[BernoulliNB], native=BernoulliNB):
248 | """Stub for DF wrapper of class ``BernoulliNB``"""
249 |
250 |
251 | #
252 | # calibration
253 | #
254 |
255 |
256 | class CalibratedClassifierCVDF(
257 | MetaClassifierWrapperDF[CalibratedClassifierCV], native=CalibratedClassifierCV
258 | ):
259 | """Stub for DF wrapper of class ``CalibratedClassifierCV``"""
260 |
261 |
262 | #
263 | # SVM
264 | #
265 |
266 |
267 | class SVCDF(ClassifierWrapperDF[SVC], native=SVC):
268 | """Stub for DF wrapper of class ``SVC``"""
269 |
270 |
271 | class NuSVCDF(ClassifierWrapperDF[NuSVC], native=NuSVC):
272 | """Stub for DF wrapper of class ``NuSVC``"""
273 |
274 |
275 | class LinearSVCDF(ClassifierWrapperDF[LinearSVC], native=LinearSVC):
276 | """Stub for DF wrapper of class ``LinearSVC``"""
277 |
278 |
279 | #
280 | # gaussian process
281 | #
282 |
283 |
284 | class GaussianProcessClassifierDF(
285 | ClassifierWrapperDF[GaussianProcessClassifier], native=GaussianProcessClassifier
286 | ):
287 | """Stub for DF wrapper of class ``GaussianProcessClassifier``"""
288 |
289 |
290 | #
291 | # linear model
292 | #
293 |
294 |
295 | class LogisticRegressionDF(
296 | ClassifierWrapperDF[LogisticRegression], native=LogisticRegression
297 | ):
298 | """Stub for DF wrapper of class ``LogisticRegression``"""
299 |
300 |
301 | class LogisticRegressionCVDF(
302 | ClassifierWrapperDF[LogisticRegressionCV], native=LogisticRegressionCV
303 | ):
304 | """Stub for DF wrapper of class ``LogisticRegressionCV``"""
305 |
306 |
307 | class PassiveAggressiveClassifierDF(
308 | PartialFitClassifierWrapperDF[PassiveAggressiveClassifier],
309 | native=PassiveAggressiveClassifier,
310 | ):
311 | """Stub for DF wrapper of class ``PassiveAggressiveClassifier``"""
312 |
313 |
314 | class PerceptronDF(PartialFitClassifierWrapperDF[Perceptron], native=Perceptron):
315 | """Stub for DF wrapper of class ``Perceptron``"""
316 |
317 |
318 | class SGDClassifierDF(
319 | PartialFitClassifierWrapperDF[SGDClassifier], native=SGDClassifier
320 | ):
321 | """Stub for DF wrapper of class ``SGDClassifier``"""
322 |
323 |
324 | class RidgeClassifierDF(ClassifierWrapperDF[RidgeClassifier], native=RidgeClassifier):
325 | """Stub for DF wrapper of class ``RidgeClassifier``"""
326 |
327 |
328 | class RidgeClassifierCVDF(
329 | ClassifierWrapperDF[RidgeClassifierCV], native=RidgeClassifierCV
330 | ):
331 | """Stub for DF wrapper of class ``RidgeClassifierCV``"""
332 |
333 |
334 | #
335 | # semi-supervised
336 | #
337 |
338 |
339 | class LabelPropagationDF(
340 | ClassifierWrapperDF[LabelPropagation], native=LabelPropagation
341 | ):
342 | """Stub for DF wrapper of class ``LabelPropagation``"""
343 |
344 |
345 | class LabelSpreadingDF(ClassifierWrapperDF[LabelSpreading], native=LabelSpreading):
346 | """Stub for DF wrapper of class ``LabelSpreading``"""
347 |
348 |
349 | #
350 | # multi-class
351 | #
352 |
353 |
354 | class OneVsRestClassifierDF(
355 | MetaClassifierWrapperDF[OneVsRestClassifier], native=OneVsRestClassifier
356 | ):
357 | """Stub for DF wrapper of class ``OneVsRestClassifier``"""
358 |
359 |
360 | class OneVsOneClassifierDF(
361 | ClassifierWrapperDF[OneVsOneClassifier],
362 | MetaEstimatorWrapperDF[OneVsOneClassifier],
363 | native=OneVsOneClassifier,
364 | ):
365 | """Stub for DF wrapper of class ``OneVsOneClassifier``"""
366 |
367 |
368 | class OutputCodeClassifierDF(
369 | ClassifierWrapperDF[OutputCodeClassifier],
370 | MetaEstimatorWrapperDF[OutputCodeClassifier],
371 | native=OutputCodeClassifier,
372 | ):
373 | """Stub for DF wrapper of class ``OutputCodeClassifier``"""
374 |
375 |
376 | #
377 | # multi-output
378 | #
379 |
380 |
381 | class MultiOutputClassifierDF(
382 | MultiOutputClassifierWrapperDF, native=MultiOutputClassifier
383 | ):
384 | """Stub for DF wrapper of class ``MultiOutputClassifier``"""
385 |
386 |
387 | #
388 | # chaining
389 | #
390 |
391 |
392 | class ClassifierChainDF(ClassifierChainWrapperDF, native=ClassifierChain):
393 | """Stub for DF wrapper of class ``ClassifierChain``"""
394 |
395 |
396 | #
397 | # neural network
398 | #
399 |
400 |
401 | class MLPClassifierDF(
402 | PartialFitClassifierWrapperDF[MLPClassifier], native=MLPClassifier
403 | ):
404 | """Stub for DF wrapper of class ``MLPClassifier``"""
405 |
406 |
407 | #
408 | # validate __all__
409 | #
410 |
411 | __tracker.validate()
412 |
413 |
414 | #
415 | # validate that __all__ comprises all symbols ending in "DF", and no others
416 | #
417 |
418 | __estimators = {
419 | sym
420 | for sym in dir()
421 | if sym.endswith("DF")
422 | and sym not in __imported_estimators
423 | and not sym.startswith("_")
424 | }
425 | if __estimators != set(__all__):
426 | raise RuntimeError(
427 | "__all__ does not contain exactly all DF estimators; expected value is:\n"
428 | f"{__estimators}"
429 | )
430 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/_classification_v0_22.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional implementation of :mod:`sklearndf.classification` loaded
3 | from sklearn 0.22 onwards
4 | """
5 | import logging
6 |
7 | from sklearn.ensemble import StackingClassifier
8 | from sklearn.naive_bayes import CategoricalNB
9 |
10 | from pytools.api import AllTracker
11 |
12 | from ..wrapper.stacking import StackingClassifierWrapperDF
13 | from .wrapper import PartialFitClassifierWrapperDF
14 |
15 | log = logging.getLogger(__name__)
16 |
17 | __all__ = ["CategoricalNBDF", "StackingClassifierDF"]
18 |
19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
20 |
21 |
22 | #
23 | # Ensure all symbols introduced below are included in __all__
24 | #
25 |
26 | __tracker = AllTracker(globals())
27 |
28 |
29 | #
30 | # Class definitions
31 | #
32 |
33 |
34 | #
35 | # naive bayes
36 | #
37 |
38 |
39 | class CategoricalNBDF(
40 | PartialFitClassifierWrapperDF[CategoricalNB], native=CategoricalNB
41 | ):
42 | """Stub for DF wrapper of class ``CategoricalNB``"""
43 |
44 |
45 | class StackingClassifierDF(
46 | StackingClassifierWrapperDF[StackingClassifier], native=StackingClassifier
47 | ):
48 | """Stub for DF wrapper of class ``StackingClassifier``"""
49 |
50 |
51 | #
52 | # validate __all__
53 | #
54 |
55 | __tracker.validate()
56 |
57 |
58 | #
59 | # validate that __all__ comprises all symbols ending in "DF", and no others
60 | #
61 |
62 | __estimators = {
63 | sym
64 | for sym in dir()
65 | if sym.endswith("DF")
66 | and sym not in __imported_estimators
67 | and not sym.startswith("_")
68 | }
69 | if __estimators != set(__all__):
70 | raise RuntimeError(
71 | "__all__ does not contain exactly all DF estimators; expected value is:\n"
72 | f"{__estimators}"
73 | )
74 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/_classification_v0_23.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional implementation of :mod:`sklearndf.classification` loaded
3 | from sklearn 0.23 onwards
4 | """
5 |
6 | import logging
7 | from typing import List
8 |
9 | from pytools.api import AllTracker
10 |
11 | log = logging.getLogger(__name__)
12 |
13 | __all__: List[str] = []
14 |
15 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
16 |
17 |
18 | #
19 | # Ensure all symbols introduced below are included in __all__
20 | #
21 |
22 | __tracker = AllTracker(globals())
23 |
24 |
25 | #
26 | # Class definitions
27 | #
28 |
29 |
30 | # todo: add classification implementations for sklearn 0.23
31 |
32 |
33 | __tracker.validate()
34 |
35 | #
36 | # validate that __all__ comprises all symbols ending in "DF", and no others
37 | #
38 |
39 | __estimators = {
40 | sym
41 | for sym in dir()
42 | if sym.endswith("DF")
43 | and sym not in __imported_estimators
44 | and not sym.startswith("_")
45 | }
46 | if __estimators != set(__all__):
47 | raise RuntimeError(
48 | "__all__ does not contain exactly all DF estimators; expected value is:\n"
49 | f"{__estimators}"
50 | )
51 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/_classification_v1_0.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional implementation of :mod:`sklearndf.classification` loaded
3 | from sklearn 1.0 onwards
4 | """
5 | import logging
6 |
7 | from sklearn.ensemble import HistGradientBoostingClassifier
8 |
9 | from pytools.api import AllTracker
10 |
11 | from ..wrapper import ClassifierWrapperDF
12 |
13 | log = logging.getLogger(__name__)
14 |
15 | __all__ = ["HistGradientBoostingClassifierDF"]
16 |
17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
18 |
19 |
20 | #
21 | # Ensure all symbols introduced below are included in __all__
22 | #
23 |
24 | __tracker = AllTracker(globals())
25 |
26 |
27 | #
28 | # ensemble
29 | #
30 |
31 |
32 | class HistGradientBoostingClassifierDF(
33 | ClassifierWrapperDF[HistGradientBoostingClassifier],
34 | native=HistGradientBoostingClassifier,
35 | ):
36 | """Stub for DF wrapper of class ``HistGradientBoostingClassifier``"""
37 |
38 |
39 | #
40 | # validate __all__
41 | #
42 |
43 | __tracker.validate()
44 |
45 |
46 | #
47 | # validate that __all__ comprises all symbols ending in "DF", and no others
48 | #
49 |
50 | __estimators = {
51 | sym
52 | for sym in dir()
53 | if sym.endswith("DF")
54 | and sym not in __imported_estimators
55 | and not sym.startswith("_")
56 | }
57 | if __estimators != set(__all__):
58 | raise RuntimeError(
59 | "__all__ does not contain exactly all DF estimators; expected value is:\n"
60 | f"{__estimators}"
61 | )
62 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/extra/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional 3rd party classifiers that implement the `scikit-learn` interface.
3 |
4 | Note that 3rd party packages implementing the associated native estimators must be
5 | installed explicitly: they are not included in `sklearndf`'s package requirements to
6 | achieve a lean package footprint for default installs of `sklearndf`.
7 | """
8 | from ._extra import *
9 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/extra/_extra.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.classification.extra`
3 | """
4 | import logging
5 |
6 | from sklearn.base import ClassifierMixin
7 |
8 | from pytools.api import AllTracker
9 |
10 | from ...wrapper import ClassifierWrapperDF, MissingEstimator
11 |
12 | log = logging.getLogger(__name__)
13 |
14 | __all__ = ["LGBMClassifierDF", "XGBClassifierDF"]
15 |
16 | try:
17 | # import lightgbm classes only if installed
18 | from lightgbm.sklearn import LGBMClassifier
19 | except ImportError:
20 |
21 | class LGBMClassifier( # type: ignore
22 | MissingEstimator,
23 | ClassifierMixin, # type: ignore
24 | ):
25 | """Mock-up for missing estimator."""
26 |
27 |
28 | try:
29 | # import xgboost classes only if installed
30 | from xgboost import XGBClassifier
31 | except ImportError:
32 |
33 | class XGBClassifier( # type: ignore
34 | MissingEstimator,
35 | ClassifierMixin, # type: ignore
36 | ):
37 | """Mock-up for missing estimator."""
38 |
39 |
40 | #
41 | # Ensure all symbols introduced below are included in __all__
42 | #
43 |
44 | __tracker = AllTracker(globals())
45 |
46 |
47 | #
48 | # Class definitions
49 | #
50 |
51 |
52 | class LGBMClassifierDF(ClassifierWrapperDF[LGBMClassifier], native=LGBMClassifier):
53 | """Stub for DF wrapper of class ``LGBMClassifierDF``"""
54 |
55 |
56 | class XGBClassifierDF(ClassifierWrapperDF[XGBClassifier], native=XGBClassifier):
57 | """Stub for DF wrapper of class ``XGBClassifierDF``"""
58 |
59 |
60 | #
61 | # validate __all__
62 | #
63 |
64 | __tracker.validate()
65 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper classes for `scikit-learn` classifiers, providing enhanced support for data
3 | frames.
4 | """
5 |
6 | from ._wrapper import *
7 |
--------------------------------------------------------------------------------
/src/sklearndf/classification/wrapper/_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.classification.wrapper`
3 | """
4 |
5 | import logging
6 | from abc import ABCMeta
7 | from typing import Any, Generic, List, Optional, Sequence, TypeVar, Union, cast
8 |
9 | import numpy.typing as npt
10 | import pandas as pd
11 | from sklearn.base import ClassifierMixin
12 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
13 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
14 |
15 | from pytools.api import AllTracker
16 |
17 | from ...transformation.wrapper import NComponentsDimensionalityReductionWrapperDF
18 | from ...wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF
19 |
20 | log = logging.getLogger(__name__)
21 |
22 | __all__ = [
23 | "ClassifierChainWrapperDF",
24 | "LinearDiscriminantAnalysisWrapperDF",
25 | "MetaClassifierWrapperDF",
26 | "MultiOutputClassifierWrapperDF",
27 | "PartialFitClassifierWrapperDF",
28 | ]
29 |
30 | #
31 | # Type variables
32 | #
33 |
34 | T_PartialFitClassifierWrapperDF = TypeVar(
35 | "T_PartialFitClassifierWrapperDF",
36 | bound="PartialFitClassifierWrapperDF[ClassifierMixin]",
37 | )
38 | T_NativeClassifier = TypeVar("T_NativeClassifier", bound=ClassifierMixin)
39 |
40 |
41 | #
42 | # Ensure all symbols introduced below are included in __all__
43 | #
44 |
45 | __tracker = AllTracker(globals())
46 |
47 |
48 | #
49 | # Wrapper classes
50 | #
51 |
52 |
53 | class LinearDiscriminantAnalysisWrapperDF(
54 | ClassifierWrapperDF[LinearDiscriminantAnalysis],
55 | NComponentsDimensionalityReductionWrapperDF[LinearDiscriminantAnalysis],
56 | metaclass=ABCMeta,
57 | ):
58 | """
59 | DF wrapper for
60 | :class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`.
61 | """
62 |
63 | pass
64 |
65 |
66 | class MetaClassifierWrapperDF(
67 | ClassifierWrapperDF[T_NativeClassifier],
68 | MetaEstimatorWrapperDF[T_NativeClassifier],
69 | Generic[T_NativeClassifier],
70 | metaclass=ABCMeta,
71 | ):
72 | """
73 | Abstract base class of DF wrappers for classifiers implementing
74 | :class:`sklearn.base.MetaEstimatorMixin`.
75 | """
76 |
77 | pass
78 |
79 |
80 | class PartialFitClassifierWrapperDF(
81 | ClassifierWrapperDF[T_NativeClassifier],
82 | Generic[T_NativeClassifier],
83 | metaclass=ABCMeta,
84 | ):
85 | """
86 | Abstract base class of DF wrappers for classifiers implementing
87 | method ``partial_fit()``.
88 | """
89 |
90 | # noinspection PyPep8Naming
91 | def partial_fit(
92 | self: T_PartialFitClassifierWrapperDF,
93 | X: Union[pd.Series, pd.DataFrame],
94 | y: Union[pd.Series, pd.DataFrame],
95 | classes: Optional[Sequence[Any]] = None,
96 | sample_weight: Optional[pd.Series] = None,
97 | ) -> T_PartialFitClassifierWrapperDF:
98 | """
99 | Perform incremental fit on a batch of samples.
100 |
101 | This method is meant to be called multiple times, each time with a subset of
102 | the training data, e.g., when the full data set does not fit into memory. It
103 | can also be used for online learning.
104 |
105 | :param X: data frame with observations as rows and features as columns
106 | :param y: a series or data frame with one or more outputs per observation
107 | :param classes: all classes present across all calls to ``partial_fit``;
108 | only required for the first call of this method
109 | :param sample_weight: optional weights applied to individual samples
110 | :return: ``self``
111 | """
112 | X, y = self._validate_parameter_types(X, y)
113 | self._partial_fit(X, y, classes=classes, sample_weight=sample_weight)
114 |
115 | return self
116 |
117 | # noinspection PyPep8Naming
118 | def _partial_fit(
119 | self: T_PartialFitClassifierWrapperDF,
120 | X: pd.DataFrame,
121 | y: Union[pd.Series, pd.DataFrame],
122 | **partial_fit_params: Optional[Any],
123 | ) -> T_PartialFitClassifierWrapperDF:
124 | return cast(
125 | T_PartialFitClassifierWrapperDF,
126 | self._native_estimator.partial_fit(
127 | self._prepare_X_for_delegate(X),
128 | self._prepare_y_for_delegate(y),
129 | **{
130 | arg: value
131 | for arg, value in partial_fit_params.items()
132 | if value is not None
133 | },
134 | ),
135 | )
136 |
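# Illustrative sketch (not part of the original module): incremental training
# with a DF classifier that implements ``partial_fit()``; the batches and class
# labels are made up for the example.
#
#     from sklearndf.classification import SGDClassifierDF
#
#     clf = SGDClassifierDF(random_state=42)
#     for X_batch, y_batch in batches:  # an iterable of (DataFrame, Series) pairs
#         # ``classes`` is required on the first call only
#         clf.partial_fit(X_batch, y_batch, classes=[0, 1])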
137 |
138 | class MultiOutputClassifierWrapperDF(
139 | MetaClassifierWrapperDF[MultiOutputClassifier],
140 | PartialFitClassifierWrapperDF[MultiOutputClassifier],
141 | metaclass=ABCMeta,
142 | ):
143 | """
144 | DF wrapper for :class:`sklearn.multioutput.MultiOutputClassifier`.
145 | """
146 |
147 | # noinspection PyPep8Naming
148 | def _prediction_with_class_labels(
149 | self,
150 | X: pd.DataFrame,
151 | prediction: Union[
152 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any]
153 | ],
154 | classes: Optional[Sequence[Any]] = None,
155 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]:
156 | # if we have a multi-output classifier, prediction of probabilities
157 | # yields a list of NumPy arrays
158 | if not isinstance(prediction, list):
159 | raise ValueError(
160 | "prediction of multi-output classifier expected to be a list of NumPy "
161 | f"arrays, but got type {type(prediction)}"
162 | )
163 |
164 | delegate_estimator = self.native_estimator
165 |
166 | # store the super() object as this is not available within a generator
167 | _super = cast(ClassifierWrapperDF[MultiOutputClassifier], super())
168 |
169 | # The delegate estimator usually provides an ``estimators_`` attribute
170 | # (defined by abstract class MultiOutputEstimator), listing the estimators
171 | # used to predict each output. If present, use these estimators to get the
172 | # individual class labels for each output; otherwise we cannot assign class
173 | # labels.
174 | estimators = getattr(delegate_estimator, "estimators_", None)
175 | if estimators is None:
176 | return [
177 | _super._prediction_with_class_labels(X=X, prediction=output)
178 | for output in prediction
179 | ]
180 | else:
181 | return [
182 | _super._prediction_with_class_labels(
183 | X=X, prediction=output, classes=getattr(estimator, "classes_", None)
184 | )
185 | for estimator, output in zip(estimators, prediction)
186 | ]
187 |
188 |
189 | class ClassifierChainWrapperDF(
190 | MetaEstimatorWrapperDF[ClassifierChain],
191 | ClassifierWrapperDF[ClassifierChain],
192 | metaclass=ABCMeta,
193 | ):
194 | """
195 | DF wrapper for :class:`sklearn.multioutput.ClassifierChain`.
196 | """
197 |
198 | # noinspection PyPep8Naming
199 | def _prediction_with_class_labels(
200 | self,
201 | X: pd.DataFrame,
202 | prediction: Union[
203 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any]
204 | ],
205 | classes: Optional[Sequence[Any]] = None,
206 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]:
207 | # todo: infer actual class names
208 | return super()._prediction_with_class_labels(
209 | X, prediction, classes=range(self.n_outputs_)
210 | )
211 |
212 |
213 | #
214 | # Validate __all__
215 | #
216 |
217 | __tracker.validate()
218 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Extended versions of `scikit-learn` clusterers with enhanced support for data
3 | frames.
4 | """
5 |
6 | from .. import __sklearn_1_1__, __sklearn_1_3__, __sklearn_version__
7 | from ._clustering import *
8 |
9 | if __sklearn_version__ >= __sklearn_1_1__:
10 | from ._clustering_v1_1 import *
11 |
12 | if __sklearn_version__ >= __sklearn_1_3__:
13 | from ._clustering_v1_3 import *
14 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/_clustering.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.clustering`
3 | """
4 | import logging
5 |
6 | from sklearn.cluster import (
7 | DBSCAN,
8 | OPTICS,
9 | AffinityPropagation,
10 | AgglomerativeClustering,
11 | Birch,
12 | FeatureAgglomeration,
13 | KMeans,
14 | MeanShift,
15 | MiniBatchKMeans,
16 | SpectralClustering,
17 | )
18 |
19 | from pytools.api import AllTracker
20 |
21 | from ..wrapper import ClusterWrapperDF
22 | from .wrapper import FeatureAgglomerationWrapperDF, KMeansBaseWrapperDF
23 |
24 | log = logging.getLogger(__name__)
25 |
26 | __all__ = [
27 | "AffinityPropagationDF",
28 | "AgglomerativeClusteringDF",
29 | "BirchDF",
30 | "DBSCANDF",
31 | "FeatureAgglomerationDF",
32 | "KMeansDF",
33 | "MeanShiftDF",
34 | "MiniBatchKMeansDF",
35 | "OPTICSDF",
36 | "SpectralClusteringDF",
37 | ]
38 |
39 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
40 |
41 |
42 | #
43 | # Ensure all symbols introduced below are included in __all__
44 | #
45 |
46 | __tracker = AllTracker(globals())
47 |
48 |
49 | #
50 | # Class definitions
51 | #
52 |
53 |
54 | class AffinityPropagationDF(
55 | ClusterWrapperDF[AffinityPropagation], native=AffinityPropagation
56 | ):
57 | """Stub for DF wrapper of class ``AffinityPropagation``"""
58 |
59 |
60 | class AgglomerativeClusteringDF(
61 | ClusterWrapperDF[AgglomerativeClustering], native=AgglomerativeClustering
62 | ):
63 | """Stub for DF wrapper of class ``AgglomerativeClustering``"""
64 |
65 |
66 | class BirchDF(ClusterWrapperDF[Birch], native=Birch):
67 | """Stub for DF wrapper of class ``Birch``"""
68 |
69 |
70 | class DBSCANDF(ClusterWrapperDF[DBSCAN], native=DBSCAN):
71 | """Stub for DF wrapper of class ``DBSCAN``"""
72 |
73 |
74 | class KMeansDF(KMeansBaseWrapperDF[KMeans], native=KMeans):
75 | """Stub for DF wrapper of class ``KMeans``"""
76 |
77 |
78 | class MiniBatchKMeansDF(KMeansBaseWrapperDF[MiniBatchKMeans], native=MiniBatchKMeans):
79 | """Stub for DF wrapper of class ``MiniBatchKMeans``"""
80 |
81 |
82 | class MeanShiftDF(ClusterWrapperDF[MeanShift], native=MeanShift):
83 | """Stub for DF wrapper of class ``MeanShift``"""
84 |
85 |
86 | class OPTICSDF(ClusterWrapperDF[OPTICS], native=OPTICS):
87 | """Stub for DF wrapper of class ``OPTICS``"""
88 |
89 |
90 | class SpectralClusteringDF(
91 | ClusterWrapperDF[SpectralClustering], native=SpectralClustering
92 | ):
93 | """Stub for DF wrapper of class ``SpectralClustering``"""
94 |
95 |
96 | class FeatureAgglomerationDF(
97 | FeatureAgglomerationWrapperDF, native=FeatureAgglomeration
98 | ):
99 | """Stub for DF wrapper of class ``FeatureAgglomeration``"""
100 |
101 |
102 | #
103 | # Validate __all__
104 | #
105 |
106 | __tracker.validate()
107 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/_clustering_v1_1.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.1 onwards
3 | """
4 | import logging
5 |
6 | from sklearn.cluster import BisectingKMeans
7 |
8 | from pytools.api import AllTracker
9 |
10 | from .wrapper import KMeansBaseWrapperDF
11 |
12 | log = logging.getLogger(__name__)
13 |
14 | __all__ = [
15 | "BisectingKMeansDF",
16 | ]
17 |
18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
19 |
20 |
21 | #
22 | # Ensure all symbols introduced below are included in __all__
23 | #
24 |
25 | __tracker = AllTracker(globals())
26 |
27 |
28 | #
29 | # Class definitions
30 | #
31 |
32 |
33 | class BisectingKMeansDF(KMeansBaseWrapperDF[BisectingKMeans], native=BisectingKMeans):
34 | """Stub for DF wrapper of class ``MiniBatchKMeans``"""
35 |
36 |
37 | #
38 | # Validate __all__
39 | #
40 |
41 | __tracker.validate()
42 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/_clustering_v1_3.py:
--------------------------------------------------------------------------------
1 | """
2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.3 onwards
3 | """
4 | import logging
5 |
6 | from sklearn.cluster import HDBSCAN
7 |
8 | from pytools.api import AllTracker
9 |
10 | from ..wrapper import ClusterWrapperDF
11 |
12 | log = logging.getLogger(__name__)
13 |
14 | __all__ = [
15 | "HDBSCANDF",
16 | ]
17 |
18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")}
19 |
20 |
21 | #
22 | # Ensure all symbols introduced below are included in __all__
23 | #
24 |
25 | __tracker = AllTracker(globals())
26 |
27 |
28 | #
29 | # Class definitions
30 | #
31 |
32 |
33 | class HDBSCANDF(ClusterWrapperDF[HDBSCAN], native=HDBSCAN):
34 | """Stub for DF wrapper of class ``DBSCAN``"""
35 |
36 |
37 | #
38 | # Validate __all__
39 | #
40 |
41 | __tracker.validate()
42 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper classes for `scikit-learn` clusterers, providing enhanced support for data
3 | frames.
4 | """
5 |
6 | from ._wrapper import *
7 |
--------------------------------------------------------------------------------
/src/sklearndf/clustering/wrapper/_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.clustering.wrapper`
3 | """
4 |
5 | import logging
6 | from abc import ABCMeta
7 | from typing import Generic, TypeVar
8 |
9 | import pandas as pd
10 | from sklearn.cluster import FeatureAgglomeration, KMeans, MiniBatchKMeans
11 |
12 | from pytools.api import AllTracker
13 | from pytools.fit import fitted_only
14 |
15 | from sklearndf.transformation.wrapper import ColumnPreservingTransformerWrapperDF
16 | from sklearndf.wrapper import ClusterWrapperDF
17 |
18 | log = logging.getLogger(__name__)
19 |
20 | __all__ = [
21 | "KMeansBaseWrapperDF",
22 | "FeatureAgglomerationWrapperDF",
23 | ]
24 |
25 | #
26 | # Type variables
27 | #
28 |
29 | T_NativeKMeans = TypeVar("T_NativeKMeans", KMeans, MiniBatchKMeans)
30 |
31 |
32 | #
33 | # Ensure all symbols introduced below are included in __all__
34 | #
35 |
36 | __tracker = AllTracker(globals())
37 |
38 |
39 | #
40 | # Wrapper classes
41 | #
42 |
43 |
44 | # noinspection PyPep8Naming
45 | class KMeansBaseWrapperDF(
46 | ClusterWrapperDF[T_NativeKMeans], Generic[T_NativeKMeans], metaclass=ABCMeta
47 | ):
48 | """
49 | DF wrapper for KMeans-like algorithms, e.g., :class:`sklearn.cluster.KMeans`.
50 | """
51 |
52 | #: the name of the index representing clusters
53 | IDX_CLUSTER = "cluster"
54 |
55 | @property
56 | @fitted_only(not_fitted_error=AttributeError)
57 | def cluster_centers_(self) -> pd.DataFrame:
58 | """
59 | The cluster centers as a data frame, with clusters as rows and feature values
60 | as columns.
61 |
62 | :raises AttributeError: the clusterer is not fitted
63 | """
64 |
65 | raw_cluster_centers = self._native_estimator.cluster_centers_
66 | return pd.DataFrame(
67 | raw_cluster_centers,
68 | columns=self.feature_names_in_,
69 | index=pd.RangeIndex(
70 | len(raw_cluster_centers), name=KMeansBaseWrapperDF.IDX_CLUSTER
71 | ),
72 | )
73 |
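# Illustrative sketch (not part of the original module): for a fitted KMeansDF,
# ``cluster_centers_`` is a data frame indexed by cluster, with feature columns.
#
#     import pandas as pd
#     from sklearndf.clustering import KMeansDF
#
#     X = pd.DataFrame({"a": [0.0, 0.2, 5.0, 5.2], "b": [0.0, 0.1, 5.0, 4.9]})
#     kmeans = KMeansDF(n_clusters=2, random_state=42).fit(X)
#     kmeans.cluster_centers_
#     # -> 2x2 data frame; index is named "cluster", columns are ["a", "b"]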
74 |
75 | class FeatureAgglomerationWrapperDF(
76 | ClusterWrapperDF[FeatureAgglomeration],
77 | ColumnPreservingTransformerWrapperDF[FeatureAgglomeration],
78 | metaclass=ABCMeta,
79 | ):
80 | """
81 | DF wrapper for FeatureAgglomeration, acting as both a clusterer and a transformer.
82 | """
83 |
84 | pass
85 |
86 |
87 | #
88 | # Validate __all__
89 | #
90 |
91 | __tracker.validate()
92 |
--------------------------------------------------------------------------------
/src/sklearndf/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Extended versions of all `scikit-learn` pipelines with enhanced support for data
3 | frames.
4 | """
5 | from ._learner_pipeline import *
6 | from ._pipeline import *
7 |
--------------------------------------------------------------------------------
/src/sklearndf/pipeline/_pipeline.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.pipeline`
3 | """
4 |
5 | import logging
6 |
7 | from sklearn.pipeline import Pipeline
8 |
9 | from pytools.api import AllTracker
10 |
11 | from .wrapper import FeatureUnionSparseFrames, FeatureUnionWrapperDF, PipelineWrapperDF
12 |
13 | log = logging.getLogger(__name__)
14 |
15 | __all__ = ["PipelineDF", "FeatureUnionDF"]
16 |
17 |
18 | #
19 | # Ensure all symbols introduced below are included in __all__
20 | #
21 |
22 | __tracker = AllTracker(globals())
23 |
24 |
25 | #
26 | # Class definitions
27 | #
28 |
29 |
30 | class PipelineDF(PipelineWrapperDF, native=Pipeline):
31 | """Stub for DF wrapper of class ``Pipeline``"""
32 |
33 |
34 | class FeatureUnionDF(FeatureUnionWrapperDF, native=FeatureUnionSparseFrames):
35 | """Stub for DF wrapper of class ``FeatureUnion``"""
36 |
37 |
38 | #
39 | # Validate __all__
40 | #
41 |
42 | __tracker.validate()
43 |
--------------------------------------------------------------------------------
/src/sklearndf/pipeline/wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper classes to enhance the functionality of native pipeline classes conforming with
3 | the `scikit-learn` API.
4 | """
5 |
6 | from ._wrapper import *
7 |
--------------------------------------------------------------------------------
/src/sklearndf/pipeline/wrapper/_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | Core implementation of :mod:`sklearndf.pipeline.wrapper`
3 | """
4 |
5 | import logging
6 | from abc import ABCMeta
7 | from typing import Any, Dict, Iterator, List, Sequence, Tuple, Union, cast
8 |
9 | import numpy.typing as npt
10 | import pandas as pd
11 | from pandas.core.arrays import ExtensionArray
12 | from scipy import sparse
13 | from sklearn.pipeline import FeatureUnion, Pipeline
14 | from sklearn.preprocessing import FunctionTransformer
15 |
16 | from pytools.api import AllTracker
17 |
18 | from ..._util import hstack_frames
19 | from sklearndf import EstimatorDF, TransformerDF
20 | from sklearndf.wrapper import (
21 | ClassifierWrapperDF,
22 | RegressorWrapperDF,
23 | TransformerWrapperDF,
24 | )
25 |
26 | log = logging.getLogger(__name__)
27 |
28 | __all__ = [
29 | "FeatureUnionSparseFrames",
30 | "FeatureUnionWrapperDF",
31 | "PipelineWrapperDF",
32 | ]
33 |
34 |
35 | #
36 | # Ensure all symbols introduced below are included in __all__
37 | #
38 |
39 | __tracker = AllTracker(globals())
40 |
41 |
42 | #
43 | # Class definitions
44 | #
45 |
46 |
47 | class PipelineWrapperDF(
48 | ClassifierWrapperDF[Pipeline],
49 | RegressorWrapperDF[Pipeline],
50 | TransformerWrapperDF[Pipeline],
51 | metaclass=ABCMeta,
52 | ):
53 | """
54 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.Pipeline`.
55 | """
56 |
57 | __native_base_class__ = Pipeline
58 |
59 | #: Placeholder that can be used in place of an estimator to designate a pipeline
60 | #: step that preserves the original ingoing data.
61 | PASSTHROUGH = "passthrough"
62 |
63 | def _validate_delegate_estimator(self) -> None:
64 | # ensure that all steps support data frames, and that all except the last
65 | # step are data frame transformers
66 |
67 | steps = self.steps
68 |
69 | if len(steps) == 0:
70 | return
71 |
72 | for name, transformer in steps[:-1]:
73 | if not (
74 | self._is_passthrough(transformer)
75 | or isinstance(transformer, TransformerDF)
76 | ):
77 | raise ValueError(
78 | f"expected step {name!r} to be a {TransformerDF.__name__}, "
79 | f"or {PipelineWrapperDF.PASSTHROUGH}, but found an instance of "
80 | f"{type(transformer).__name__}"
81 | )
82 |
83 | final_step = steps[-1]
84 | final_estimator = final_step[1]
85 | if not (
86 | self._is_passthrough(final_estimator)
87 | or isinstance(final_estimator, EstimatorDF)
88 | ):
89 | raise ValueError(
90 | f"expected final step {final_step[0]!r} to be an "
91 | f"{EstimatorDF.__name__} or {PipelineWrapperDF.PASSTHROUGH}, "
92 | f"but found an instance of {type(final_estimator).__name__}"
93 | )
94 |
95 | @property
96 | def steps(self) -> List[Tuple[str, EstimatorDF]]:
97 | """
98 | The ``steps`` attribute of the underlying :class:`~sklearn.pipeline.Pipeline`.
99 |
100 | List of (name, transformer) tuples (transformers implement fit/transform).
101 | """
102 | return cast(List[Tuple[str, EstimatorDF]], self.native_estimator.steps)
103 |
104 | def __len__(self) -> int:
105 | """The number of steps of the pipeline."""
106 | return len(self.native_estimator.steps)
107 |
108 | def __getitem__(self, ind: Union[slice, int, str]) -> EstimatorDF:
109 | """
110 | Return a sub-pipeline or a single estimator in the pipeline
111 |
112 | Indexing with an integer will return an estimator; using a slice
113 | returns another Pipeline instance which copies a slice of this
114 | Pipeline. This copy is shallow: modifying (or fitting) estimators in
115 | the sub-pipeline will affect the larger pipeline and vice-versa.
116 | However, replacing a value in ``steps`` will not change a copy.
117 | """
118 |
119 | if isinstance(ind, slice):
120 | base_pipeline = self.native_estimator
121 | if ind.step not in (1, None):
122 | raise ValueError("Pipeline slicing only supports a step of 1")
123 |
124 | return cast(
125 | EstimatorDF,
126 | self.__class__(
127 | steps=base_pipeline.steps[ind],
128 | memory=base_pipeline.memory,
129 | verbose=base_pipeline.verbose,
130 | ),
131 | )
132 | else:
133 | return cast(EstimatorDF, self.native_estimator[ind])
134 |
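    # Illustrative sketch (not part of the original class), assuming a PipelineDF
    # with steps [("impute", ...), ("scale", ...), ("rf", ...)]:
    #
    #     pipeline_df[0]     # -> the "impute" transformer
    #     pipeline_df["rf"]  # -> the final estimator, looked up by name
    #     pipeline_df[:2]    # -> new PipelineDF with the two preprocessing steps
    #     pipeline_df[::2]   # -> ValueError: only slices with step 1 are supported
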
135 | @staticmethod
136 | def _is_passthrough(estimator: Union[EstimatorDF, str, None]) -> bool:
137 | # return True if the estimator is a "passthrough" (i.e. identity) transformer
138 | # in the pipeline
139 | return estimator is None or estimator == PipelineWrapperDF.PASSTHROUGH
140 |
141 | def _transformer_steps(self) -> Iterator[Tuple[str, TransformerDF]]:
142 | # make an iterator of all transform steps, i.e., excluding the final step
143 | # in case it is not a transformer
144 | # excludes steps whose transformer is ``None`` or ``"passthrough"``
145 |
146 | def _iter_not_none(
147 | transformer_steps: Sequence[Tuple[str, EstimatorDF]]
148 | ) -> Iterator[Tuple[str, TransformerDF]]:
149 | return (
150 | (name, cast(TransformerDF, transformer))
151 | for name, transformer in transformer_steps
152 | if not self._is_passthrough(transformer)
153 | )
154 |
155 | steps = self.steps
156 |
157 | if len(steps) == 0:
158 | return iter([])
159 |
160 | final_estimator = steps[-1][1]
161 |
162 | if isinstance(final_estimator, TransformerDF):
163 | return _iter_not_none(steps)
164 | else:
165 | return _iter_not_none(steps[:-1])
166 |
167 | def _get_features_original(self) -> pd.Series:
168 | col_mappings = [
169 | df_transformer.feature_names_original_
170 | for _, df_transformer in self._transformer_steps()
171 | ]
172 |
173 | _features_out: pd.Index
174 | _features_original: Union[npt.NDArray[Any], ExtensionArray]
175 |
176 | if len(col_mappings) == 0:
177 | _features_out = self.feature_names_in_
178 | _features_original = _features_out.values
179 | else:
180 | _features_out = col_mappings[-1].index
181 | _features_original = col_mappings[-1].values
182 |
183 | # iterate backwards starting from the penultimate item
184 | for preceding_out_to_original_mapping in col_mappings[-2::-1]:
185 | # join the original columns of my current transformer on the out columns
186 | # in the preceding transformer, then repeat
187 | if not all(
188 | feature in preceding_out_to_original_mapping
189 | for feature in _features_original
190 | ):
191 | unknown_features = set(_features_original) - set(
192 | preceding_out_to_original_mapping
193 | )
194 | raise KeyError(
195 | f"unknown features encountered while tracing original "
196 | f"features along pipeline: {unknown_features}"
197 | )
198 | _features_original = preceding_out_to_original_mapping.loc[
199 | _features_original
200 | ].values
201 |
202 | return pd.Series(index=_features_out, data=_features_original)
203 |
204 | def _get_features_out(self) -> pd.Index:
205 | for _, transformer in reversed(self.steps):
206 | if isinstance(transformer, TransformerDF):
207 | return transformer.feature_names_out_
208 |
209 | return self.feature_names_in_
210 |
211 | @property
212 | def _estimator_type(self) -> str:
213 | # noinspection PyProtectedMember
214 | return cast(str, self.native_estimator._estimator_type)
215 |
216 | def _more_tags(self) -> Dict[str, Any]:
217 | return cast(
218 | Dict[str, Any], getattr(self.native_estimator, "_more_tags", lambda: {})()
219 | )
220 |
221 |
222 | class FeatureUnionSparseFrames(
223 | FeatureUnion, # type:ignore
224 | ):
225 | """
226 | FeatureUnion transformer that stacks its transformers' outputs into a single,
227 | possibly sparse, data frame when all outputs are data frames, rather than arrays.
228 | """
229 |
230 | # noinspection PyPep8Naming
231 | def _hstack(
232 | self, Xs: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]]
233 | ) -> Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]:
234 | stacked_frames = hstack_frames(
235 | Xs, prefixes=[name for name, _ in self.transformer_list]
236 | )
237 | if stacked_frames is None:
238 | return super()._hstack(Xs)
239 | else:
240 | return stacked_frames
241 |
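# Illustrative sketch (not part of the original module): via the DF wrapper
# below, a feature union prefixes each transformer's output columns with the
# transformer's name, mirroring ``hstack_frames``.
#
#     import pandas as pd
#     from sklearndf.pipeline import FeatureUnionDF
#     from sklearndf.transformation import SimpleImputerDF, StandardScalerDF
#
#     union = FeatureUnionDF(
#         transformer_list=[
#             ("impute", SimpleImputerDF()),
#             ("scale", StandardScalerDF()),
#         ]
#     )
#     X = pd.DataFrame({"x": [1.0, None, 3.0]})
#     union.fit_transform(X).columns.tolist()
#     # -> ["impute__x", "scale__x"]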
242 |
243 | class FeatureUnionWrapperDF(
244 | TransformerWrapperDF[FeatureUnionSparseFrames], metaclass=ABCMeta
245 | ):
246 | """
247 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.FeatureUnion`.
248 | """
249 |
250 | DROP = "drop"
251 | PASSTHROUGH = "passthrough"
252 |
253 | __native_base_class__ = FeatureUnionSparseFrames
254 |
255 | @staticmethod
256 | def _prepend_features_out(features_out: pd.Index, name_prefix: str) -> pd.Index:
257 | return pd.Index(data=f"{name_prefix}__" + features_out.astype(str))
258 |
259 | def _get_features_original(self) -> pd.Series:
260 | # concatenate the output-to-input mappings of all constituent transformers,
261 | # skipping transformers given as ``None``, ``"drop"``, or any other string
262 |
263 | # prepend the name of the transformer so the resulting feature name is
264 | # `__