├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .idea ├── gamma-sklearndf.iml └── sklearndf.iml ├── .pre-commit-config.yaml ├── LICENSE ├── README.rst ├── RELEASE_NOTES.rst ├── azure-pipelines.yml ├── condabuild └── meta.yaml ├── config ├── spelling.dic └── test_config.yml ├── dev-setup.sh ├── environment.yml ├── make.py ├── mypy.ini ├── pypi_description.rst ├── pyproject.toml ├── sphinx ├── .gitignore ├── auxiliary │ └── Titanic_getting_started_example.ipynb ├── make.py └── source │ ├── _images │ ├── gamma_sklearndf_logo.png │ ├── sklearndf-class-hierarchy.graffle │ │ └── data.plist │ ├── sklearndf-class-hierarchy.svg │ └── sklearndf_logo.png │ ├── api_landing.rst │ ├── conf.py │ ├── contribution_guide.rst │ ├── faqs.rst │ ├── index.rst │ ├── tutorial │ └── sklearndf_tutorial.ipynb │ └── tutorials.rst ├── src └── sklearndf │ ├── __init__.py │ ├── _sklearn_version.py │ ├── _sklearndf.py │ ├── _util.py │ ├── classification │ ├── __init__.py │ ├── _classification.py │ ├── _classification_v0_22.py │ ├── _classification_v0_23.py │ ├── _classification_v1_0.py │ ├── extra │ │ ├── __init__.py │ │ └── _extra.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── clustering │ ├── __init__.py │ ├── _clustering.py │ ├── _clustering_v1_1.py │ ├── _clustering_v1_3.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── pipeline │ ├── __init__.py │ ├── _learner_pipeline.py │ ├── _pipeline.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── py.typed │ ├── regression │ ├── __init__.py │ ├── _regression.py │ ├── _regression_v0_22.py │ ├── _regression_v0_23.py │ ├── _regression_v1_0.py │ ├── extra │ │ ├── __init__.py │ │ └── _extra.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── transformation │ ├── __init__.py │ ├── _transformation.py │ ├── _transformation_v0_22.py │ ├── _transformation_v0_24.py │ ├── _transformation_v1_0.py │ ├── _transformation_v1_1.py │ ├── _transformation_v1_3.py │ ├── extra │ │ ├── __init__.py │ │ ├── _extra.py │ │ └── wrapper │ │ │ ├── __init__.py │ │ │ └── _wrapper.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ └── wrapper │ ├── __init__.py │ ├── _missing.py │ ├── _wrapper.py │ ├── numpy │ ├── __init__.py │ └── _numpy.py │ └── stacking │ ├── __init__.py │ └── _stacking.py ├── test └── test │ ├── __init__.py │ ├── conftest.py │ ├── paths.py │ ├── sklearndf │ ├── __init__.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── test_classification_pipeline_df.py │ │ ├── test_clustering_pipeline.py │ │ ├── test_pipeline_df.py │ │ └── test_regression_pipeline_df.py │ ├── test_base.py │ ├── test_classification.py │ ├── test_clustering.py │ ├── test_meta_estimators.py │ ├── test_missing.py │ ├── test_regression.py │ ├── test_sklearn_coverage.py │ └── transformation │ │ ├── __init__.py │ │ ├── test_extra.py │ │ ├── test_imputers.py │ │ ├── test_sparse.py │ │ └── test_transformation.py │ └── test_docs.py ├── tmp └── README.md └── tox.ini /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. 
See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | ### JetBrains template 108 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 109 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 110 | 111 | # User-specific stuff 112 | .idea/**/workspace.xml 113 | .idea/**/tasks.xml 114 | .idea/**/dictionaries 115 | .idea/**/shelf 116 | 117 | # Sensitive or high-churn files 118 | .idea/**/dataSources/ 119 | .idea/**/dataSources.ids 120 | .idea/**/dataSources.local.xml 121 | .idea/**/sqlDataSources.xml 122 | .idea/**/dynamic.xml 123 | .idea/**/uiDesigner.xml 124 | .idea/**/dbnavigator.xml 125 | 126 | # Gradle 127 | .idea/**/gradle.xml 128 | .idea/**/libraries 129 | 130 | # CMake 131 | cmake-build-debug/ 132 | cmake-build-release/ 133 | 134 | # Mongo Explorer plugin 135 | .idea/**/mongoSettings.xml 136 | 137 | # File-based project format 138 | *.iws 139 | 140 | # IntelliJ 141 | out/ 142 | 143 | # mpeltonen/sbt-idea plugin 144 | .idea_modules/ 145 | 146 | # JIRA plugin 147 | atlassian-ide-plugin.xml 148 | 149 | # Cursive Clojure plugin 150 | .idea/replstate.xml 151 | 152 | # Crashlytics plugin (for Android Studio and IntelliJ) 153 | com_crashlytics_export_strings.xml 154 | crashlytics.properties 155 | crashlytics-build.properties 156 | fabric.properties 157 | 158 | # Editor-based Rest Client 159 | .idea/httpRequests 160 | ### TeX template 161 | ## Core latex/pdflatex auxiliary files: 162 | *.aux 163 | *.lof 164 | *.lot 165 | *.fls 166 | *.out 167 | *.toc 168 | *.fmt 169 | *.fot 170 | *.cb 171 | *.cb2 172 | .*.lb 173 | 174 | ## Intermediate documents: 175 | *.dvi 176 | *.xdv 177 | *-converted-to.* 178 | # these rules might exclude image files for figures etc. 
179 | # *.ps 180 | # *.eps 181 | # *.pdf 182 | 183 | ## Generated if empty string is given at "Please type another file name for output:" 184 | .pdf 185 | 186 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 187 | *.bbl 188 | *.bcf 189 | *.blg 190 | *-blx.aux 191 | *-blx.bib 192 | *.run.xml 193 | 194 | ## Build tool auxiliary files: 195 | *.fdb_latexmk 196 | *.synctex 197 | *.synctex(busy) 198 | *.synctex.gz 199 | *.synctex.gz(busy) 200 | *.pdfsync 201 | 202 | ## Build tool directories for auxiliary files 203 | # latexrun 204 | latex.out/ 205 | 206 | ## Auxiliary and intermediate files from other packages: 207 | # algorithms 208 | *.alg 209 | *.loa 210 | 211 | # achemso 212 | acs-*.bib 213 | 214 | # amsthm 215 | *.thm 216 | 217 | # beamer 218 | *.nav 219 | *.pre 220 | *.snm 221 | *.vrb 222 | 223 | # changes 224 | *.soc 225 | 226 | # cprotect 227 | *.cpt 228 | 229 | # elsarticle (documentclass of Elsevier journals) 230 | *.spl 231 | 232 | # endnotes 233 | *.ent 234 | 235 | # fixme 236 | *.lox 237 | 238 | # feynmf/feynmp 239 | *.mf 240 | *.mp 241 | *.t[1-9] 242 | *.t[1-9][0-9] 243 | *.tfm 244 | 245 | #(r)(e)ledmac/(r)(e)ledpar 246 | *.end 247 | *.?end 248 | *.[1-9] 249 | *.[1-9][0-9] 250 | *.[1-9][0-9][0-9] 251 | *.[1-9]R 252 | *.[1-9][0-9]R 253 | *.[1-9][0-9][0-9]R 254 | *.eledsec[1-9] 255 | *.eledsec[1-9]R 256 | *.eledsec[1-9][0-9] 257 | *.eledsec[1-9][0-9]R 258 | *.eledsec[1-9][0-9][0-9] 259 | *.eledsec[1-9][0-9][0-9]R 260 | 261 | # glossaries 262 | *.acn 263 | *.acr 264 | *.glg 265 | *.glo 266 | *.gls 267 | *.glsdefs 268 | 269 | # gnuplottex 270 | *-gnuplottex-* 271 | 272 | # gregoriotex 273 | *.gaux 274 | *.gtex 275 | 276 | # htlatex 277 | *.4ct 278 | *.4tc 279 | *.idv 280 | *.lg 281 | *.trc 282 | *.xref 283 | 284 | # hyperref 285 | *.brf 286 | 287 | # knitr 288 | *-concordance.tex 289 | # TODO Comment the next line if you want to keep your tikz graphics files 290 | *.tikz 291 | *-tikzDictionary 292 | 293 | # listings 294 | *.lol 295 | 296 | # makeidx 297 | *.idx 298 | *.ilg 299 | *.ind 300 | *.ist 301 | 302 | # minitoc 303 | *.maf 304 | *.mlf 305 | *.mlt 306 | *.mtc[0-9]* 307 | *.slf[0-9]* 308 | *.slt[0-9]* 309 | *.stc[0-9]* 310 | 311 | # minted 312 | _minted* 313 | *.pyg 314 | 315 | # morewrites 316 | *.mw 317 | 318 | # nomencl 319 | *.nlg 320 | *.nlo 321 | *.nls 322 | 323 | # pax 324 | *.pax 325 | 326 | # pdfpcnotes 327 | *.pdfpc 328 | 329 | # sagetex 330 | *.sagetex.sage 331 | *.sagetex.py 332 | *.sagetex.scmd 333 | 334 | # scrwfile 335 | *.wrt 336 | 337 | # sympy 338 | *.sout 339 | *.sympy 340 | sympy-plots-for-*.tex/ 341 | 342 | # pdfcomment 343 | *.upa 344 | *.upb 345 | 346 | # pythontex 347 | *.pytxcode 348 | pythontex-files-*/ 349 | 350 | # thmtools 351 | *.loe 352 | 353 | # TikZ & PGF 354 | *.dpth 355 | *.md5 356 | *.auxlock 357 | 358 | # todonotes 359 | *.tdo 360 | 361 | # easy-todo 362 | *.lod 363 | 364 | # xmpincl 365 | *.xmpi 366 | 367 | # xindy 368 | *.xdy 369 | 370 | # xypic precompiled matrices 371 | *.xyc 372 | 373 | # endfloat 374 | *.ttt 375 | *.fff 376 | 377 | # Latexian 378 | TSWLatexianTemp* 379 | 380 | ## Editors: 381 | # WinEdt 382 | *.bak 383 | *.sav 384 | 385 | # Texpad 386 | .texpadtmp 387 | 388 | # Kile 389 | *.backup 390 | 391 | # KBibTeX 392 | *~[0-9]* 393 | 394 | *.el 395 | 396 | # expex forward references with \gathertags 397 | *-tags.tex 398 | 399 | # standalone packages 400 | *.sta 401 | 402 | .DS_Store 403 | 404 | # 405 | # project specific 406 | # 407 | /tmp 408 | !/tmp/README.md 409 | 410 | # exclude docs while they are not yet stable 411 
| /docs/** 412 | !/docs/README.md 413 | 414 | # exclude notebooks directory: this is generated during build 415 | /notebooks/ 416 | 417 | # OmniGraffle previews 418 | **/*.graffle/preview.jpeg 419 | -------------------------------------------------------------------------------- /.idea/gamma-sklearndf.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 16 | -------------------------------------------------------------------------------- /.idea/sklearndf.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 27 | 28 | 30 | 31 | 33 | 34 | 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | 7 | - repo: https://github.com/psf/black 8 | rev: 23.10.1 9 | hooks: 10 | - id: black 11 | language: python_venv 12 | language_version: python39 13 | 14 | - repo: https://github.com/pycqa/flake8 15 | rev: 5.0.4 16 | hooks: 17 | - id: flake8 18 | name: flake8 19 | entry: flake8 --config tox.ini 20 | language: python_venv 21 | language_version: python39 22 | additional_dependencies: 23 | - flake8-comprehensions ~= 3.10 24 | types: [ python ] 25 | 26 | - repo: https://github.com/pre-commit/pre-commit-hooks 27 | rev: v4.3.0 28 | hooks: 29 | - id: check-added-large-files 30 | - id: check-json 31 | - id: check-xml 32 | - id: check-yaml 33 | language: python_venv 34 | exclude: condabuild/meta.yaml 35 | 36 | - repo: https://github.com/pre-commit/mirrors-mypy 37 | rev: v1.2.0 38 | hooks: 39 | - id: mypy 40 | files: src|sphinx|test 41 | language: python_venv 42 | language_version: python39 43 | additional_dependencies: 44 | - numpy~=1.24 45 | - gamma-pytools~=2.1 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020-2021 Boston Consulting Group 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /RELEASE_NOTES.rst: -------------------------------------------------------------------------------- 1 | Release Notes 2 | ============= 3 | 4 | .. |lightgbm| replace:: :external+lightgbm:doc:`lightgbm ` 5 | .. |xgboost| replace:: :external+xgboost:doc:`xgboost ` 6 | .. |mypy| replace:: :external+mypy:doc:`mypy ` 7 | .. |nbsp| unicode:: 0xA0 8 | :trim: 9 | 10 | *sklearndf* 2.3 11 | --------------- 12 | 13 | 14 | 2.3.0 15 | ~~~~~ 16 | 17 | *sklearndf* 2.3 adds support for 18 | `scikit-learn 1.3 `_ 19 | and drops support for *scikit-learn* |nbsp| 0.24. 20 | 21 | - API: add DF wrapper class :class:`.HDBSCANDF` for native estimator 22 | :class:`~sklearn.cluster.HDBSCAN` 23 | - API: add DF wrapper class :class:`.TargetEncoderDF` for native estimator 24 | :class:`~sklearn.preprocessing.TargetEncoder` 25 | 26 | 27 | *sklearndf* 2.2 28 | --------------- 29 | 30 | *sklearndf* 2.2 adds support for 31 | `scikit-learn 1.2 `_, and enhances the EstimatorDF 32 | API. 33 | 34 | 35 | 2.2.1 36 | ~~~~~ 37 | 38 | - VIZ: use *scikit-learn*'s native HTML representation of estimators, if available 39 | 40 | 41 | 2.2.0 42 | ~~~~~ 43 | 44 | *sklearndf* 2.2 adds support for 45 | `scikit-learn 1.2 `_. 46 | It drops support for *scikit-learn* |nbsp| 0.23 and earlier due to incomplete 47 | support of sparse output (see below). 48 | 49 | - API: DF estimators now support native estimators using sparse matrices as input or 50 | output, and automatically convert them to or from sparse :class:`~pandas.DataFrame` 51 | objects 52 | - API: new property :attr:`.EstimatorDF.output_names_` to get the names of the output 53 | columns the estimator was fitted with 54 | - API: new method :meth:`.LearnerPipelineDF.preprocess` to apply the preprocessing step 55 | to a data frame 56 | - API: remove properties ``feature_names_out_`` and ``feature_names_original_`` from 57 | class :class:`.LearnerPipelineDF` 58 | - API: :class:`~pandas.Index` instances obtained from 59 | :attr:`.EstimatorDF.feature_names_in_` and :attr:`.TransformerDF.feature_names_out_` 60 | are now named ``"feature"`` instead of ``"feature_in"`` and ``"feature_out"``, 61 | respectively, and :class:`~pandas.Series` instances obtained from 62 | :attr:`.TransformerDF.feature_names_original_` are now named ``"feature_original"`` 63 | instead of ``"feature_in"``, and their indices are now named ``"feature"`` instead 64 | of ``"feature_out"``; this is to separate the semantics of the originating property 65 | from the column index, which may be used in other contexts 66 | 67 | 68 | 69 | *sklearndf* 2.1 70 | --------------- 71 | 72 | *sklearndf* 2.1 adds support for 73 | `scikit-learn 1.1 `_. 74 | 75 | 76 | 2.1.1 77 | ~~~~~ 78 | 79 | This is a maintenance release to catch up with *sklearndf* |nbsp| 2.0.2. 
80 | 81 | 82 | 2.1.0 83 | ~~~~~ 84 | 85 | - API: new clusterer :class:`.BisectingKMeansDF` 86 | - API: new transformer :class:`.MiniBatchNMFDF` 87 | - API: new transformer :class:`.RandomTreesEmbeddingDF`; note that class 88 | :class:`~sklearn.ensemble.RandomTreesEmbedding` existed previously in *scikit-learn*, 89 | but is based on :class:`~sklearn.base.TransformerMixin` only as of 90 | *scikit-learn* |nbsp| 1.1 91 | - API: support parameters ``max_categories`` and ``min_frequency`` of 92 | :class:`.OneHotEncoderDF`, introduced in *scikit-learn* |nbsp| 1.1 93 | - API: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF` 94 | - API: support ``"passthrough"`` as a transformer in :class:`.FeatureUnionDF` 95 | - API: remove ``GeneralizedLinearRegressorDF`` since the underlying native estimator is 96 | a base class and not intended to be used as a regressor of its own 97 | 98 | 99 | *sklearndf* 2.0 100 | --------------- 101 | 102 | *sklearndf* 2.0 adds support for 103 | `scikit-learn 1.0 `_, 104 | adds data frame support for clusterers along with additional API enhancements and 105 | improvements, and is now subject to static type checking with |mypy|. 106 | 107 | 108 | 2.0.2 109 | ~~~~~ 110 | 111 | - BUILD: add support for :mod:`pandas` 2.0 and above 112 | - FIX: property :attr:`.PCADF.n_components_` now returns the value of 113 | :attr:`~sklearndf.decomposition.PCA.n_components_`, not 114 | :attr:`~sklearndf.decomposition.PCA.n_components` 115 | - FIX: detect missing and extra columns when validating data frames resulting from 116 | transforms, even when the total column count is correct 117 | 118 | 119 | 2.0.1 120 | ~~~~~ 121 | 122 | - API: upon declaration of new wrapper classes, automatically validate that their 123 | associated native estimators are compatible with the wrapper class 124 | - API: new public constants ``DROP`` and ``PASSTHROUGH`` in 125 | :class:`.ColumnTransformerDF` 126 | - FIX: base :class:`.LGBMClassifierDF` and :class:`.XGBClassifierDF` on 127 | the correct wrapper class :class:`.ClassifierWrapperDF` 128 | - FIX: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF` 129 | - FIX: various minor tweaks and stability improvements 130 | 131 | 132 | 2.0.0 133 | ~~~~~ 134 | 135 | - API: :class:`.ClassifierDF` and :class:`.RegressorDF` get a new base class 136 | :class:`.SupervisedLearnerDF`, which in turn is based on :class:`.LearnerDF`; 137 | :class:`.SupervisedLearnerDF` implements method :meth:`~.SupervisedLearnerDF.score`, 138 | which is no longer implemented by :class:`.LearnerDF` 139 | - API: new class :class:`.ClusterDF`, based on :class:`.LearnerDF` 140 | - API: class :class:`.EstimatorDF` now implements the 141 | :class:`~pytools.expression.HasExpressionRepr` mix-in, rendering estimator 142 | representations as :class:`~pytools.expression.Expression` objects to enable better 143 | formatting 144 | - API: added data frame support for method 145 | :meth:`~.PartialFitRegressorWrapperDF.partial_fit` 146 | - API: removed ``OutlierRemoverDF`` 147 | - API: removed dependency on package |lightgbm|: :class:`.LGBMClassifierDF` and 148 | :class:`.LGBMRegressorDF` are still available if |lightgbm| is installed 149 | - API: added support for |xgboost|: :class:`.XGBRegressorDF` and 150 | :class:`.XGBClassifierDF` are available if |xgboost| is installed 151 | - API: DF wrapper classes are now created using proper class declarations to better 152 | conform with Python type conventions checked by |mypy|; 153 | see
:mod:`sklearndf.wrapper` for details 154 | - API: remove functions ``make_df_estimator``, ``make_df_classifier``, 155 | ``make_df_regressor``, and ``make_df_transformer`` which are now obsolete 156 | - API: move some classes in :mod:`sklearndf.wrapper` to sub-packages 157 | :mod:`sklearndf.wrapper.stacking` and :mod:`sklearndf.wrapper.numpy` to improve 158 | package navigability and to achieve better de-coupling of the underlying code; 159 | this change also moves :class:`~.StackingClassifierWrapperDF` and 160 | :class:`~.StackingRegressorWrapperDF` to package :mod:`sklearndf.wrapper.stacking` 161 | 162 | 163 | *sklearndf* 1.2 164 | --------------- 165 | 166 | This release adds support for `scikit-learn 0.24 `_. 167 | 168 | 169 | 1.2.3 170 | ~~~~~ 171 | 172 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.3. 173 | 174 | 175 | 1.2.2 176 | ~~~~~ 177 | 178 | This release makes small API tweaks, and catches up with *sklearndf* |nbsp| 1.1.2. 179 | 180 | - API: make type hints more specific in signatures for 181 | :func:`.make_df_transformer`, :func:`.make_df_classifier`, and 182 | :func:`.make_df_regressor` 183 | 184 | 185 | 1.2.1 186 | ~~~~~ 187 | 188 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.1. 189 | 190 | 191 | 1.2.0 192 | ~~~~~ 193 | 194 | - API: add `DF` adaptations for classes introduced by *scikit-learn* |nbsp| 0.24: 195 | :class:`.PolynomialCountSketchDF` and :class:`.SequentialFeatureSelectorDF` 196 | 197 | 198 | *sklearndf* 1.1 199 | --------------- 200 | 201 | 1.1.3 202 | ~~~~~ 203 | 204 | This release relaxes package dependencies to support any `numpy` version `1.x` from 205 | 1.16. 206 | 207 | 208 | 1.1.2 209 | ~~~~~ 210 | 211 | This release improves compatibility with `scikit-learn` and fixes bugs. 212 | 213 | - API: add full support for the 214 | `_estimator_type `__ 215 | attribute 216 | - FIX: do not reset transformers when calling :meth:`.TransformerDF.inverse_transform` 217 | - FIX: accept `"passthrough"` as value for arg `remainder` of 218 | :class:`.ColumnTransformerDF` 219 | 220 | 221 | 1.1.1 222 | ~~~~~ 223 | 224 | This release addresses compatibility issues with meta-estimators. 225 | 226 | - FIX: support complex DF estimators inside :class:`.StackingEstimatorDF` 227 | - FIX: raise an exception if a base estimator is not supported by one of `sklearndf`'s 228 | implementations for DF meta-estimators 229 | 230 | 231 | 1.1.0 232 | ~~~~~ 233 | 234 | This release exposes the `wrapper` API used to generate augmented DF estimators from 235 | native `scikit-learn` estimators. 236 | 237 | - API: expose the :class:`.EstimatorWrapperDF` class hierarchy through the new 238 | :mod:`sklearndf.wrapper` package 239 | - API: create new `scikit-learn` wrapper classes with the new functions 240 | :func:`.make_df_estimator`, :func:`.make_df_classifier`, :func:`.make_df_regressor`, 241 | and :func:`.make_df_transformer` 242 | 243 | 244 | *sklearndf* 1.0 245 | --------------- 246 | 247 | 1.0.2 248 | ~~~~~ 249 | 250 | This is a maintenance release focusing on enhancements to the CI/CD pipeline and bug 251 | fixes. 
252 | 253 | - FIX: correctly mirror ``__init__`` signatures of native estimators to their 254 | corresponding DF estimators 255 | - FIX: do not mirror native estimator class attributes and protected members to 256 | DF estimators 257 | - FIX: support ``"passthrough"`` transformer in :class:`.ColumnTransformerDF` 258 | - FIX: support ``drop`` parameter in :class:`.OneHotEncoderDF` 259 | - BUILD: add support for `numpy` |nbsp| 1.20 260 | - BUILD: updates and changes to the CI/CD pipeline 261 | 262 | 263 | 1.0.1 264 | ~~~~~ 265 | 266 | Initial release. 267 | -------------------------------------------------------------------------------- /condabuild/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: sklearndf 3 | version: {{ environ.get('FACET_BUILD_SKLEARNDF_VERSION') }} 4 | 5 | source: 6 | git_url: ../ 7 | 8 | build: 9 | noarch: python 10 | script: "flit install --deps none" 11 | 12 | requirements: 13 | host: 14 | - pip>=20.* 15 | - python {{ environ.get('FACET_V_PYTHON', '=3.8.*') }} 16 | - numpy {{ environ.get('FACET_V_NUMPY', '>=1.11.*') }} 17 | - flit>=3.0.* 18 | - packaging>=20 19 | run: 20 | - gamma-pytools {{ environ.get('FACET_V_GAMMA_PYTOOLS') }} 21 | - numpy {{ environ.get('FACET_V_NUMPY') }} 22 | - packaging {{ environ.get('FACET_V_PACKAGING') }} 23 | - pandas {{ environ.get('FACET_V_PANDAS') }} 24 | - python {{ environ.get('FACET_V_PYTHON') }} 25 | - scikit-learn {{ environ.get('FACET_V_SCIKIT_LEARN') }} 26 | - scipy {{ environ.get('FACET_V_SCIPY') }} 27 | test: 28 | imports: 29 | - sklearndf 30 | - sklearndf.classification 31 | - sklearndf.classification.extra 32 | - sklearndf.pipeline 33 | - sklearndf.regression 34 | - sklearndf.regression.extra 35 | - sklearndf.transformation 36 | - sklearndf.transformation.extra 37 | requires: 38 | - pytest ~= 7.1 39 | # we need pip to install arfs 40 | - pip # {{ '[False]' if not environ.get('FACET_V_ARFS') }} 41 | # optional libraries of sklearndf, needed for testing 42 | - boruta_py {{ environ.get('FACET_V_BORUTA', '[False]') }} 43 | - xgboost {{ environ.get('FACET_V_XGBOOST', '[False]') }} 44 | # we always need lightgbm for testing; version spec is optional 45 | - lightgbm {{ environ.get('FACET_V_LIGHTGBM', '') }} 46 | # additional requirements of gamma-pytools 47 | - joblib {{ environ.get('FACET_V_JOBLIB', '[False]') }} 48 | - matplotlib-base {{ environ.get('FACET_V_MATPLOTLIB', '[False]') }} 49 | - typing_inspect {{ environ.get('FACET_V_TYPING_INSPECT', '[False]') }} 50 | commands: 51 | - conda list 52 | - python -c 'import sklearndf; 53 | import os; 54 | assert sklearndf.__version__ == os.environ["PKG_VERSION"]' 55 | # optional PyPi package ARFS needed for testing 56 | {% if environ.get('FACET_V_ARFS') -%} 57 | - pip install 'arfs{{ environ.get("FACET_V_ARFS") }}' 58 | {%- endif %} 59 | # run the test suite 60 | - cd "${FACET_PATH}/sklearndf" 61 | - pytest -vs test 62 | 63 | about: 64 | home: https://github.com/BCG-X-Official/sklearndf 65 | license: Apache Software License v2.0 66 | license_file: LICENSE 67 | description: | 68 | sklearndf is an open source library designed to address a common need with 69 | scikit-learn: the outputs of transformers are numpy arrays, even when the input 70 | is a data frame. However, to inspect a model it is essential to keep track of 71 | the feature names. 
72 | dev_url: https://github.com/BCG-X-Official/sklearndf 73 | doc_url: https://bcg-x-official.github.io/sklearndf/ 74 | doc_source_url: https://github.com/BCG-X-Official/sklearndf/blob/develop/README.rst -------------------------------------------------------------------------------- /config/spelling.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/config/spelling.dic -------------------------------------------------------------------------------- /config/test_config.yml: -------------------------------------------------------------------------------- 1 | - inputfile: 2 | delimiter: "|" 3 | header: infer 4 | date_column_name : Date 5 | yield_column_name : Yield 6 | decimal: "," -------------------------------------------------------------------------------- /dev-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | conda env create -f environment.yml 3 | conda activate sklearndf-develop 4 | pre-commit install -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: sklearndf-develop 2 | channels: 3 | - conda-forge 4 | - bcg_gamma 5 | dependencies: 6 | # run 7 | - boruta_py ~= 0.3 8 | - gamma-pytools ~= 2.1 9 | - joblib ~= 1.2 10 | - lightgbm ~= 3.3 11 | - matplotlib ~= 3.7 12 | - numpy ~= 1.24 13 | - pandas ~= 2.0 14 | - pip ~= 23.3 15 | - python ~= 3.9 16 | - scikit-learn ~= 1.2.0 17 | - scipy ~= 1.11 18 | - xgboost ~= 1.7 19 | - pip: 20 | - arfs ~= 1.1 21 | # test 22 | - pytest ~= 7.2.1 23 | - pytest-cov ~= 2.12.1 24 | # sphinx 25 | - nbsphinx ~= 0.8.9 26 | - sphinx ~= 4.5.0 27 | - sphinx-autodoc-typehints ~= 1.19.2 28 | - pydata-sphinx-theme ~= 0.8.1 29 | # notebooks 30 | - ipywidgets ~= 8.1 31 | - jupyterlab ~= 3.6 32 | - openpyxl ~= 3.1 33 | - seaborn ~= 0.13 34 | - tableone ~= 0.7 35 | -------------------------------------------------------------------------------- /make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | call the Python make file for the common conda build process residing in 'pytools' 4 | """ 5 | 6 | import os 7 | import sys 8 | 9 | SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) 10 | PYTOOLS_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, "pytools")) 11 | sys.path.insert(0, PYTOOLS_DIR) 12 | 13 | # noinspection PyUnresolvedReferences 14 | from make import run_make 15 | 16 | run_make() 17 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = True 3 | show_error_codes = True 4 | 5 | [mypy-arfs.*] 6 | ; TODO remove once PEP 561 is supported 7 | ignore_missing_imports = True 8 | 9 | [mypy-boruta.*] 10 | ; TODO remove once PEP 561 is supported 11 | ignore_missing_imports = True 12 | 13 | [mypy-lightgbm.*] 14 | ; TODO remove once PEP 561 is supported 15 | ignore_missing_imports = True 16 | 17 | [mypy-packaging.*] 18 | ; TODO remove once PEP 561 is supported 19 | ignore_missing_imports = True 20 | 21 | [mypy-pandas.*] 22 | ; TODO remove once PEP 561 is supported 23 | ignore_missing_imports = True 24 | 25 | [mypy-scipy.*] 26 | ; TODO remove once PEP 561 is supported 27 | ignore_missing_imports = True 28 | 29 | 
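; Note: each override in this file addresses the same limitation: the package
; is untyped, i.e. it ships neither type stubs nor a py.typed marker (PEP 561),
; so under strict mode mypy would flag its imports as missing library stubs.
; ignore_missing_imports silences only that diagnostic for the named package;
; all other strict checks remain in force.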
[mypy-sklearn.*] 30 | ; TODO remove once PEP 561 is supported 31 | ignore_missing_imports = True 32 | 33 | [mypy-xgboost.*] 34 | ; TODO remove once PEP 561 is supported 35 | ignore_missing_imports = True 36 | -------------------------------------------------------------------------------- /pypi_description.rst: -------------------------------------------------------------------------------- 1 | *sklearndf* is an open source library designed to address a common need with 2 | `scikit-learn `__: the outputs of 3 | transformers are numpy arrays, even when the input is a 4 | data frame. However, to inspect a model it is essential to keep track of the 5 | feature names. 6 | 7 | To this end, *sklearndf* enhances scikit-learn's estimators as follows: 8 | 9 | - **Preserve data frame structure**: 10 | Return data frames as results of transformations, preserving feature names as the column index. 11 | - **Feature name tracing**: 12 | Add additional estimator properties to enable tracing a feature name back to its original input feature; this is especially useful for transformers that create new features (e.g., one-hot encode), and for pipelines that include such transformers. 13 | - **Easy use**: 14 | Simply append DF at the end of your usual scikit-learn class names to get enhanced data frame support! 15 | 16 | .. Begin-Badges 17 | 18 | |pypi| |conda| |python_versions| |code_style| |made_with_sphinx_doc| |License_badge| 19 | 20 | .. End-Badges 21 | 22 | License 23 | --------------------------- 24 | 25 | *sklearndf* is licensed under Apache 2.0 as described in the 26 | `LICENSE `_ file. 27 | 28 | .. Begin-Badges 29 | 30 | .. |conda| image:: https://anaconda.org/bcg_gamma/sklearndf/badges/version.svg 31 | :target: https://anaconda.org/BCG_Gamma/sklearndf 32 | 33 | .. |pypi| image:: https://badge.fury.io/py/sklearndf.svg 34 | :target: https://pypi.org/project/sklearndf/ 35 | 36 | .. |python_versions| image:: https://img.shields.io/badge/python-3.7|3.8|3.9-blue.svg 37 | :target: https://www.python.org/downloads/release/python-380/ 38 | 39 | .. |code_style| image:: https://img.shields.io/badge/code%20style-black-000000.svg 40 | :target: https://github.com/psf/black 41 | 42 | .. |made_with_sphinx_doc| image:: https://img.shields.io/badge/Made%20with-Sphinx-1f425f.svg 43 | :target: https://bcg-x-official.github.io/sklearndf/index.html 44 | 45 | .. |license_badge| image:: https://img.shields.io/badge/License-Apache%202.0-olivegreen.svg 46 | :target: https://opensource.org/licenses/Apache-2.0 47 | 48 | .. 
End-Badges -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [tool.flit.sdist] 6 | exclude = [".idea", "tmp", "dist", ".tox", ".pytest_cache"] 7 | 8 | [tool.flit.metadata] 9 | module = "sklearndf" 10 | author = "Boston Consulting Group (BCG)" 11 | home-page = "https://github.com/BCG-X-Official/sklearndf" 12 | description-file = "pypi_description.rst" 13 | dist-name = "sklearndf" 14 | license = "Apache Software License v2.0" 15 | 16 | requires = [ 17 | "gamma-pytools ~=2.1", 18 | "numpy >=1.21,<2a", # cannot use ~= due to conda bug 19 | "packaging >=20", 20 | "pandas >=1", 21 | "scikit-learn >=1,<1.4a", 22 | "scipy ~=1.6", 23 | ] 24 | 25 | requires-python = ">=3.7,<4a" 26 | 27 | classifiers = [ 28 | "Development Status :: 5 - Production/Stable", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: MacOS", 32 | "Operating System :: Microsoft :: Windows", 33 | "Operating System :: POSIX :: Linux", 34 | "Operating System :: Unix", 35 | "Programming Language :: Python", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Programming Language :: Python :: 3.9", 40 | "Topic :: Scientific/Engineering", 41 | ] 42 | 43 | [tool.flit.metadata.requires-extra] 44 | testing = [ 45 | "pytest ~= 7.1", 46 | "pytest-cov ~= 2.12", 47 | # optional requirements for testing sklearndf 48 | "lightgbm ~= 3.0", 49 | "xgboost ~= 1.0", 50 | ] 51 | docs = [ 52 | "sphinx ~= 4.5", 53 | "sphinx-autodoc-typehints ~= 1.19", 54 | "pydata-sphinx-theme ~= 0.8.1", 55 | "jinja2 ~= 2.11", 56 | "nbsphinx ~= 0.8.9", 57 | "jupyter == 1", 58 | "docutils ~= 0.17", 59 | "xlrd ~= 1.2", 60 | "m2r ~= 0.2" 61 | ] 62 | 63 | [tool.flit.metadata.urls] 64 | Documentation = "https://bcg-x-official.github.io/sklearndf/" 65 | Repository = "https://github.com/BCG-X-Official/sklearndf" 66 | 67 | [build] 68 | # comma-separated list of packages to be built from source in pip min builds 69 | no-binary.min = ["matplotlib"] 70 | 71 | [build.matrix.min] 72 | # direct requirements of sklearndf 73 | boruta = "~=0.3.0" 74 | gamma-pytools = "~=2.1.0" 75 | lightgbm = "~=3.0.0" 76 | numpy = "==1.21.6" # cannot use ~= due to conda bug 77 | packaging = "~=20.9" 78 | pandas = "~=1.1.5" 79 | python = ">=3.7.12,<3.8a" # cannot use ~= due to conda bug 80 | scipy = "~=1.6.3" 81 | scikit-learn = "~=1.0.2" 82 | xgboost = "~=1.0.2" 83 | # additional minimum requirements of gamma-pytools 84 | joblib = "~=0.14.1" 85 | matplotlib = "~=3.0.3" 86 | typing_inspect = "~=0.4.0" 87 | 88 | [build.matrix.max] 89 | # direct requirements of sklearndf 90 | arfs = "~=1.1" 91 | gamma-pytools = "~=2.1" 92 | lightgbm = "~=3.3" 93 | numpy = ">=1.24,<2a" # cannot use ~= due to conda bug 94 | packaging = ">=20" 95 | pandas = "~=2.0" 96 | python = ">=3.11,<3.12a" # cannot use ~= due to conda bug 97 | scikit-learn = "~=1.3.2" 98 | scipy = "~=1.11" 99 | xgboost = "~=1.5" 100 | # additional maximum requirements of gamma-pytools 101 | joblib = "~=1.1" 102 | matplotlib = "~=3.5" 103 | typing_inspect = "~=0.7" 104 | 105 | [tool.black] 106 | # quiet = "True" 107 | line-length = 88 108 | target_version = ['py36'] 109 | include = '\.pyi?$' 110 | exclude = ''' 111 | ( 112 | /( 113 | \.eggs # exclude a few 
common directories in the 114 | | \.git # root of the project 115 | | \.hg 116 | | \.mypy_cache 117 | | \.tox 118 | | \.venv 119 | | data 120 | | docs 121 | | notebooks 122 | | sphinx 123 | )/ 124 | ) 125 | ''' 126 | -------------------------------------------------------------------------------- /sphinx/.gitignore: -------------------------------------------------------------------------------- 1 | base 2 | source/_generated 3 | source/apidoc 4 | -------------------------------------------------------------------------------- /sphinx/make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Make sphinx documentation using the pytools make utility 4 | """ 5 | import os 6 | from urllib import request 7 | 8 | BRANCH = "2.1.x" 9 | 10 | 11 | if __name__ == "__main__": 12 | # noinspection PyUnusedLocal 13 | def run_make(branch: str, working_directory: str) -> None: 14 | """Stub, overwritten by bootstrap.py""" 15 | 16 | # run the common make file available in the pytools repo 17 | with request.urlopen( 18 | f"https://raw.githubusercontent.com/BCG-X-Official/pytools/{BRANCH}" 19 | f"/sphinx/base/bootstrap.py" 20 | ) as response: 21 | exec(response.read().decode("utf-8"), globals()) 22 | 23 | run_make(branch=BRANCH, working_directory=os.path.dirname(__file__)) 24 | -------------------------------------------------------------------------------- /sphinx/source/_images/gamma_sklearndf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/gamma_sklearndf_logo.png -------------------------------------------------------------------------------- /sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist -------------------------------------------------------------------------------- /sphinx/source/_images/sklearndf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf_logo.png -------------------------------------------------------------------------------- /sphinx/source/api_landing.rst: -------------------------------------------------------------------------------- 1 | Augmented scikit-learn classes are named after their native scikit-learn counterparts, 2 | with `DF` added as a suffix: 3 | :class:`.SimpleImputerDF` takes the place of :class:`~sklearn.impute.SimpleImputer`, 4 | :class:`.RandomForestRegressorDF` takes the place of 5 | :class:`~sklearn.ensemble.RandomForestRegressor`, and so on. 6 | 7 | For all methods expecting an `X` argument for a feature matrix and potentially a 8 | `y` argument for one or more targets, `sklearndf` estimators expect a pandas 9 | :class:`~pandas.DataFrame` for `X` and a pandas :class:`~pandas.Series` for a 10 | 1‑dimensional `y`, or a pandas :class:`~pandas.DataFrame` for `y` when fitting to 11 | multiple targets or outputs. 12 | This includes methods such as :meth:`~EstimatorDF.fit`, 13 | :meth:`~TransformerDF.transform`, and :meth:`~LearnerDF.predict`. 
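A minimal sketch of this convention follows; the data set and column names are made
up purely for illustration:

.. code-block:: python

    import pandas as pd
    from sklearndf.regression import RandomForestRegressorDF

    # X is a pandas data frame; y is a pandas series for a 1-dimensional target
    X = pd.DataFrame({"age": [23, 35, 47, 29], "income": [40, 70, 95, 52]})
    y = pd.Series([1.0, 2.4, 3.1, 1.8], name="spend")

    regressor_df = RandomForestRegressorDF(n_estimators=10, random_state=42)
    regressor_df.fit(X, y)

    # the prediction is returned as a pandas series, sharing the row index of X
    predictions = regressor_df.predict(X)

Constructor parameters are mirrored from the native estimator, so apart from the
``DF`` suffix and the pandas return types this reads exactly like plain
scikit-learn code.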
14 | 15 | All estimators enhanced by `sklearndf` also implement an additional attribute 16 | :attr:`~EstimatorDF.feature_names_in_`, keeping track of the column names of the data 17 | frame used to fit the estimator. 18 | 19 | `sklearndf` transformers also implement attributes 20 | :attr:`~TransformerDF.feature_names_out_` and 21 | :attr:`~TransformerDF.feature_names_original_`, keeping track of the feature names of 22 | the transformed outputs as well as mapping output features back to the input features. 23 | This enables tracing features back to the original inputs even across complex 24 | pipelines (see also :class:`.PipelineDF`). 25 | 26 | `sklearndf` classes implement a class hierarchy that follows the taxonomy of 27 | scikit-learn classes (but is only partially reflected via class inheritance in the 28 | original `scikit-learn` implementation): 29 | 30 | | 31 | 32 | .. image:: /_images/sklearndf-class-hierarchy.svg 33 | :alt: sklearndf class hierarchy 34 | :align: center 35 | 36 | | 37 | 38 | - all `sklearndf` transformers are subclasses of :class:`.TransformerDF`, which in turn 39 | provides the API for all common transformer methods, e.g., 40 | :meth:`~TransformerDF.transform` 41 | 42 | - all `sklearndf` clusterers are subclasses of :class:`.ClusterDF`, which 43 | in turn provides the API for all common clustering methods, e.g., 44 | :meth:`~ClusterDF.fit_predict` 45 | 46 | - all `sklearndf` regressors are subclasses of :class:`.RegressorDF`, which 47 | in turn provides the API for all common regressor methods, e.g., 48 | :meth:`~LearnerDF.predict` 49 | 50 | - all `sklearndf` classifiers are subclasses of :class:`.ClassifierDF`, which 51 | in turn provides the API for all common classifier methods, e.g., 52 | :meth:`~ClassifierDF.predict_proba` 53 | 54 | - all `sklearndf` regressors and classifiers are subclasses of 55 | :class:`.SupervisedLearnerDF` 56 | 57 | - all `sklearndf` regressors, classifiers and clusterers are subclasses of 58 | :class:`.LearnerDF` 59 | 60 | - all `sklearndf` estimators are subclasses of :class:`.EstimatorDF` 61 | 62 | `sklearndf` introduces additional pipeline classes :class:`.RegressorPipelineDF`, 63 | :class:`.ClassifierPipelineDF`, and :class:`.ClusterPipelineDF`, with an abstract base 64 | class :class:`.LearnerPipelineDF`, to allow for easier handling of common types of ML 65 | pipelines. 66 | These classes implement pipelines with two steps -- one preprocessing step, followed by 67 | a learner as the second and final step. 68 | 69 | `sklearndf` also provides data frame support for a selection of custom or 3rd-party 70 | estimators, most notably :class:`.BorutaDF`, :class:`.LGBMRegressorDF`, 71 | :class:`.LGBMClassifierDF`, :class:`.XGBRegressorDF`, and :class:`.XGBClassifierDF`. 72 | 73 | All `sklearndf` estimators are fully type hinted. 74 | 75 | Please see the :ref:`release notes` for recent API updates and bug fixes. 76 | -------------------------------------------------------------------------------- /sphinx/source/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for the Sphinx documentation builder. 
3 | 4 | Receives the majority of the configuration from pytools conf_base.py 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | _dir_base = os.path.join(os.path.dirname(os.path.dirname(__file__)), "base") 11 | sys.path.insert(0, _dir_base) 12 | 13 | from conf_base import set_config 14 | 15 | # ----- set custom configuration ----- 16 | 17 | set_config( 18 | globals(), 19 | project="sklearndf", 20 | html_logo=os.path.join("_images", "gamma_sklearndf_logo.png"), 21 | intersphinx_mapping={ 22 | "lightgbm": ("https://lightgbm.readthedocs.io/en/latest/", None), 23 | "pytools": ("https://bcg-x-official.github.io/pytools/", None), 24 | "sklearn": ("https://scikit-learn.org/stable", None), 25 | "xgboost": ("https://xgboost.readthedocs.io/en/latest/", None), 26 | }, 27 | ) 28 | -------------------------------------------------------------------------------- /sphinx/source/faqs.rst: -------------------------------------------------------------------------------- 1 | .. _faqs: 2 | 3 | FAQ 4 | === 5 | 6 | Below you can find answers to commonly asked questions as well as how to 7 | cite *sklearndf*. 8 | 9 | Commonly asked questions 10 | ------------------------ 11 | 12 | If you don't see your answer there you could also try posting 13 | on `stackoverflow `_. 14 | 15 | 1. **What if I find a bug or have an idea for a new feature?** 16 | 17 | For bug reports or feature requests please use our 18 | `GitHub issue tracker `_. 19 | For any other enquiries please feel free to contact us at FacetTeam@bcg.com. 20 | 21 | 2. **How can I contribute?** 22 | 23 | We welcome contributors! If you have minor changes in mind that would like to 24 | contribute, please feel free to create a pull request and be sure to follow the 25 | developer guidelines. For large or extensive changes please feel free to open an 26 | issue, or reach out to us at FacetTeam@bcg.com to discuss. 27 | 28 | 29 | Citation 30 | -------- 31 | If you use *sklearndf* in your work please cite us as follows: 32 | 33 | Bibtex entry:: 34 | 35 | @manual{ 36 | title={sklearndf}, 37 | author={FACET Team at BCG Gamma}, 38 | year={2021}, 39 | note={Python package version 1.1.1} 40 | } 41 | 42 | -------------------------------------------------------------------------------- /sphinx/source/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: /_images/sklearndf_logo.png 2 | 3 | | 4 | 5 | Table of contents 6 | ----------------- 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | :titlesonly: 11 | 12 | Getting started <_generated/getting_started> 13 | API reference 14 | tutorials 15 | contribution_guide 16 | faqs 17 | _generated/release_notes 18 | -------------------------------------------------------------------------------- /sphinx/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials: 2 | 3 | Tutorials 4 | ========= 5 | 6 | 7 | 8 | 9 | Detailed *sklearndf* tutorial 10 | ------------------------------ 11 | 12 | Start exploring the tutorial right away by clicking on the section links below, and 13 | start running the code for yourself by downloading the notebook 14 | :download:`here `. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | 19 | tutorial/sklearndf_tutorial 20 | 21 | -------------------------------------------------------------------------------- /src/sklearndf/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data frame support and feature traceability for `scikit-learn`. 
3 | 4 | `sklearndf` augments more than 160 `scikit-learn` estimators for 5 | native support of data frames, while leaving the original API intact. 6 | """ 7 | 8 | from ._sklearn_version import * 9 | from ._sklearndf import * 10 | 11 | __version__ = "2.3.1" 12 | -------------------------------------------------------------------------------- /src/sklearndf/_sklearn_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Special constants for version checks for scikit-learn. 3 | """ 4 | 5 | from packaging.version import Version 6 | from sklearn import __version__ as sklearn_version 7 | 8 | __all__ = [ 9 | "__sklearn_version__", 10 | "__sklearn_1_1__", 11 | "__sklearn_1_2__", 12 | "__sklearn_1_3__", 13 | "__sklearn_1_4__", 14 | ] 15 | 16 | __sklearn_version__ = Version(sklearn_version) 17 | __sklearn_1_1__ = Version("1.1") 18 | __sklearn_1_2__ = Version("1.2") 19 | __sklearn_1_3__ = Version("1.3") 20 | __sklearn_1_4__ = Version("1.4") 21 | -------------------------------------------------------------------------------- /src/sklearndf/_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Auxiliary functions for internal use. 3 | """ 4 | 5 | from typing import Any, List, Optional, Union, cast 6 | 7 | import numpy.typing as npt 8 | import pandas as pd 9 | from scipy import sparse 10 | 11 | 12 | def hstack_frames( 13 | frames: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]], 14 | *, 15 | prefixes: Optional[List[str]] = None, 16 | ) -> Optional[pd.DataFrame]: 17 | """ 18 | If only data frames are passed, stack them horizontally. 19 | 20 | :param frames: a list of array-likes 21 | :param prefixes: an optional list of prefixes to use for the columns of each data 22 | frame in arg ``frames``; must have the same length as arg ``frames`` 23 | :return: the stacked data frame if all elements of ``frames`` are data frames; 24 | ``None`` otherwise 25 | """ 26 | if all(isinstance(frame, pd.DataFrame) for frame in frames): 27 | # all frames are data frames 28 | frames = cast(List[pd.DataFrame], frames) 29 | if prefixes is not None: 30 | assert len(prefixes) == len( 31 | frames 32 | ), "number of prefixes must match number of frames" 33 | frames = [ 34 | frame.add_prefix(f"{prefix}__") 35 | for frame, prefix in zip(frames, prefixes) 36 | ] 37 | return pd.concat(frames, axis=1) 38 | else: 39 | return None 40 | 41 | 42 | def is_sparse_frame(frame: pd.DataFrame) -> bool: 43 | """ 44 | Check if a data frame contains sparse columns. 45 | 46 | :param frame: the data frame to check 47 | :return: ``True`` if the data frame contains sparse columns; ``False`` otherwise 48 | """ 49 | 50 | return any(isinstance(dtype, pd.SparseDtype) for dtype in frame.dtypes) 51 | 52 | 53 | def sparse_frame_density(frame: pd.DataFrame) -> float: 54 | """ 55 | Compute the density of a data frame. 56 | 57 | The density of a data frame is the average density of its columns. 58 | The density of a sparse column is the ratio of non-sparse points to total (dense) 59 | data points. 60 | The density of a dense column is 1. 
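    For example (an illustrative doctest sketch): in a 4-row frame with one sparse
    column holding a single non-fill value (density 0.25) and one dense column
    (density 1.0), the frame density is (0.25 + 1.0) / 2 = 0.625::

        >>> sparse = pd.Series([1.0, 0, 0, 0], dtype=pd.SparseDtype("float", 0.0))
        >>> sparse_frame_density(pd.DataFrame({"s": sparse, "d": [1.0, 2.0, 3.0, 4.0]}))
        0.625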
61 | 62 | :param frame: a data frame 63 | :return: the density of the data frame 64 | """ 65 | 66 | def _density(sr: pd.Series) -> float: 67 | if isinstance(sr.dtype, pd.SparseDtype): 68 | return cast(float, sr.sparse.density) 69 | else: 70 | return 1.0 71 | 72 | return sum(_density(sr) for _, sr in frame.items()) / len(frame.columns) 73 | -------------------------------------------------------------------------------- /src/sklearndf/classification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` classifiers with enhanced support for data 3 | frames. 4 | """ 5 | from ._classification import * 6 | from ._classification_v0_22 import * 7 | from ._classification_v0_23 import * 8 | from ._classification_v1_0 import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification` 3 | """ 4 | import logging 5 | 6 | from sklearn.calibration import CalibratedClassifierCV 7 | from sklearn.discriminant_analysis import ( 8 | LinearDiscriminantAnalysis, 9 | QuadraticDiscriminantAnalysis, 10 | ) 11 | from sklearn.dummy import DummyClassifier 12 | from sklearn.ensemble import ( 13 | AdaBoostClassifier, 14 | BaggingClassifier, 15 | ExtraTreesClassifier, 16 | GradientBoostingClassifier, 17 | RandomForestClassifier, 18 | VotingClassifier, 19 | ) 20 | from sklearn.gaussian_process import GaussianProcessClassifier 21 | from sklearn.linear_model import ( 22 | LogisticRegression, 23 | LogisticRegressionCV, 24 | PassiveAggressiveClassifier, 25 | Perceptron, 26 | RidgeClassifier, 27 | RidgeClassifierCV, 28 | SGDClassifier, 29 | ) 30 | from sklearn.multiclass import ( 31 | OneVsOneClassifier, 32 | OneVsRestClassifier, 33 | OutputCodeClassifier, 34 | ) 35 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 36 | from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB 37 | from sklearn.neighbors import ( 38 | KNeighborsClassifier, 39 | NearestCentroid, 40 | RadiusNeighborsClassifier, 41 | ) 42 | from sklearn.neural_network import MLPClassifier 43 | from sklearn.semi_supervised import LabelPropagation, LabelSpreading 44 | from sklearn.svm import SVC, LinearSVC, NuSVC 45 | from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier 46 | 47 | from pytools.api import AllTracker 48 | 49 | from ..wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF 50 | from .wrapper import ( 51 | ClassifierChainWrapperDF, 52 | LinearDiscriminantAnalysisWrapperDF, 53 | MetaClassifierWrapperDF, 54 | MultiOutputClassifierWrapperDF, 55 | PartialFitClassifierWrapperDF, 56 | ) 57 | 58 | log = logging.getLogger(__name__) 59 | 60 | __all__ = [ 61 | "AdaBoostClassifierDF", 62 | "BaggingClassifierDF", 63 | "BernoulliNBDF", 64 | "CalibratedClassifierCVDF", 65 | "ClassifierChainDF", 66 | "ComplementNBDF", 67 | "DecisionTreeClassifierDF", 68 | "DummyClassifierDF", 69 | "ExtraTreeClassifierDF", 70 | "ExtraTreesClassifierDF", 71 | "GaussianNBDF", 72 | "GaussianProcessClassifierDF", 73 | "GradientBoostingClassifierDF", 74 | "KNeighborsClassifierDF", 75 | "LabelPropagationDF", 76 | "LabelSpreadingDF", 77 | "LinearDiscriminantAnalysisDF", 78 | "LinearSVCDF", 79 | "LogisticRegressionCVDF", 80 | "LogisticRegressionDF", 81 | "MLPClassifierDF", 82 | "MultinomialNBDF", 83 | 
"MultiOutputClassifierDF", 84 | "NearestCentroidDF", 85 | "NuSVCDF", 86 | "OneVsOneClassifierDF", 87 | "OneVsRestClassifierDF", 88 | "OutputCodeClassifierDF", 89 | "PassiveAggressiveClassifierDF", 90 | "PerceptronDF", 91 | "QuadraticDiscriminantAnalysisDF", 92 | "RadiusNeighborsClassifierDF", 93 | "RandomForestClassifierDF", 94 | "RidgeClassifierCVDF", 95 | "RidgeClassifierDF", 96 | "SGDClassifierDF", 97 | "SVCDF", 98 | "VotingClassifierDF", 99 | ] 100 | 101 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 102 | 103 | 104 | # 105 | # Ensure all symbols introduced below are included in __all__ 106 | # 107 | 108 | __tracker = AllTracker(globals()) 109 | 110 | 111 | # 112 | # Class definitions 113 | # 114 | 115 | 116 | # 117 | # Dummy 118 | # 119 | 120 | 121 | class DummyClassifierDF(ClassifierWrapperDF[DummyClassifier], native=DummyClassifier): 122 | """Stub for DF wrapper of class ``DummyClassifier``""" 123 | 124 | 125 | # 126 | # neighbors 127 | # 128 | 129 | 130 | class NearestCentroidDF(ClassifierWrapperDF[NearestCentroid], native=NearestCentroid): 131 | """Stub for DF wrapper of class ``NearestCentroid``""" 132 | 133 | 134 | class KNeighborsClassifierDF( 135 | ClassifierWrapperDF[KNeighborsClassifier], native=KNeighborsClassifier 136 | ): 137 | """Stub for DF wrapper of class ``KNeighborsClassifier``""" 138 | 139 | 140 | class RadiusNeighborsClassifierDF( 141 | ClassifierWrapperDF[RadiusNeighborsClassifier], native=RadiusNeighborsClassifier 142 | ): 143 | """Stub for DF wrapper of class ``RadiusNeighborsClassifier``""" 144 | 145 | 146 | # 147 | # voting 148 | # 149 | 150 | 151 | class VotingClassifierDF( 152 | MetaClassifierWrapperDF[VotingClassifier], native=VotingClassifier 153 | ): 154 | """Stub for DF wrapper of class ``VotingClassifier``""" 155 | 156 | 157 | # 158 | # ensemble 159 | # 160 | 161 | 162 | class RandomForestClassifierDF( 163 | ClassifierWrapperDF[RandomForestClassifier], native=RandomForestClassifier 164 | ): 165 | """Stub for DF wrapper of class ``RandomForestClassifier``""" 166 | 167 | 168 | class ExtraTreesClassifierDF( 169 | ClassifierWrapperDF[ExtraTreesClassifier], native=ExtraTreesClassifier 170 | ): 171 | """Stub for DF wrapper of class ``ExtraTreesClassifier``""" 172 | 173 | 174 | # noinspection PyAbstractClass 175 | class GradientBoostingClassifierDF( 176 | ClassifierWrapperDF[GradientBoostingClassifier], native=GradientBoostingClassifier 177 | ): 178 | """Stub for DF wrapper of class ``GradientBoostingClassifier``""" 179 | 180 | 181 | class AdaBoostClassifierDF( 182 | ClassifierWrapperDF[AdaBoostClassifier], native=AdaBoostClassifier 183 | ): 184 | """Stub for DF wrapper of class ``AdaBoostClassifier``""" 185 | 186 | 187 | class BaggingClassifierDF( 188 | ClassifierWrapperDF[BaggingClassifier], native=BaggingClassifier 189 | ): 190 | """Stub for DF wrapper of class ``BaggingClassifier``""" 191 | 192 | 193 | # 194 | # tree 195 | # 196 | 197 | 198 | class DecisionTreeClassifierDF( 199 | ClassifierWrapperDF[DecisionTreeClassifier], native=DecisionTreeClassifier 200 | ): 201 | """Stub for DF wrapper of class ``DecisionTreeClassifier``""" 202 | 203 | 204 | class ExtraTreeClassifierDF( 205 | ClassifierWrapperDF[ExtraTreeClassifier], native=ExtraTreeClassifier 206 | ): 207 | """Stub for DF wrapper of class ``ExtraTreeClassifier``""" 208 | 209 | 210 | # 211 | # discriminant analysis 212 | # 213 | 214 | 215 | class LinearDiscriminantAnalysisDF( 216 | LinearDiscriminantAnalysisWrapperDF, native=LinearDiscriminantAnalysis 217 | ): 
218 | """Stub for DF wrapper of class ``LinearDiscriminantAnalysis``""" 219 | 220 | 221 | class QuadraticDiscriminantAnalysisDF( 222 | ClassifierWrapperDF[QuadraticDiscriminantAnalysis], 223 | native=QuadraticDiscriminantAnalysis, 224 | ): 225 | """Stub for DF wrapper of class ``QuadraticDiscriminantAnalysis``""" 226 | 227 | 228 | # 229 | # naive bayes 230 | # 231 | 232 | 233 | class GaussianNBDF(PartialFitClassifierWrapperDF[GaussianNB], native=GaussianNB): 234 | """Stub for DF wrapper of class ``GaussianNB``""" 235 | 236 | 237 | class MultinomialNBDF( 238 | PartialFitClassifierWrapperDF[MultinomialNB], native=MultinomialNB 239 | ): 240 | """Stub for DF wrapper of class ``MultinomialNB``""" 241 | 242 | 243 | class ComplementNBDF(PartialFitClassifierWrapperDF[ComplementNB], native=ComplementNB): 244 | """Stub for DF wrapper of class ``ComplementNB``""" 245 | 246 | 247 | class BernoulliNBDF(PartialFitClassifierWrapperDF[BernoulliNB], native=BernoulliNB): 248 | """Stub for DF wrapper of class ``BernoulliNB``""" 249 | 250 | 251 | # 252 | # calibration 253 | # 254 | 255 | 256 | class CalibratedClassifierCVDF( 257 | MetaClassifierWrapperDF[CalibratedClassifierCV], native=CalibratedClassifierCV 258 | ): 259 | """Stub for DF wrapper of class ``CalibratedClassifierCV``""" 260 | 261 | 262 | # 263 | # SVM 264 | # 265 | 266 | 267 | class SVCDF(ClassifierWrapperDF[SVC], native=SVC): 268 | """Stub for DF wrapper of class ``SVC``""" 269 | 270 | 271 | class NuSVCDF(ClassifierWrapperDF[NuSVC], native=NuSVC): 272 | """Stub for DF wrapper of class ``NuSVC``""" 273 | 274 | 275 | class LinearSVCDF(ClassifierWrapperDF[LinearSVC], native=LinearSVC): 276 | """Stub for DF wrapper of class ``LinearSVC``""" 277 | 278 | 279 | # 280 | # gaussian process 281 | # 282 | 283 | 284 | class GaussianProcessClassifierDF( 285 | ClassifierWrapperDF[GaussianProcessClassifier], native=GaussianProcessClassifier 286 | ): 287 | """Stub for DF wrapper of class ``GaussianProcessClassifier``""" 288 | 289 | 290 | # 291 | # linear model 292 | # 293 | 294 | 295 | class LogisticRegressionDF( 296 | ClassifierWrapperDF[LogisticRegression], native=LogisticRegression 297 | ): 298 | """Stub for DF wrapper of class ``LogisticRegression``""" 299 | 300 | 301 | class LogisticRegressionCVDF( 302 | ClassifierWrapperDF[LogisticRegressionCV], native=LogisticRegressionCV 303 | ): 304 | """Stub for DF wrapper of class ``LogisticRegressionCV``""" 305 | 306 | 307 | class PassiveAggressiveClassifierDF( 308 | PartialFitClassifierWrapperDF[PassiveAggressiveClassifier], 309 | native=PassiveAggressiveClassifier, 310 | ): 311 | """Stub for DF wrapper of class ``PassiveAggressiveClassifier``""" 312 | 313 | 314 | class PerceptronDF(PartialFitClassifierWrapperDF[Perceptron], native=Perceptron): 315 | """Stub for DF wrapper of class ``Perceptron``""" 316 | 317 | 318 | class SGDClassifierDF( 319 | PartialFitClassifierWrapperDF[SGDClassifier], native=SGDClassifier 320 | ): 321 | """Stub for DF wrapper of class ``SGDClassifier``""" 322 | 323 | 324 | class RidgeClassifierDF(ClassifierWrapperDF[RidgeClassifier], native=RidgeClassifier): 325 | """Stub for DF wrapper of class ``RidgeClassifier``""" 326 | 327 | 328 | class RidgeClassifierCVDF( 329 | ClassifierWrapperDF[RidgeClassifierCV], native=RidgeClassifierCV 330 | ): 331 | """Stub for DF wrapper of class ``RidgeClassifierCV``""" 332 | 333 | 334 | # 335 | # semi-supervised 336 | # 337 | 338 | 339 | class LabelPropagationDF( 340 | ClassifierWrapperDF[LabelPropagation], native=LabelPropagation 341 | ): 342 | """Stub for 
DF wrapper of class ``LabelPropagation``""" 343 | 344 | 345 | class LabelSpreadingDF(ClassifierWrapperDF[LabelSpreading], native=LabelSpreading): 346 | """Stub for DF wrapper of class ``LabelSpreading``""" 347 | 348 | 349 | # 350 | # multi-class 351 | # 352 | 353 | 354 | class OneVsRestClassifierDF( 355 | MetaClassifierWrapperDF[OneVsRestClassifier], native=OneVsRestClassifier 356 | ): 357 | """Stub for DF wrapper of class ``OneVsRestClassifier``""" 358 | 359 | 360 | class OneVsOneClassifierDF( 361 | ClassifierWrapperDF[OneVsOneClassifier], 362 | MetaEstimatorWrapperDF[OneVsOneClassifier], 363 | native=OneVsOneClassifier, 364 | ): 365 | """Stub for DF wrapper of class ``OneVsOneClassifier``""" 366 | 367 | 368 | class OutputCodeClassifierDF( 369 | ClassifierWrapperDF[OutputCodeClassifier], 370 | MetaEstimatorWrapperDF[OutputCodeClassifier], 371 | native=OutputCodeClassifier, 372 | ): 373 | """Stub for DF wrapper of class ``OutputCodeClassifier``""" 374 | 375 | 376 | # 377 | # multi-output 378 | # 379 | 380 | 381 | class MultiOutputClassifierDF( 382 | MultiOutputClassifierWrapperDF, native=MultiOutputClassifier 383 | ): 384 | """Stub for DF wrapper of class ``MultiOutputClassifier``""" 385 | 386 | 387 | # 388 | # chaining 389 | # 390 | 391 | 392 | class ClassifierChainDF(ClassifierChainWrapperDF, native=ClassifierChain): 393 | """Stub for DF wrapper of class ``ClassifierChain``""" 394 | 395 | 396 | # 397 | # neural network 398 | # 399 | 400 | 401 | class MLPClassifierDF( 402 | PartialFitClassifierWrapperDF[MLPClassifier], native=MLPClassifier 403 | ): 404 | """Stub for DF wrapper of class ``MLPClassifier``""" 405 | 406 | 407 | # 408 | # validate __all__ 409 | # 410 | 411 | __tracker.validate() 412 | 413 | 414 | # 415 | # validate that __all__ comprises all symbols ending in "DF", and no others 416 | # 417 | 418 | __estimators = { 419 | sym 420 | for sym in dir() 421 | if sym.endswith("DF") 422 | and sym not in __imported_estimators 423 | and not sym.startswith("_") 424 | } 425 | if __estimators != set(__all__): 426 | raise RuntimeError( 427 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 428 | f"{__estimators}" 429 | ) 430 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import StackingClassifier 8 | from sklearn.naive_bayes import CategoricalNB 9 | 10 | from pytools.api import AllTracker 11 | 12 | from ..wrapper.stacking import StackingClassifierWrapperDF 13 | from .wrapper import PartialFitClassifierWrapperDF 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["CategoricalNBDF", "StackingClassifierDF"] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | 22 | # 23 | # Ensure all symbols introduced below are included in __all__ 24 | # 25 | 26 | __tracker = AllTracker(globals()) 27 | 28 | 29 | # 30 | # Class definitions 31 | # 32 | 33 | 34 | # 35 | # naive bayes 36 | # 37 | 38 | 39 | class CategoricalNBDF( 40 | PartialFitClassifierWrapperDF[CategoricalNB], native=CategoricalNB 41 | ): 42 | """Stub for DF wrapper of class ``CategoricalNB``""" 43 | 44 | 45 | class StackingClassifierDF( 46 | StackingClassifierWrapperDF[StackingClassifier], native=StackingClassifier 47 | ): 
48 | """Stub for DF wrapper of class ``StackingClassifier``""" 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | 57 | 58 | # 59 | # validate that __all__ comprises all symbols ending in "DF", and no others 60 | # 61 | 62 | __estimators = { 63 | sym 64 | for sym in dir() 65 | if sym.endswith("DF") 66 | and sym not in __imported_estimators 67 | and not sym.startswith("_") 68 | } 69 | if __estimators != set(__all__): 70 | raise RuntimeError( 71 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 72 | f"{__estimators}" 73 | ) 74 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v0_23.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 0.23 onwards 4 | """ 5 | 6 | import logging 7 | from typing import List 8 | 9 | from pytools.api import AllTracker 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | __all__: List[str] = [] 14 | 15 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 16 | 17 | 18 | # 19 | # Ensure all symbols introduced below are included in __all__ 20 | # 21 | 22 | __tracker = AllTracker(globals()) 23 | 24 | 25 | # 26 | # Class definitions 27 | # 28 | 29 | 30 | # todo: add classification implementations for sklearn 0.23 31 | 32 | 33 | __tracker.validate() 34 | 35 | # 36 | # validate that __all__ comprises all symbols ending in "DF", and no others 37 | # 38 | 39 | __estimators = { 40 | sym 41 | for sym in dir() 42 | if sym.endswith("DF") 43 | and sym not in __imported_estimators 44 | and not sym.startswith("_") 45 | } 46 | if __estimators != set(__all__): 47 | raise RuntimeError( 48 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 49 | f"{__estimators}" 50 | ) 51 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import HistGradientBoostingClassifier 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper import ClassifierWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["HistGradientBoostingClassifierDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | 20 | # 21 | # Ensure all symbols introduced below are included in __all__ 22 | # 23 | 24 | __tracker = AllTracker(globals()) 25 | 26 | 27 | # 28 | # ensemble 29 | # 30 | 31 | 32 | class HistGradientBoostingClassifierDF( 33 | ClassifierWrapperDF[HistGradientBoostingClassifier], 34 | native=HistGradientBoostingClassifier, 35 | ): 36 | """Stub for DF wrapper of class ``HistGradientBoostingClassifier``""" 37 | 38 | 39 | # 40 | # validate __all__ 41 | # 42 | 43 | __tracker.validate() 44 | 45 | 46 | # 47 | # validate that __all__ comprises all symbols ending in "DF", and no others 48 | # 49 | 50 | __estimators = { 51 | sym 52 | for sym in dir() 53 | if sym.endswith("DF") 54 | and sym not in __imported_estimators 55 | and not sym.startswith("_") 56 | } 57 | if __estimators != set(__all__): 58 | raise RuntimeError( 59 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 60 
| f"{__estimators}" 61 | ) 62 | -------------------------------------------------------------------------------- /src/sklearndf/classification/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party classifiers that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/classification/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification.extra` 3 | """ 4 | import logging 5 | 6 | from sklearn.base import ClassifierMixin 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ...wrapper import ClassifierWrapperDF, MissingEstimator 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = ["LGBMClassifierDF", "XGBClassifierDF"] 15 | 16 | try: 17 | # import lightgbm classes only if installed 18 | from lightgbm.sklearn import LGBMClassifier 19 | except ImportError: 20 | 21 | class LGBMClassifier( # type: ignore 22 | MissingEstimator, 23 | ClassifierMixin, # type: ignore 24 | ): 25 | """Mock-up for missing estimator.""" 26 | 27 | 28 | try: 29 | # import xgboost classes only if installed 30 | from xgboost import XGBClassifier 31 | except ImportError: 32 | 33 | class XGBClassifier( # type: ignore 34 | MissingEstimator, 35 | ClassifierMixin, # type: ignore 36 | ): 37 | """Mock-up for missing estimator.""" 38 | 39 | 40 | # 41 | # Ensure all symbols introduced below are included in __all__ 42 | # 43 | 44 | __tracker = AllTracker(globals()) 45 | 46 | 47 | # 48 | # Class definitions 49 | # 50 | 51 | 52 | class LGBMClassifierDF(ClassifierWrapperDF[LGBMClassifier], native=LGBMClassifier): 53 | """Stub for DF wrapper of class ``LGBMClassifierDF``""" 54 | 55 | 56 | class XGBClassifierDF(ClassifierWrapperDF[XGBClassifier], native=XGBClassifier): 57 | """Stub for DF wrapper of class ``XGBClassifierDF``""" 58 | 59 | 60 | # 61 | # validate that __all__ 62 | # 63 | 64 | __tracker.validate() 65 | -------------------------------------------------------------------------------- /src/sklearndf/classification/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` classifiers, providing enhanced support for data 3 | frames. 
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/classification/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Generic, List, Optional, Sequence, TypeVar, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from sklearn.base import ClassifierMixin 12 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 13 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 14 | 15 | from pytools.api import AllTracker 16 | 17 | from ...transformation.wrapper import NComponentsDimensionalityReductionWrapperDF 18 | from ...wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | __all__ = [ 23 | "ClassifierChainWrapperDF", 24 | "LinearDiscriminantAnalysisWrapperDF", 25 | "MetaClassifierWrapperDF", 26 | "MultiOutputClassifierWrapperDF", 27 | "PartialFitClassifierWrapperDF", 28 | ] 29 | 30 | # 31 | # Type variables 32 | # 33 | 34 | T_PartialFitClassifierWrapperDF = TypeVar( 35 | "T_PartialFitClassifierWrapperDF", 36 | bound="PartialFitClassifierWrapperDF[ClassifierMixin]", 37 | ) 38 | T_NativeClassifier = TypeVar("T_NativeClassifier", bound=ClassifierMixin) 39 | 40 | 41 | # 42 | # Ensure all symbols introduced below are included in __all__ 43 | # 44 | 45 | __tracker = AllTracker(globals()) 46 | 47 | 48 | # 49 | # Wrapper classes 50 | # 51 | 52 | 53 | class LinearDiscriminantAnalysisWrapperDF( 54 | ClassifierWrapperDF[LinearDiscriminantAnalysis], 55 | NComponentsDimensionalityReductionWrapperDF[LinearDiscriminantAnalysis], 56 | metaclass=ABCMeta, 57 | ): 58 | """ 59 | DF wrapper for 60 | :class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`. 61 | """ 62 | 63 | pass 64 | 65 | 66 | class MetaClassifierWrapperDF( 67 | ClassifierWrapperDF[T_NativeClassifier], 68 | MetaEstimatorWrapperDF[T_NativeClassifier], 69 | Generic[T_NativeClassifier], 70 | metaclass=ABCMeta, 71 | ): 72 | """ 73 | Abstract base class of DF wrappers for classifiers implementing 74 | :class:`sklearn.base.MetaEstimatorMixin`. 75 | """ 76 | 77 | pass 78 | 79 | 80 | class PartialFitClassifierWrapperDF( 81 | ClassifierWrapperDF[T_NativeClassifier], 82 | Generic[T_NativeClassifier], 83 | metaclass=ABCMeta, 84 | ): 85 | """ 86 | Abstract base class of DF wrappers for classifiers implementing 87 | method ``partial_fit()``. 88 | """ 89 | 90 | # noinspection PyPep8Naming 91 | def partial_fit( 92 | self: T_PartialFitClassifierWrapperDF, 93 | X: Union[pd.Series, pd.DataFrame], 94 | y: Union[pd.Series, pd.DataFrame], 95 | classes: Optional[Sequence[Any]] = None, 96 | sample_weight: Optional[pd.Series] = None, 97 | ) -> T_PartialFitClassifierWrapperDF: 98 | """ 99 | Perform incremental fit on a batch of samples. 100 | 101 | This method is meant to be called multiple times for subsets of training 102 | data which, e.g., couldn't fit in the required memory in full. It can be 103 | also used for online learning. 
104 | 105 | :param X: data frame with observations as rows and features as columns 106 | :param y: a series or data frame with one or more outputs per observation 107 | :param classes: all classes present across all calls to ``partial_fit``; 108 | only required for the first call of this method 109 | :param sample_weight: optional weights applied to individual samples 110 | :return: ``self`` 111 | """ 112 | X, y = self._validate_parameter_types(X, y) 113 | self._partial_fit(X, y, classes=classes, sample_weight=sample_weight) 114 | 115 | return self 116 | 117 | # noinspection PyPep8Naming 118 | def _partial_fit( 119 | self: T_PartialFitClassifierWrapperDF, 120 | X: pd.DataFrame, 121 | y: Union[pd.Series, pd.DataFrame], 122 | **partial_fit_params: Optional[Any], 123 | ) -> T_PartialFitClassifierWrapperDF: 124 | return cast( 125 | T_PartialFitClassifierWrapperDF, 126 | self._native_estimator.partial_fit( 127 | self._prepare_X_for_delegate(X), 128 | self._prepare_y_for_delegate(y), 129 | **{ 130 | arg: value 131 | for arg, value in partial_fit_params.items() 132 | if value is not None 133 | }, 134 | ), 135 | ) 136 | 137 | 138 | class MultiOutputClassifierWrapperDF( 139 | MetaClassifierWrapperDF[MultiOutputClassifier], 140 | PartialFitClassifierWrapperDF[MultiOutputClassifier], 141 | metaclass=ABCMeta, 142 | ): 143 | """ 144 | DF wrapper for :class:`sklearn.multioutput.MultiOutputClassifier`. 145 | """ 146 | 147 | # noinspection PyPep8Naming 148 | def _prediction_with_class_labels( 149 | self, 150 | X: pd.DataFrame, 151 | prediction: Union[ 152 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any] 153 | ], 154 | classes: Optional[Sequence[Any]] = None, 155 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]: 156 | # if we have a multi-output classifier, prediction of probabilities 157 | # yields a list of NumPy arrays 158 | if not isinstance(prediction, list): 159 | raise ValueError( 160 | "prediction of multi-output classifier expected to be a list of NumPy " 161 | f"arrays, but got type {type(prediction)}" 162 | ) 163 | 164 | delegate_estimator = self.native_estimator 165 | 166 | # store the super() object as this is not available within a generator 167 | _super = cast(ClassifierWrapperDF[MultiOutputClassifier], super()) 168 | 169 | # estimators attribute of abstract class MultiOutputEstimator 170 | # usually the delegate estimator will provide a list of estimators used 171 | # to predict each output. If present, use these estimators to get 172 | # individual class labels for each output; otherwise we cannot assign class 173 | # labels 174 | estimators = getattr(delegate_estimator, "estimators_", None) 175 | if estimators is None: 176 | return [ 177 | _super._prediction_with_class_labels(X=X, prediction=output) 178 | for output in prediction 179 | ] 180 | else: 181 | return [ 182 | _super._prediction_with_class_labels( 183 | X=X, prediction=output, classes=getattr(estimator, "classes_", None) 184 | ) 185 | for estimator, output in zip(estimators, prediction) 186 | ] 187 | 188 | 189 | class ClassifierChainWrapperDF( 190 | MetaEstimatorWrapperDF[ClassifierChain], 191 | ClassifierWrapperDF[ClassifierChain], 192 | metaclass=ABCMeta, 193 | ): 194 | """ 195 | DF wrapper for :class:`sklearn.multioutput.ClassifierChain`. 
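    Note that class labels for the chained outputs are currently not inferred from
    the underlying estimators; predictions are labelled with the integer output
    positions ``0, ..., n_outputs_ - 1`` instead (see
    ``_prediction_with_class_labels`` below).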
196 | """ 197 | 198 | # noinspection PyPep8Naming 199 | def _prediction_with_class_labels( 200 | self, 201 | X: pd.DataFrame, 202 | prediction: Union[ 203 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any] 204 | ], 205 | classes: Optional[Sequence[Any]] = None, 206 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]: 207 | # todo: infer actual class names 208 | return super()._prediction_with_class_labels( 209 | X, prediction, classes=range(self.n_outputs_) 210 | ) 211 | 212 | 213 | # 214 | # Validate __all__ 215 | # 216 | 217 | __tracker.validate() 218 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of `scikit-learn` clusterers with enhanced support for data 3 | frames. 4 | """ 5 | 6 | from .. import __sklearn_1_1__, __sklearn_1_3__, __sklearn_version__ 7 | from ._clustering import * 8 | 9 | if __sklearn_version__ >= __sklearn_1_1__: 10 | from ._clustering_v1_1 import * 11 | 12 | if __sklearn_version__ >= __sklearn_1_3__: 13 | from ._clustering_v1_3 import * 14 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.clustering` 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import ( 7 | DBSCAN, 8 | OPTICS, 9 | AffinityPropagation, 10 | AgglomerativeClustering, 11 | Birch, 12 | FeatureAgglomeration, 13 | KMeans, 14 | MeanShift, 15 | MiniBatchKMeans, 16 | SpectralClustering, 17 | ) 18 | 19 | from pytools.api import AllTracker 20 | 21 | from ..wrapper import ClusterWrapperDF 22 | from .wrapper import FeatureAgglomerationWrapperDF, KMeansBaseWrapperDF 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | __all__ = [ 27 | "AffinityPropagationDF", 28 | "AgglomerativeClusteringDF", 29 | "BirchDF", 30 | "DBSCANDF", 31 | "FeatureAgglomerationDF", 32 | "KMeansDF", 33 | "MeanShiftDF", 34 | "MiniBatchKMeansDF", 35 | "OPTICSDF", 36 | "SpectralClusteringDF", 37 | ] 38 | 39 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 40 | 41 | 42 | # 43 | # Ensure all symbols introduced below are included in __all__ 44 | # 45 | 46 | __tracker = AllTracker(globals()) 47 | 48 | 49 | # 50 | # Class definitions 51 | # 52 | 53 | 54 | class AffinityPropagationDF( 55 | ClusterWrapperDF[AffinityPropagation], native=AffinityPropagation 56 | ): 57 | """Stub for DF wrapper of class ``AffinityPropagation``""" 58 | 59 | 60 | class AgglomerativeClusteringDF( 61 | ClusterWrapperDF[AgglomerativeClustering], native=AgglomerativeClustering 62 | ): 63 | """Stub for DF wrapper of class ``AgglomerativeClustering``""" 64 | 65 | 66 | class BirchDF(ClusterWrapperDF[Birch], native=Birch): 67 | """Stub for DF wrapper of class ``Birch``""" 68 | 69 | 70 | class DBSCANDF(ClusterWrapperDF[DBSCAN], native=DBSCAN): 71 | """Stub for DF wrapper of class ``DBSCAN``""" 72 | 73 | 74 | class KMeansDF(KMeansBaseWrapperDF[KMeans], native=KMeans): 75 | """Stub for DF wrapper of class ``KMeans``""" 76 | 77 | 78 | class MiniBatchKMeansDF(KMeansBaseWrapperDF[MiniBatchKMeans], native=MiniBatchKMeans): 79 | """Stub for DF wrapper of class ``MiniBatchKMeans``""" 80 | 81 | 82 | class MeanShiftDF(ClusterWrapperDF[MeanShift], native=MeanShift): 83 | """Stub for DF wrapper of class ``MeanShift``""" 84 | 85 | 86 | class 
OPTICSDF(ClusterWrapperDF[OPTICS], native=OPTICS): 87 | """Stub for DF wrapper of class ``OPTICS``""" 88 | 89 | 90 | class SpectralClusteringDF( 91 | ClusterWrapperDF[SpectralClustering], native=SpectralClustering 92 | ): 93 | """Stub for DF wrapper of class ``SpectralClustering``""" 94 | 95 | 96 | class FeatureAgglomerationDF( 97 | FeatureAgglomerationWrapperDF, native=FeatureAgglomeration 98 | ): 99 | """Stub for DF wrapper of class ``FeatureAgglomeration``""" 100 | 101 | 102 | # 103 | # Validate __all__ 104 | # 105 | 106 | __tracker.validate() 107 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering_v1_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.1 onwards 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import BisectingKMeans 7 | 8 | from pytools.api import AllTracker 9 | 10 | from .wrapper import KMeansBaseWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = [ 15 | "BisectingKMeansDF", 16 | ] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # Class definitions 30 | # 31 | 32 | 33 | class BisectingKMeansDF(KMeansBaseWrapperDF[BisectingKMeans], native=BisectingKMeans): 34 | """Stub for DF wrapper of class ``BisectingKMeans``""" 35 | 36 | 37 | # 38 | # Validate __all__ 39 | # 40 | 41 | __tracker.validate() 42 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering_v1_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.3 onwards 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import HDBSCAN 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ..wrapper import ClusterWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = [ 15 | "HDBSCANDF", 16 | ] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # Class definitions 30 | # 31 | 32 | 33 | class HDBSCANDF(ClusterWrapperDF[HDBSCAN], native=HDBSCAN): 34 | """Stub for DF wrapper of class ``HDBSCAN``""" 35 | 36 | 37 | # 38 | # Validate __all__ 39 | # 40 | 41 | __tracker.validate() 42 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` clusterers, providing enhanced support for data 3 | frames.
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.clustering.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Generic, TypeVar 8 | 9 | import pandas as pd 10 | from sklearn.cluster import FeatureAgglomeration, KMeans, MiniBatchKMeans 11 | 12 | from pytools.api import AllTracker 13 | from pytools.fit import fitted_only 14 | 15 | from sklearndf.transformation.wrapper import ColumnPreservingTransformerWrapperDF 16 | from sklearndf.wrapper import ClusterWrapperDF 17 | 18 | log = logging.getLogger(__name__) 19 | 20 | __all__ = [ 21 | "KMeansBaseWrapperDF", 22 | "FeatureAgglomerationWrapperDF", 23 | ] 24 | 25 | # 26 | # Type variables 27 | # 28 | 29 | T_NativeKMeans = TypeVar("T_NativeKMeans", KMeans, MiniBatchKMeans) 30 | 31 | 32 | # 33 | # Ensure all symbols introduced below are included in __all__ 34 | # 35 | 36 | __tracker = AllTracker(globals()) 37 | 38 | 39 | # 40 | # Wrapper classes 41 | # 42 | 43 | 44 | # noinspection PyPep8Naming 45 | class KMeansBaseWrapperDF( 46 | ClusterWrapperDF[T_NativeKMeans], Generic[T_NativeKMeans], metaclass=ABCMeta 47 | ): 48 | """ 49 | DF wrapper for KMeans-like algorithms, e.g., :class:`sklearn.cluster.KMeans`. 50 | """ 51 | 52 | #: the name of the index representing clusters 53 | IDX_CLUSTER = "cluster" 54 | 55 | @property 56 | @fitted_only(not_fitted_error=AttributeError) 57 | def cluster_centers_(self) -> pd.DataFrame: 58 | """ 59 | The cluster centers as a data frame, with clusters as rows and feature values 60 | as columns. 61 | 62 | :raises AttributeError: the clusterer is not fitted 63 | """ 64 | 65 | raw_cluster_centers = self._native_estimator.cluster_centers_ 66 | return pd.DataFrame( 67 | raw_cluster_centers, 68 | columns=self.feature_names_in_, 69 | index=pd.RangeIndex( 70 | len(raw_cluster_centers), name=KMeansBaseWrapperDF.IDX_CLUSTER 71 | ), 72 | ) 73 | 74 | 75 | class FeatureAgglomerationWrapperDF( 76 | ClusterWrapperDF[FeatureAgglomeration], 77 | ColumnPreservingTransformerWrapperDF[FeatureAgglomeration], 78 | metaclass=ABCMeta, 79 | ): 80 | """ 81 | DF wrapper for FeatureAgglomeration that combines clusterer and transformer. 82 | """ 83 | 84 | pass 85 | 86 | 87 | # 88 | # Validate __all__ 89 | # 90 | 91 | __tracker.validate() 92 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` pipelines with enhanced support for data 3 | frames. 
4 | """ 5 | from ._learner_pipeline import * 6 | from ._pipeline import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.pipeline` 3 | """ 4 | 5 | import logging 6 | 7 | from sklearn.pipeline import Pipeline 8 | 9 | from pytools.api import AllTracker 10 | 11 | from .wrapper import FeatureUnionSparseFrames, FeatureUnionWrapperDF, PipelineWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["PipelineDF", "FeatureUnionDF"] 16 | 17 | 18 | # 19 | # Ensure all symbols introduced below are included in __all__ 20 | # 21 | 22 | __tracker = AllTracker(globals()) 23 | 24 | 25 | # 26 | # Class definitions 27 | # 28 | 29 | 30 | class PipelineDF(PipelineWrapperDF, native=Pipeline): 31 | """Stub for DF wrapper of class ``Pipeline``""" 32 | 33 | 34 | class FeatureUnionDF(FeatureUnionWrapperDF, native=FeatureUnionSparseFrames): 35 | """Stub for DF wrapper of class ``FeatureUnion``""" 36 | 37 | 38 | # 39 | # Validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes to enhance the functionality of native pipeline classes conforming with 3 | the `scikit-learn` API. 4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.pipeline.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Dict, Iterator, List, Sequence, Tuple, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from pandas.core.arrays import ExtensionArray 12 | from scipy import sparse 13 | from sklearn.pipeline import FeatureUnion, Pipeline 14 | from sklearn.preprocessing import FunctionTransformer 15 | 16 | from pytools.api import AllTracker 17 | 18 | from ..._util import hstack_frames 19 | from sklearndf import EstimatorDF, TransformerDF 20 | from sklearndf.wrapper import ( 21 | ClassifierWrapperDF, 22 | RegressorWrapperDF, 23 | TransformerWrapperDF, 24 | ) 25 | 26 | log = logging.getLogger(__name__) 27 | 28 | __all__ = [ 29 | "FeatureUnionSparseFrames", 30 | "FeatureUnionWrapperDF", 31 | "PipelineWrapperDF", 32 | ] 33 | 34 | 35 | # 36 | # Ensure all symbols introduced below are included in __all__ 37 | # 38 | 39 | __tracker = AllTracker(globals()) 40 | 41 | 42 | # 43 | # Class definitions 44 | # 45 | 46 | 47 | class PipelineWrapperDF( 48 | ClassifierWrapperDF[Pipeline], 49 | RegressorWrapperDF[Pipeline], 50 | TransformerWrapperDF[Pipeline], 51 | metaclass=ABCMeta, 52 | ): 53 | """ 54 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.Pipeline`. 55 | """ 56 | 57 | __native_base_class__ = Pipeline 58 | 59 | #: Placeholder that can be used in place of an estimator to designate a pipeline 60 | #: step that preserves the original ingoing data. 
61 | PASSTHROUGH = "passthrough" 62 | 63 | def _validate_delegate_estimator(self) -> None: 64 | # ensure that all steps support data frames, and that all except the last 65 | # step are data frame transformers 66 | 67 | steps = self.steps 68 | 69 | if len(steps) == 0: 70 | return 71 | 72 | for name, transformer in steps[:-1]: 73 | if not ( 74 | self._is_passthrough(transformer) 75 | or isinstance(transformer, TransformerDF) 76 | ): 77 | raise ValueError( 78 | f"expected step {name!r} to be a {TransformerDF.__name__}, " 79 | f"or {PipelineWrapperDF.PASSTHROUGH}, but found an instance of " 80 | f"{type(transformer).__name__}" 81 | ) 82 | 83 | final_step = steps[-1] 84 | final_estimator = final_step[1] 85 | if not ( 86 | self._is_passthrough(final_estimator) 87 | or isinstance(final_estimator, EstimatorDF) 88 | ): 89 | raise ValueError( 90 | f"expected final step {final_step[0]!r} to be an " 91 | f"{EstimatorDF.__name__} or {PipelineWrapperDF.PASSTHROUGH}, " 92 | f"but found an instance of {type(final_estimator).__name__}" 93 | ) 94 | 95 | @property 96 | def steps(self) -> List[Tuple[str, EstimatorDF]]: 97 | """ 98 | The ``steps`` attribute of the underlying :class:`~sklearn.pipeline.Pipeline`. 99 | 100 | List of (name, transformer) tuples (transformers implement fit/transform). 101 | """ 102 | return cast(List[Tuple[str, EstimatorDF]], self.native_estimator.steps) 103 | 104 | def __len__(self) -> int: 105 | """The number of steps of the pipeline.""" 106 | return len(self.native_estimator.steps) 107 | 108 | def __getitem__(self, ind: Union[slice, int, str]) -> EstimatorDF: 109 | """ 110 | Return a sub-pipeline or a single estimator in the pipeline 111 | 112 | Indexing with an integer will return an estimator; using a slice 113 | returns another Pipeline instance which copies a slice of this 114 | Pipeline. This copy is shallow: modifying (or fitting) estimators in 115 | the sub-pipeline will affect the larger pipeline and vice-versa. 116 | However, replacing a value in ``steps`` will not change a copy. 117 | """ 118 | 119 | if isinstance(ind, slice): 120 | base_pipeline = self.native_estimator 121 | if ind.step not in (1, None): 122 | raise ValueError("Pipeline slicing only supports a step of 1") 123 | 124 | return cast( 125 | EstimatorDF, 126 | self.__class__( 127 | steps=base_pipeline.steps[ind], 128 | memory=base_pipeline.memory, 129 | verbose=base_pipeline.verbose, 130 | ), 131 | ) 132 | else: 133 | return cast(EstimatorDF, self.native_estimator[ind]) 134 | 135 | @staticmethod 136 | def _is_passthrough(estimator: Union[EstimatorDF, str, None]) -> bool: 137 | # return True if the estimator is a "passthrough" (i.e. 
identity) transformer 138 | # in the pipeline 139 | return estimator is None or estimator == PipelineWrapperDF.PASSTHROUGH 140 | 141 | def _transformer_steps(self) -> Iterator[Tuple[str, TransformerDF]]: 142 | # make an iterator of all transform steps, i.e., excluding the final step 143 | # in case it is not a transformer 144 | # excludes steps whose transformer is ``None`` or ``"passthrough"`` 145 | 146 | def _iter_not_none( 147 | transformer_steps: Sequence[Tuple[str, EstimatorDF]] 148 | ) -> Iterator[Tuple[str, TransformerDF]]: 149 | return ( 150 | (name, cast(TransformerDF, transformer)) 151 | for name, transformer in transformer_steps 152 | if not self._is_passthrough(transformer) 153 | ) 154 | 155 | steps = self.steps 156 | 157 | if len(steps) == 0: 158 | return iter([]) 159 | 160 | final_estimator = steps[-1][1] 161 | 162 | if isinstance(final_estimator, TransformerDF): 163 | return _iter_not_none(steps) 164 | else: 165 | return _iter_not_none(steps[:-1]) 166 | 167 | def _get_features_original(self) -> pd.Series: 168 | col_mappings = [ 169 | df_transformer.feature_names_original_ 170 | for _, df_transformer in self._transformer_steps() 171 | ] 172 | 173 | _features_out: pd.Index 174 | _features_original: Union[npt.NDArray[Any], ExtensionArray] 175 | 176 | if len(col_mappings) == 0: 177 | _features_out = self.feature_names_in_ 178 | _features_original = _features_out.values 179 | else: 180 | _features_out = col_mappings[-1].index 181 | _features_original = col_mappings[-1].values 182 | 183 | # iterate backwards starting from the penultimate item 184 | for preceding_out_to_original_mapping in col_mappings[-2::-1]: 185 | # join the original columns of my current transformer on the out columns 186 | # in the preceding transformer, then repeat 187 | if not all( 188 | feature in preceding_out_to_original_mapping 189 | for feature in _features_original 190 | ): 191 | unknown_features = set(_features_original) - set( 192 | preceding_out_to_original_mapping 193 | ) 194 | raise KeyError( 195 | f"unknown features encountered while tracing original " 196 | f"features along pipeline: {unknown_features}" 197 | ) 198 | _features_original = preceding_out_to_original_mapping.loc[ 199 | _features_original 200 | ].values 201 | 202 | return pd.Series(index=_features_out, data=_features_original) 203 | 204 | def _get_features_out(self) -> pd.Index: 205 | for _, transformer in reversed(self.steps): 206 | if isinstance(transformer, TransformerDF): 207 | return transformer.feature_names_out_ 208 | 209 | return self.feature_names_in_ 210 | 211 | @property 212 | def _estimator_type(self) -> str: 213 | # noinspection PyProtectedMember 214 | return cast(str, self.native_estimator._estimator_type) 215 | 216 | def _more_tags(self) -> Dict[str, Any]: 217 | return cast( 218 | Dict[str, Any], getattr(self.native_estimator, "_more_tags", lambda: {})() 219 | ) 220 | 221 | 222 | class FeatureUnionSparseFrames( 223 | FeatureUnion, # type:ignore 224 | ): 225 | """ 226 | FeatureUnion transformer that returns sparse data frames instead of arrays if one or 227 | more of its transformers return a sparse data frame. 
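    If all transformer outputs are data frames, they are stacked horizontally with
    :func:`~sklearndf._util.hstack_frames`, prefixing each output column with the
    name of the transformer that produced it (``<transformer name>__<column>``).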
228 | """ 229 | 230 | # noinspection PyPep8Naming 231 | def _hstack( 232 | self, Xs: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]] 233 | ) -> Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]: 234 | stacked_frames = hstack_frames( 235 | Xs, prefixes=[name for name, _ in self.transformer_list] 236 | ) 237 | if stacked_frames is None: 238 | return super()._hstack(Xs) 239 | else: 240 | return stacked_frames 241 | 242 | 243 | class FeatureUnionWrapperDF( 244 | TransformerWrapperDF[FeatureUnionSparseFrames], metaclass=ABCMeta 245 | ): 246 | """ 247 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.FeatureUnion`. 248 | """ 249 | 250 | DROP = "drop" 251 | PASSTHROUGH = "passthrough" 252 | 253 | __native_base_class__ = FeatureUnionSparseFrames 254 | 255 | @staticmethod 256 | def _prepend_features_out(features_out: pd.Index, name_prefix: str) -> pd.Index: 257 | return pd.Index(data=f"{name_prefix}__" + features_out.astype(str)) 258 | 259 | def _get_features_original(self) -> pd.Series: 260 | # concatenate output-to-input mappings from all included transformers other than 261 | # ones stated as ``None`` or ``"drop"`` or any other string 262 | 263 | # prepend the name of the transformer so the resulting feature name is 264 | # `__ 265 | 266 | def _prepend_features_original( 267 | features_original: pd.Series, name_prefix: str 268 | ) -> pd.Series: 269 | return pd.Series( 270 | data=features_original.values, 271 | index=self._prepend_features_out( 272 | features_out=features_original.index, name_prefix=name_prefix 273 | ), 274 | ) 275 | 276 | # noinspection PyProtectedMember 277 | return pd.concat( 278 | objs=( 279 | _prepend_features_original( 280 | features_original=transformer.feature_names_original_, 281 | name_prefix=name, 282 | ) 283 | for name, transformer, _ in self.native_estimator._iter() 284 | ) 285 | ) 286 | 287 | def _get_features_out(self) -> pd.Index: 288 | # concatenate output columns from all included transformers other than 289 | # ones stated as ``None`` or ``"drop"`` or any other string 290 | 291 | # prepend the name of the transformer so the resulting feature name is 292 | # `__ 293 | 294 | name: str 295 | transformer: Union[TransformerDF, str, FunctionTransformer] 296 | 297 | indices = [ 298 | self._prepend_features_out( 299 | features_out=( 300 | self._get_features_in() 301 | if ( 302 | isinstance(transformer, FunctionTransformer) 303 | and transformer.func is None 304 | ) 305 | else cast(TransformerDF, transformer).feature_names_out_ 306 | ), 307 | name_prefix=name, 308 | ) 309 | for name, transformer in self.native_estimator.transformer_list 310 | if transformer != FeatureUnionWrapperDF.DROP 311 | ] 312 | 313 | if len(indices) == 0: 314 | return pd.Index() 315 | else: 316 | return indices[0].append(indices[1:]) 317 | 318 | 319 | # 320 | # Validate __all__ 321 | # 322 | 323 | __tracker.validate() 324 | -------------------------------------------------------------------------------- /src/sklearndf/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/src/sklearndf/py.typed -------------------------------------------------------------------------------- /src/sklearndf/regression/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` regressors with enhanced support for data 3 | frames. 
4 | """ 5 | from ._regression import * 6 | from ._regression_v0_22 import * 7 | from ._regression_v0_23 import * 8 | from ._regression_v1_0 import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import StackingRegressor 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper.stacking import StackingRegressorWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["StackingRegressorDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | 20 | # 21 | # Ensure all symbols introduced below are included in __all__ 22 | # 23 | 24 | __tracker = AllTracker(globals()) 25 | 26 | 27 | # 28 | # Class definitions 29 | # 30 | 31 | 32 | class StackingRegressorDF( 33 | StackingRegressorWrapperDF[StackingRegressor], native=StackingRegressor 34 | ): 35 | """Stub for DF wrapper of class ``StackingRegressor``""" 36 | 37 | 38 | # 39 | # validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | 44 | 45 | # 46 | # validate that __all__ comprises all symbols ending in "DF", and no others 47 | # 48 | 49 | 50 | __estimators = { 51 | sym 52 | for sym in dir() 53 | if sym.endswith("DF") 54 | and sym not in __imported_estimators 55 | and not sym.startswith("_") 56 | } 57 | if __estimators != set(__all__): 58 | raise RuntimeError( 59 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 60 | f"{__estimators}" 61 | ) 62 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v0_23.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 0.23 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.linear_model import GammaRegressor, PoissonRegressor, TweedieRegressor 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper import RegressorWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = [ 16 | "GammaRegressorDF", 17 | "PoissonRegressorDF", 18 | "TweedieRegressorDF", 19 | ] 20 | 21 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 22 | 23 | # 24 | # type variables 25 | # 26 | 27 | 28 | # 29 | # Ensure all symbols introduced below are included in __all__ 30 | # 31 | 32 | __tracker = AllTracker(globals()) 33 | 34 | 35 | # 36 | # Class definitions 37 | # 38 | 39 | 40 | class PoissonRegressorDF(RegressorWrapperDF[PoissonRegressor], native=PoissonRegressor): 41 | """Stub for DF wrapper of class ``PoissonRegressor``""" 42 | 43 | 44 | class GammaRegressorDF(RegressorWrapperDF[GammaRegressor], native=GammaRegressor): 45 | """Stub for DF wrapper of class ``GammaRegressor``""" 46 | 47 | 48 | class TweedieRegressorDF(RegressorWrapperDF[TweedieRegressor], native=TweedieRegressor): 49 | """Stub for DF wrapper of class ``TweedieRegressor``""" 50 | 51 | 52 | # 53 | # validate __all__ 54 | # 55 | 56 | __tracker.validate() 57 | 58 | 59 | # 60 | # validate that __all__ comprises all symbols ending in "DF", and no others 61 | # 62 | 63 | __estimators = { 64 | sym 65 | for sym in dir() 66 | if sym.endswith("DF") 67 | and sym not in __imported_estimators 68 | and not 
sym.startswith("_") 69 | } 70 | if __estimators != set(__all__): 71 | raise RuntimeError( 72 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 73 | f"{__estimators}" 74 | ) 75 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import HistGradientBoostingRegressor 8 | from sklearn.linear_model import QuantileRegressor 9 | 10 | from pytools.api import AllTracker 11 | 12 | from ..wrapper import RegressorWrapperDF 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | __all__ = ["HistGradientBoostingRegressorDF", "QuantileRegressorDF"] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # ensemble 30 | # 31 | 32 | 33 | class HistGradientBoostingRegressorDF( 34 | RegressorWrapperDF[HistGradientBoostingRegressor], 35 | native=HistGradientBoostingRegressor, 36 | ): 37 | """Stub for DF wrapper of class ``HistGradientBoostingRegressor``""" 38 | 39 | 40 | # 41 | # linear model 42 | # 43 | 44 | 45 | class QuantileRegressorDF( 46 | RegressorWrapperDF[QuantileRegressor], native=QuantileRegressor 47 | ): 48 | """Stub for DF wrapper of class ``QuantileRegressor``""" 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | 57 | 58 | # 59 | # validate that __all__ comprises all symbols ending in "DF", and no others 60 | # 61 | 62 | __estimators = { 63 | sym 64 | for sym in dir() 65 | if sym.endswith("DF") 66 | and sym not in __imported_estimators 67 | and not sym.startswith("_") 68 | } 69 | if __estimators != set(__all__): 70 | raise RuntimeError( 71 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 72 | f"{__estimators}" 73 | ) 74 | -------------------------------------------------------------------------------- /src/sklearndf/regression/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party regressors that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 
7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/regression/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression.extra` 3 | """ 4 | import logging 5 | 6 | from sklearn.base import RegressorMixin 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ...wrapper import MissingEstimator, RegressorWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = ["LGBMRegressorDF", "XGBRegressorDF"] 15 | 16 | try: 17 | # import lightgbm classes only if installed 18 | from lightgbm.sklearn import LGBMRegressor 19 | 20 | except ImportError: 21 | 22 | class LGBMRegressor( # type: ignore 23 | MissingEstimator, 24 | RegressorMixin, # type: ignore 25 | ): 26 | """Mock-up for missing estimator.""" 27 | 28 | 29 | try: 30 | # import xgboost classes only if installed 31 | from xgboost import XGBRegressor 32 | 33 | except ImportError: 34 | 35 | class XGBRegressor( # type: ignore 36 | MissingEstimator, 37 | RegressorMixin, # type: ignore 38 | ): 39 | """Mock-up for missing estimator.""" 40 | 41 | 42 | # 43 | # Ensure all symbols introduced below are included in __all__ 44 | # 45 | 46 | __tracker = AllTracker(globals()) 47 | 48 | 49 | # 50 | # Class definitions 51 | # 52 | 53 | 54 | class LGBMRegressorDF(RegressorWrapperDF[LGBMRegressor], native=LGBMRegressor): 55 | """Stub for DF wrapper of class ``LGBMRegressorDF``""" 56 | 57 | 58 | class XGBRegressorDF(RegressorWrapperDF[XGBRegressor], native=XGBRegressor): 59 | """Stub for DF wrapper of class ``XGBRegressorDF``""" 60 | 61 | 62 | # 63 | # validate __all__ 64 | # 65 | 66 | __tracker.validate() 67 | -------------------------------------------------------------------------------- /src/sklearndf/regression/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` regressors, providing enhanced support for data 3 | frames. 
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/regression/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Generic, Optional, TypeVar, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from sklearn.base import RegressorMixin 12 | from sklearn.isotonic import IsotonicRegression 13 | from sklearn.multioutput import MultiOutputRegressor 14 | 15 | from pytools.api import AllTracker 16 | 17 | from ...transformation.wrapper import ( 18 | ColumnPreservingTransformerWrapperDF, 19 | NumpyTransformerWrapperDF, 20 | SingleColumnTransformerWrapperDF, 21 | ) 22 | from ...wrapper import MetaEstimatorWrapperDF, RegressorWrapperDF 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | __all__ = [ 27 | "IsotonicRegressionWrapperDF", 28 | "MetaRegressorWrapperDF", 29 | "RegressorTransformerWrapperDF", 30 | "PartialFitRegressorWrapperDF", 31 | "MultiOutputRegressorWrapperDF", 32 | ] 33 | 34 | 35 | # 36 | # type variables 37 | # 38 | 39 | T_PartialFitRegressorWrapperDF = TypeVar( 40 | "T_PartialFitRegressorWrapperDF", 41 | bound="PartialFitRegressorWrapperDF[RegressorMixin]", 42 | ) 43 | T_NativeRegressor = TypeVar("T_NativeRegressor", bound=RegressorMixin) 44 | 45 | 46 | # 47 | # Ensure all symbols introduced below are included in __all__ 48 | # 49 | 50 | __tracker = AllTracker(globals()) 51 | 52 | 53 | # 54 | # Class definitions 55 | # 56 | 57 | 58 | class MetaRegressorWrapperDF( 59 | MetaEstimatorWrapperDF[T_NativeRegressor], 60 | RegressorWrapperDF[T_NativeRegressor], 61 | Generic[T_NativeRegressor], 62 | metaclass=ABCMeta, 63 | ): 64 | """ 65 | Abstract base class of DF wrappers for regressors implementing 66 | :class:`sklearn.base.MetaEstimatorMixin`. 67 | """ 68 | 69 | pass 70 | 71 | 72 | class PartialFitRegressorWrapperDF( 73 | RegressorWrapperDF[T_NativeRegressor], 74 | Generic[T_NativeRegressor], 75 | metaclass=ABCMeta, 76 | ): 77 | """ 78 | Abstract base class of DF wrappers for regressors implementing 79 | method ``partial_fit()``. 80 | """ 81 | 82 | # noinspection PyPep8Naming 83 | def partial_fit( 84 | self: T_PartialFitRegressorWrapperDF, 85 | X: Union[pd.Series, pd.DataFrame], 86 | y: Union[pd.Series, pd.DataFrame], 87 | sample_weight: Optional[pd.Series] = None, 88 | ) -> T_PartialFitRegressorWrapperDF: 89 | """ 90 | Perform incremental fit on a batch of samples. 91 | 92 | This method is meant to be called multiple times for subsets of training 93 | data which, e.g., couldn't fit in the required memory in full. It can be 94 | also used for online learning. 
95 | 96 | :param X: data frame with observations as rows and features as columns 97 | :param y: a series or data frame with one or more outputs per observation 98 | :param sample_weight: optional weights applied to individual samples 99 | :return: ``self`` 100 | """ 101 | X, y = self._validate_parameter_types(X, y) 102 | self._partial_fit(X, y, sample_weight=sample_weight) 103 | 104 | return self 105 | 106 | # noinspection PyPep8Naming 107 | def _partial_fit( 108 | self: T_PartialFitRegressorWrapperDF, 109 | X: pd.DataFrame, 110 | y: Union[pd.Series, pd.DataFrame], 111 | **partial_fit_params: Optional[Any], 112 | ) -> T_PartialFitRegressorWrapperDF: 113 | return cast( 114 | T_PartialFitRegressorWrapperDF, 115 | self._native_estimator.partial_fit( 116 | self._prepare_X_for_delegate(X), 117 | self._prepare_y_for_delegate(y), 118 | **{ 119 | arg: value 120 | for arg, value in partial_fit_params.items() 121 | if value is not None 122 | }, 123 | ), 124 | ) 125 | 126 | 127 | class MultiOutputRegressorWrapperDF( 128 | MetaRegressorWrapperDF[MultiOutputRegressor], 129 | PartialFitRegressorWrapperDF[MultiOutputRegressor], 130 | ): 131 | """ 132 | Abstract base class of DF wrappers for multi-output regressors. 133 | """ 134 | 135 | pass 136 | 137 | 138 | class RegressorTransformerWrapperDF( 139 | RegressorWrapperDF[T_NativeRegressor], 140 | ColumnPreservingTransformerWrapperDF[T_NativeRegressor], 141 | Generic[T_NativeRegressor], 142 | metaclass=ABCMeta, 143 | ): 144 | """ 145 | DF wrapper for combined regressors and column preserving transformers. 146 | """ 147 | 148 | pass 149 | 150 | 151 | class IsotonicRegressionWrapperDF( 152 | RegressorTransformerWrapperDF[IsotonicRegression], 153 | SingleColumnTransformerWrapperDF[IsotonicRegression], 154 | NumpyTransformerWrapperDF[IsotonicRegression], 155 | metaclass=ABCMeta, 156 | ): 157 | """ 158 | DF wrapper for :class:`sklearn.isotonic.IsotonicRegression`. 159 | """ 160 | 161 | # noinspection PyPep8Naming 162 | def _adjust_X_type_for_delegate(self, X: pd.DataFrame) -> npt.NDArray[Any]: 163 | arr = super()._adjust_X_type_for_delegate(X) 164 | return arr.ravel() 165 | 166 | 167 | # 168 | # Validate __all__ 169 | # 170 | 171 | __tracker.validate() 172 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` transformers with enhanced support for data 3 | frames. 4 | """ 5 | 6 | from .. import __sklearn_1_1__, __sklearn_version__ 7 | from ._transformation import * 8 | from ._transformation_v0_22 import * 9 | from ._transformation_v0_24 import * 10 | from ._transformation_v1_0 import * 11 | 12 | if __sklearn_version__ >= __sklearn_1_1__: 13 | from ._transformation_v1_1 import * 14 | 15 | from .. 
import __sklearn_1_3__ 16 | 17 | if __sklearn_version__ >= __sklearn_1_3__: 18 | from ._transformation_v1_3 import * 19 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.impute import KNNImputer 10 | 11 | from pytools.api import AllTracker 12 | 13 | from .wrapper import ImputerWrapperDF 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["KNNImputerDF"] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | 22 | # 23 | # Ensure all symbols introduced below are included in __all__ 24 | # 25 | 26 | __tracker = AllTracker(globals()) 27 | 28 | 29 | # 30 | # impute 31 | # 32 | 33 | 34 | class KNNImputerDF(ImputerWrapperDF[KNNImputer], native=KNNImputer): 35 | """Stub for DF wrapper of class ``KNNImputer``""" 36 | 37 | 38 | # 39 | # validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | 44 | 45 | # 46 | # validate that __all__ comprises all symbols ending in "DF", and no others 47 | # 48 | 49 | __estimators = [ 50 | sym 51 | for sym in dir() 52 | if sym.endswith("DF") 53 | and sym not in __imported_estimators 54 | and not sym.startswith("_") 55 | ] 56 | if set(__estimators) != set(__all__): 57 | raise RuntimeError( 58 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 59 | f"{__estimators}" 60 | ) 61 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v0_24.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 0.24 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.feature_selection import SequentialFeatureSelector 10 | from sklearn.kernel_approximation import PolynomialCountSketch 11 | 12 | from sklearndf.transformation.wrapper import ( 13 | FeatureSelectionWrapperDF, 14 | NComponentsDimensionalityReductionWrapperDF, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | __all__ = ["PolynomialCountSketchDF", "SequentialFeatureSelectorDF"] 20 | 21 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 22 | 23 | # 24 | # Transformers which have an n_components attribute 25 | # Implemented through NComponentsDimensionalityReductionWrapperDF 26 | # 27 | 28 | 29 | class PolynomialCountSketchDF( 30 | NComponentsDimensionalityReductionWrapperDF[PolynomialCountSketch], 31 | native=PolynomialCountSketch, 32 | ): 33 | """Stub for DF wrapper of class ``PolynomialCountSketch``""" 34 | 35 | 36 | # 37 | # feature_selection 38 | # 39 | # Transformers with a get_support method, implemented via FeatureSelectionWrapperDF 40 | # 41 | 42 | 43 | class SequentialFeatureSelectorDF( 44 | FeatureSelectionWrapperDF[SequentialFeatureSelector], 45 | native=SequentialFeatureSelector, 46 | ): 47 | """Stub for DF wrapper of class ``SequentialFeatureSelector``""" 48 | 49 | 50 | # 51 | # validate that __all__ comprises all symbols ending in "DF", and no others 52 | # 53 | 54 | __estimators = [ 55 | sym 56 | for sym in dir() 57 | if sym.endswith("DF") 58 | and sym not in __imported_estimators 59 | and not sym.startswith("_") 60 | ] 61 | if set(__estimators) != 
set(__all__): 62 | raise RuntimeError( 63 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 64 | f"{__estimators}" 65 | ) 66 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.preprocessing import SplineTransformer 10 | 11 | from .wrapper import PolynomialTransformerWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["SplineTransformerDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | # 20 | # preprocessing 21 | # 22 | 23 | 24 | class SplineTransformerDF( 25 | PolynomialTransformerWrapperDF[SplineTransformer], native=SplineTransformer 26 | ): 27 | """Stub for DF wrapper of class ``SplineTransformer``""" 28 | 29 | 30 | # 31 | # validate that __all__ comprises all symbols ending in "DF", and no others 32 | # 33 | 34 | __estimators = [ 35 | sym 36 | for sym in dir() 37 | if sym.endswith("DF") 38 | and sym not in __imported_estimators 39 | and not sym.startswith("_") 40 | ] 41 | if set(__estimators) != set(__all__): 42 | raise RuntimeError( 43 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 44 | f"{__estimators}" 45 | ) 46 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.1 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.decomposition import MiniBatchNMF 10 | from sklearn.ensemble import RandomTreesEmbedding 11 | 12 | from .wrapper import ComponentsDimensionalityReductionWrapperDF, EmbeddingWrapperDF 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | __all__ = ["MiniBatchNMFDF", "RandomTreesEmbeddingDF"] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | # 21 | # decomposition and ensemble 22 | # 23 | 24 | 25 | class MiniBatchNMFDF( 26 | ComponentsDimensionalityReductionWrapperDF[MiniBatchNMF], 27 | native=MiniBatchNMF, 28 | ): 29 | """Stub for DF wrapper of class ``MiniBatchNMF``""" 30 | 31 | 32 | class RandomTreesEmbeddingDF( 33 | EmbeddingWrapperDF[RandomTreesEmbedding], 34 | native=RandomTreesEmbedding, 35 | ): 36 | """Stub for DF wrapper of class ``RandomTreesEmbedding``""" 37 | 38 | 39 | # 40 | # validate that __all__ comprises all symbols ending in "DF", and no others 41 | # 42 | 43 | __estimators = [ 44 | sym 45 | for sym in dir() 46 | if sym.endswith("DF") 47 | and sym not in __imported_estimators 48 | and not sym.startswith("_") 49 | ] 50 | if set(__estimators) != set(__all__): 51 | raise RuntimeError( 52 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 53 | f"{__estimators}" 54 | ) 55 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.3 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.preprocessing import 
TargetEncoder 10 | 11 | from .wrapper import ColumnPreservingTransformerWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = [ 16 | "TargetEncoderDF", 17 | ] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | # 22 | # preprocessing 23 | # 24 | 25 | 26 | class TargetEncoderDF( 27 | ColumnPreservingTransformerWrapperDF[TargetEncoder], native=TargetEncoder 28 | ): 29 | """Stub for DF wrapper of class ``TargetEncoder``""" 30 | 31 | 32 | # 33 | # validate that __all__ comprises all symbols ending in "DF", and no others 34 | # 35 | 36 | __estimators = [ 37 | sym 38 | for sym in dir() 39 | if sym.endswith("DF") 40 | and sym not in __imported_estimators 41 | and not sym.startswith("_") 42 | ] 43 | if set(__estimators) != set(__all__): 44 | raise RuntimeError( 45 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 46 | f"{__estimators}" 47 | ) 48 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party transformers that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation.extra` 3 | """ 4 | from __future__ import annotations 5 | 6 | import logging 7 | 8 | import numpy as np 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | 11 | from pytools.api import AllTracker 12 | 13 | from ...wrapper import MissingEstimator 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["BoostAGrootaDF", "BorutaDF", "GrootCVDF", "LeshyDF"] 18 | 19 | try: 20 | # import boruta classes only if installed 21 | from boruta import BorutaPy 22 | 23 | # Apply a hack to address boruta's incompatibility with numpy >= 1.24: boruta 24 | # uses the aliases np.bool, np.int, and np.float, which were deprecated in 25 | # numpy 1.20 and removed in numpy 1.24. 26 | # 27 | # We check whether these aliases are still defined in numpy, and if not, we restore them as aliases of the corresponding types with a trailing underscore. 
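# (Illustrative aside, not part of the original module: under numpy >= 1.24 the
# loop below has the same effect as writing
#
#     np.bool = np.bool_
#     np.int = np.int_
#     np.float = np.float_
#
# so that boruta's references to the removed aliases resolve again.)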
28 | 29 | for __attr in ["bool", "int", "float"]: 30 | if not hasattr(np, __attr): 31 | setattr(np, __attr, getattr(np, f"{__attr}_")) 32 | del __attr 33 | 34 | except ImportError: 35 | 36 | class BorutaPy( # type: ignore 37 | MissingEstimator, 38 | TransformerMixin, # type: ignore 39 | ): 40 | """Mock-up for missing estimator.""" 41 | 42 | 43 | try: 44 | # import arfs classes only if installed 45 | from arfs.feature_selection.allrelevant import BoostAGroota, GrootCV, Leshy 46 | 47 | except ImportError: 48 | 49 | class BoostAGroota( # type: ignore 50 | MissingEstimator, 51 | TransformerMixin, # type: ignore 52 | ): 53 | """Mock-up for missing estimator.""" 54 | 55 | class GrootCV( # type: ignore 56 | MissingEstimator, 57 | TransformerMixin, # type: ignore 58 | ): 59 | """Mock-up for missing estimator.""" 60 | 61 | class Leshy( # type: ignore 62 | MissingEstimator, 63 | TransformerMixin, # type: ignore 64 | ): 65 | """Mock-up for missing estimator.""" 66 | 67 | 68 | # 69 | # Ensure all symbols introduced below are included in __all__ 70 | # 71 | 72 | __tracker = AllTracker(globals()) 73 | 74 | 75 | # 76 | # Class definitions 77 | # 78 | 79 | 80 | from .wrapper import ARFSWrapperDF as _ARFSWrapperDF 81 | from .wrapper import BorutaPyWrapperDF as _BorutaPyWrapperDF 82 | 83 | 84 | class BorutaDF(_BorutaPyWrapperDF, native=BorutaPy): 85 | """ 86 | DF version of :class:`~boruta.BorutaPy`. 87 | """ 88 | 89 | 90 | class LeshyDF(_ARFSWrapperDF[Leshy], native=Leshy): 91 | """ 92 | DF version of :class:`~arfs.feature_selection.allrelevant.Leshy`. 93 | """ 94 | 95 | 96 | class BoostAGrootaDF(_ARFSWrapperDF[BoostAGroota], native=BoostAGroota): 97 | """ 98 | DF version of :class:`~arfs.feature_selection.allrelevant.BoostAGroota`. 99 | """ 100 | 101 | @property 102 | def estimator(self) -> BaseEstimator: 103 | """ 104 | Alias for the native estimator's :attr:`.est` attribute, to conform with 105 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 106 | 107 | :return: the value of the native estimator's :attr:`.est` attribute 108 | """ 109 | return self.native_estimator.est 110 | 111 | @estimator.setter 112 | def estimator(self, est: BaseEstimator) -> None: 113 | """ 114 | Alias for the native estimator's :attr:`.est` attribute, to conform with 115 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 116 | 117 | :param est: the new value for the native estimator's :attr:`.est` attribute 118 | """ 119 | self.native_estimator.est = est 120 | 121 | @estimator.deleter 122 | def estimator(self) -> None: 123 | """ 124 | Alias for the native estimator's :attr:`.est` attribute, to conform with 125 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 126 | """ 127 | del self.native_estimator.est 128 | 129 | 130 | class GrootCVDF(_ARFSWrapperDF[GrootCV], native=GrootCV): 131 | """ 132 | DF version of :class:`~arfs.feature_selection.allrelevant.GrootCV`. 133 | """ 134 | 135 | 136 | # 137 | # validate __all__ 138 | # 139 | 140 | __tracker.validate() 141 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DF wrapper classes for additional 3rd party transformers that implement the 3 | `scikit-learn` interface. 
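These wrappers serve as base classes for the DF stubs in :mod:`sklearndf.transformation.extra`; for example, mirroring the declaration in that module:

.. code-block:: python

    from boruta import BorutaPy
    from sklearndf.transformation.extra.wrapper import BorutaPyWrapperDF

    class BorutaDF(BorutaPyWrapperDF, native=BorutaPy):
        \"""DF version of :class:`~boruta.BorutaPy`.\"""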
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation.extra.wrapper` 3 | """ 4 | from __future__ import annotations 5 | 6 | import logging 7 | from typing import Generic, TypeVar 8 | 9 | import pandas as pd 10 | from sklearn.feature_selection import SelectorMixin 11 | 12 | from pytools.api import AllTracker 13 | 14 | from ....wrapper import MetaEstimatorWrapperDF 15 | from ...wrapper import ColumnSubsetTransformerWrapperDF, NumpyTransformerWrapperDF 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | __all__ = ["BorutaPyWrapperDF", "ARFSWrapperDF"] 20 | 21 | try: 22 | # import boruta classes only if installed 23 | from boruta import BorutaPy 24 | except ImportError: 25 | BorutaPy = None 26 | 27 | 28 | # 29 | # Type variables 30 | # 31 | 32 | T_FeatureSelector = TypeVar("T_FeatureSelector", bound=SelectorMixin) 33 | 34 | 35 | # 36 | # Ensure all symbols introduced below are included in __all__ 37 | # 38 | 39 | __tracker = AllTracker(globals()) 40 | 41 | 42 | # 43 | # Class definitions 44 | # 45 | 46 | 47 | class BorutaPyWrapperDF( 48 | MetaEstimatorWrapperDF[BorutaPy], 49 | NumpyTransformerWrapperDF[BorutaPy], 50 | ColumnSubsetTransformerWrapperDF[BorutaPy], 51 | ): 52 | """ 53 | DF wrapper for :class:`~boruta.BorutaPy`. 54 | """ 55 | 56 | def _get_features_out(self) -> pd.Index: 57 | return self.feature_names_in_[self.native_estimator.support_] 58 | 59 | def _get_sparse_threshold(self) -> float: 60 | # don't allow sparse input 61 | return 0.0 62 | 63 | 64 | class ARFSWrapperDF( 65 | MetaEstimatorWrapperDF[T_FeatureSelector], 66 | ColumnSubsetTransformerWrapperDF[T_FeatureSelector], 67 | Generic[T_FeatureSelector], 68 | ): 69 | """ 70 | DF wrapper for :class:`~boruta.BorutaPy`. 71 | """ 72 | 73 | def _get_features_out(self) -> pd.Index: 74 | return self.feature_names_in_[self.native_estimator.support_] 75 | 76 | def _get_sparse_threshold(self) -> float: 77 | # don't allow sparse input 78 | return 0.0 79 | 80 | 81 | __tracker.validate() 82 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` transformers, providing enhanced support for data 3 | frames. 4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper class API for enhancing the functionality of native estimators conforming with 3 | the `scikit-learn` API. 4 | 5 | In most cases, creating a DF classifier or regressor class is as simple as 6 | 7 | .. 
code-block:: python 8 | 9 | class RandomForestClassifierDF( 10 | ClassifierWrapperDF[RandomForestClassifier], native=RandomForestClassifier 11 | ): 12 | \"""Stub for DF wrapper of class ``RandomForestClassifier``\""" 13 | 14 | class RandomForestRegressorDF( 15 | RegressorWrapperDF[RandomForestRegressor], native=RandomForestRegressor 16 | ): 17 | \"""Stub for DF wrapper of class ``RandomForestRegressor``\""" 18 | 19 | 20 | Any class implementing the `scikit-learn` estimator protocol (and subclassing 21 | :class:`.BaseEstimator`) can be used to create a DF wrapper by declaring a wrapper 22 | class as follows: 23 | 24 | .. code-block:: 25 | 26 | class <name>DF(<wrapper class>, native=<native class>): 27 | \"""Stub for DF wrapper of class ``<native class>``\""" 28 | 29 | The resulting wrapper class implements a *delegation* pattern, forwarding method calls 30 | and attribute access to a native estimator instance while 31 | 32 | - implementing enhanced functionality introduced by the :class:`.EstimatorDF` class 33 | hierarchy, managing feature names and translating between data frames and *numpy* 34 | arrays behind the scenes 35 | - adopting all additional methods and attributes from the wrapped native estimator 36 | - delegating relevant method calls and attribute access to the native estimator, 37 | thus replicating the original estimator's behaviour except for the enhanced 38 | functionality introduced by the :class:`.EstimatorDF` class hierarchy. 39 | 40 | Most regressors, classifiers, and clusterers can be augmented using the 41 | :class:`.RegressorWrapperDF`, :class:`.ClassifierWrapperDF`, and 42 | :class:`.ClusterWrapperDF` wrappers, respectively. 43 | 44 | More care must be taken to wrap transformer classes and some clusterer classes, which 45 | may require a more dedicated wrapper class to support the specific behaviour of the 46 | native transformer or clusterer. 47 | See packages :mod:`sklearndf.transformation.wrapper` and 48 | :mod:`sklearndf.clustering.wrapper` for more details on these. 49 | 50 | For more advanced examples, including the use of custom wrapper classes, see the many 51 | examples in modules 52 | :mod:`sklearndf.transformation`, :mod:`sklearndf.classification`, 53 | :mod:`sklearndf.regression`, and :mod:`sklearndf.clustering`. 54 | """ 55 | 56 | from ._missing import * 57 | from ._wrapper import * 58 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/_missing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handling of mocked-up native estimators. 3 | """ 4 | import logging 5 | from typing import Any 6 | 7 | from sklearn.base import BaseEstimator 8 | 9 | from pytools.api import AllTracker 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | __all__ = [ 14 | "MissingEstimator", 15 | ] 16 | 17 | # 18 | # Ensure all symbols introduced below are included in __all__ 19 | # 20 | 21 | __tracker = AllTracker(globals()) 22 | 23 | 24 | # 25 | # Class declarations 26 | # 27 | 28 | 29 | class MissingEstimator( 30 | BaseEstimator, # type: ignore 31 | ): 32 | """ 33 | Base class of mocked-up native estimators, for use in case an optional 3rd party 34 | estimator is not installed but is required to create the associated DF estimator. 35 | 36 | Raises a :class:`.RuntimeError` upon instantiation. 
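For example (an illustrative sketch, assuming the optional `xgboost` package is *not* installed):

.. code-block:: python

    from sklearndf.regression.extra import XGBRegressorDF

    # the native XGBRegressor has been replaced by a MissingEstimator
    # mock-up, so instantiating the DF wrapper raises
    # RuntimeError: "Estimator XGBRegressor is not available. ..."
    XGBRegressorDF()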
37 | """ 38 | 39 | def __init__(self, *args: Any, **kwargs: Any) -> None: 40 | """ 41 | :param args: arbitrary positional arguments 42 | :param kwargs: arbitrary keyword arguments 43 | :raises RuntimeError: always raised upon instantiation 44 | """ 45 | raise RuntimeError( 46 | f"Estimator {type(self).__name__} is not available. " 47 | f"Please install the package that implements it." 48 | ) 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapter classes that wrap DF estimators and accept numpy arrays for all DF estimator 3 | methods that would usually only accept pandas data frames or series. 4 | 5 | For use in meta-estimators that internally pass on numpy arrays to sub-estimators. 6 | """ 7 | 8 | from ._numpy import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/stacking/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DF wrapper classes for stacking estimators. 3 | """ 4 | 5 | from ._stacking import * 6 | -------------------------------------------------------------------------------- /test/test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Root of sklearndf unit tests. 3 | """ 4 | -------------------------------------------------------------------------------- /test/test/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | # noinspection PyPackageRequirements 7 | import pytest 8 | import sklearn 9 | from sklearn import datasets 10 | from sklearn.utils import Bunch 11 | 12 | from sklearndf import __sklearn_1_1__, __sklearn_version__ 13 | from sklearndf.transformation import OneHotEncoderDF 14 | 15 | logging.basicConfig(level=logging.DEBUG) 16 | log = logging.getLogger(__name__) 17 | 18 | UNSUPPORTED_SKLEARN_PACKAGES = [ 19 | sklearn.manifold, 20 | sklearn.neighbors, 21 | sklearn.feature_extraction.image, 22 | ] 23 | 24 | 25 | @pytest.fixture # type: ignore 26 | def diabetes_target() -> str: 27 | return "disease_progression_1yr" 28 | 29 | 30 | @pytest.fixture # type: ignore 31 | def iris_target_name() -> str: 32 | return "species" 33 | 34 | 35 | @pytest.fixture # type: ignore 36 | def n_jobs() -> int: 37 | return -3 38 | 39 | 40 | @pytest.fixture # type: ignore 41 | def diabetes_df(diabetes_target: str) -> pd.DataFrame: 42 | # load sklearn test-data and convert to pd 43 | diabetes: Bunch 44 | if __sklearn_version__ >= __sklearn_1_1__: 45 | diabetes = datasets.load_diabetes(scaled=False) 46 | else: 47 | # arg scaled does not exist in scikit-learn < 1.1 48 | diabetes = datasets.load_diabetes() 49 | 50 | return pd.DataFrame( 51 | data=np.c_[diabetes.data, diabetes.target], 52 | columns=[*map(str, diabetes.feature_names), diabetes_target], 53 | ).astype(dtype={"sex": "category"}) 54 | 55 | 56 | @pytest.fixture # type: ignore 57 | def diabetes_features(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.DataFrame: 58 | return diabetes_df.drop(labels=[diabetes_target], axis=1) 59 | 60 | 61 | @pytest.fixture # type: ignore 62 | def diabetes_target_sr(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.Series: 63 | return diabetes_df.loc[:, 
diabetes_target] 64 | 65 | 66 | @pytest.fixture # type: ignore 67 | def diabetes_target_df(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.DataFrame: 68 | target = diabetes_df.loc[:, [diabetes_target]] 69 | target.loc[:, f"{diabetes_target}_2"] = target.loc[:, diabetes_target] * 2 70 | return target 71 | 72 | 73 | @pytest.fixture # type: ignore 74 | def iris_dataset() -> Bunch: 75 | return datasets.load_iris() 76 | 77 | 78 | @pytest.fixture # type: ignore 79 | def iris_df(iris_dataset: Bunch, iris_target_name: str) -> pd.DataFrame: 80 | # convert sklearn iris data set to data frame 81 | return pd.DataFrame( 82 | data=np.c_[iris_dataset.data, iris_dataset.target], 83 | columns=[*map(str, iris_dataset.feature_names), iris_target_name], 84 | ) 85 | 86 | 87 | @pytest.fixture # type: ignore 88 | def iris_features(iris_df: pd.DataFrame, iris_target_name: str) -> pd.DataFrame: 89 | return iris_df.drop(labels=[iris_target_name], axis=1) 90 | 91 | 92 | @pytest.fixture # type: ignore 93 | def iris_target_sr( 94 | iris_dataset: Bunch, iris_df: pd.DataFrame, iris_target_name: str 95 | ) -> pd.Series: 96 | # replace numerical targets with actual class labels 97 | return iris_df.loc[:, iris_target_name].apply( 98 | lambda x: iris_dataset.target_names[int(x)] 99 | ) 100 | 101 | 102 | @pytest.fixture # type: ignore 103 | def iris_targets_df(iris_df: pd.DataFrame, iris_target_name: str) -> pd.DataFrame: 104 | return iris_df.loc[:, [iris_target_name, iris_target_name]] 105 | 106 | 107 | @pytest.fixture # type: ignore 108 | def iris_targets_binary_df(iris_target_sr: pd.Series) -> pd.DataFrame: 109 | return OneHotEncoderDF(sparse=False).fit_transform(X=iris_target_sr.to_frame()) 110 | 111 | 112 | @pytest.fixture # type:ignore 113 | def test_data_categorical() -> pd.DataFrame: 114 | return pd.DataFrame( 115 | data=[ 116 | ["yes", "red", "child"], 117 | ["yes", "blue", "father"], 118 | ["no", "green", "mother"], 119 | ], 120 | columns=["a", "b", "c"], 121 | ) 122 | -------------------------------------------------------------------------------- /test/test/paths.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | log = logging.getLogger(__name__) 5 | 6 | # directory paths 7 | DIR_DATA = "data" 8 | DIR_CONFIG = "config" 9 | 10 | # file paths 11 | TEST_CONFIG_YML = os.path.join(DIR_CONFIG, "test_config.yml") 12 | -------------------------------------------------------------------------------- /test/test/sklearndf/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from types import ModuleType 4 | from typing import Dict, Iterable, List, Optional, Set, Type, Union 5 | 6 | import pandas as pd 7 | import sklearn 8 | from sklearn.base import BaseEstimator 9 | from sklearn.compose import ColumnTransformer 10 | from sklearn.pipeline import FeatureUnion 11 | 12 | from sklearndf import EstimatorDF, LearnerDF, TransformerDF, __sklearn_version__ 13 | from sklearndf.pipeline.wrapper import FeatureUnionSparseFrames 14 | from sklearndf.transformation.wrapper import ColumnTransformerSparseFrames 15 | from sklearndf.wrapper import EstimatorWrapperDF 16 | 17 | OVERRIDDEN_SKLEARN_CLASSES = { 18 | ColumnTransformerSparseFrames: ColumnTransformer, 19 | FeatureUnionSparseFrames: FeatureUnion, 20 | } 21 | 22 | 23 | def find_all_classes( 24 | *modules: ModuleType, 25 | ) -> Set[Type[EstimatorWrapperDF[BaseEstimator]]]: 26 | """Finds all Class members in given module/modules.""" 27 | 
types: Set[Type[EstimatorWrapperDF[BaseEstimator]]] = set() 28 | 29 | def _add_classes_from_module(_m: ModuleType) -> None: 30 | member: Type[EstimatorWrapperDF[BaseEstimator]] 31 | for member in vars(_m).values(): 32 | if isinstance(member, type): 33 | types.add(member) 34 | 35 | for module in modules: 36 | _add_classes_from_module(module) 37 | 38 | return types 39 | 40 | 41 | def find_all_submodules(parent_module: ModuleType) -> Set[ModuleType]: 42 | """Finds all submodules for a parent module.""" 43 | parent_name = f"{parent_module.__name__}." 44 | return { 45 | module 46 | for module_name, module in sys.modules.items() 47 | if module_name.startswith(parent_name) 48 | } 49 | 50 | 51 | def sklearn_delegate_classes( 52 | module: ModuleType, 53 | ) -> Dict[Type[BaseEstimator], Type[EstimatorWrapperDF[BaseEstimator]]]: 54 | """ 55 | Create a dictionary mapping sklearn classes to their corresponding sklearndf 56 | classes. 57 | """ 58 | return { 59 | OVERRIDDEN_SKLEARN_CLASSES.get( 60 | df_class.__wrapped__, df_class.__wrapped__ 61 | ): df_class 62 | for df_class in find_all_classes(module) 63 | # we only consider non-abstract wrapper classes wrapping a specific native class 64 | if issubclass(df_class, EstimatorWrapperDF) and hasattr(df_class, "__wrapped__") 65 | } 66 | 67 | 68 | def iterate_classes( 69 | from_modules: Union[ModuleType, Iterable[ModuleType]], 70 | matching: str, 71 | excluding: Optional[Union[str, Iterable[str]]] = None, 72 | ) -> List[Type[EstimatorWrapperDF[BaseEstimator]]]: 73 | """Helper to return all classes with matching name from Python module(s)""" 74 | 75 | if not isinstance(from_modules, Iterable): 76 | from_modules = (from_modules,) 77 | 78 | if excluding is not None and not isinstance(excluding, str): 79 | excluding = "|".join(f"({exclude_pattern})" for exclude_pattern in excluding) 80 | 81 | return [ 82 | m 83 | for m in find_all_classes(*from_modules) 84 | if re.match(matching, m.__name__) 85 | and ((excluding is None) or not re.match(excluding, m.__name__)) 86 | ] 87 | 88 | 89 | def get_sklearndf_wrapper_class( 90 | to_wrap: Type[BaseEstimator], from_module: ModuleType 91 | ) -> Type[EstimatorWrapperDF[BaseEstimator]]: 92 | """Helper to return the wrapped counterpart for a sklearn class""" 93 | try: 94 | return sklearn_delegate_classes(from_module)[to_wrap] 95 | 96 | except KeyError as cause: 97 | raise ValueError( 98 | f"There is no class that wraps '{to_wrap}' in {from_module}" 99 | ) from cause 100 | 101 | 102 | def check_expected_not_fitted_error(estimator: EstimatorDF) -> None: 103 | """Check if transformers & learners raise NotFittedError""" 104 | 105 | test_x = pd.DataFrame(data=list(range(10))) 106 | 107 | def check_sklearndf_call( 108 | func_to_call: str, _estimator: Union[LearnerDF, TransformerDF] 109 | ) -> None: 110 | try: 111 | getattr(_estimator, func_to_call)(X=test_x) 112 | except sklearn.exceptions.NotFittedError: 113 | # This is the expected error that sklearn[df] should raise 114 | return 115 | except Exception as sklearndf_exception: 116 | # Re-run the predict/transform ahead of fitting, and compare errors 117 | # across sklearn and sklearndf: 118 | try: 119 | if func_to_call == "transform": 120 | x = test_x.values 121 | else: 122 | x = test_x.values.reshape(-1) 123 | 124 | getattr(_estimator.native_estimator, func_to_call)(x) 125 | except sklearn.exceptions.NotFittedError: 126 | raise AssertionError( 127 | "sklearndf did not raise the expected NotFittedError" 128 | f" for {_estimator.__class__.__name__}" 129 | ) 130 | except Exception as 
sklearn_exception: 131 | assert repr(sklearndf_exception) == repr(sklearn_exception), ( 132 | "sklearndf raised a different error than sklearn" 133 | f" for {_estimator.__class__.__name__}:\n" 134 | f"sklearndf: {repr(sklearndf_exception)} \n" 135 | f"sklearn: {repr(sklearn_exception)}" 136 | ) 137 | 138 | if isinstance(estimator, LearnerDF): 139 | check_sklearndf_call("predict", estimator) 140 | elif isinstance(estimator, TransformerDF): 141 | check_sklearndf_call("transform", estimator) 142 | else: 143 | raise TypeError(f"Estimator of unknown type: {estimator.__name__}") 144 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | from sklearndf import TransformerDF 4 | from sklearndf.transformation import ( 5 | ColumnTransformerDF, 6 | OneHotEncoderDF, 7 | SimpleImputerDF, 8 | ) 9 | 10 | STEP_IMPUTE = "impute" 11 | STEP_ONE_HOT_ENCODE = "one-hot-encode" 12 | 13 | 14 | def make_simple_transformer( 15 | impute_median_columns: Optional[Sequence[str]] = None, 16 | one_hot_encode_columns: Optional[Sequence[str]] = None, 17 | ) -> TransformerDF: 18 | column_transforms = [] 19 | 20 | if impute_median_columns is not None and len(impute_median_columns) > 0: 21 | column_transforms.append( 22 | (STEP_IMPUTE, SimpleImputerDF(strategy="median"), impute_median_columns) 23 | ) 24 | 25 | if one_hot_encode_columns is not None and len(one_hot_encode_columns) > 0: 26 | column_transforms.append( 27 | ( 28 | STEP_ONE_HOT_ENCODE, 29 | OneHotEncoderDF(sparse=False, handle_unknown="ignore"), 30 | one_hot_encode_columns, 31 | ) 32 | ) 33 | 34 | return ColumnTransformerDF(transformers=column_transforms) 35 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_classification_pipeline_df.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from sklearn.base import is_classifier 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | from sklearndf import ClassifierDF 11 | from sklearndf.classification import RandomForestClassifierDF 12 | from sklearndf.classification.extra import LGBMClassifierDF 13 | from sklearndf.pipeline import ClassifierPipelineDF 14 | from test.sklearndf.pipeline import make_simple_transformer 15 | 16 | 17 | @pytest.mark.parametrize( # type: ignore 18 | argnames="classifier_df_cls", 19 | argvalues=[RandomForestClassifierDF, LGBMClassifierDF], 20 | ) 21 | def test_classification_pipeline_df( 22 | iris_features: pd.DataFrame, 23 | iris_target_sr: pd.DataFrame, 24 | classifier_df_cls: Type[ClassifierDF], 25 | ) -> None: 26 | cls_p_df = ClassifierPipelineDF( 27 | classifier=classifier_df_cls(), 28 | preprocessing=make_simple_transformer( 29 | impute_median_columns=iris_features.select_dtypes( 30 | include=np.number 31 | ).columns, 32 | one_hot_encode_columns=iris_features.select_dtypes(include=object).columns, 33 | ), 34 | ) 35 | 36 | assert is_classifier(cls_p_df) 37 | 38 | cls_p_df.fit(X=iris_features, y=iris_target_sr) 39 | cls_p_df.predict(X=iris_features) 40 | 41 | # test type check within constructor: 42 | with pytest.raises(TypeError): 43 | # noinspection PyTypeChecker 44 | ClassifierPipelineDF( 45 | classifier=RandomForestClassifier(), 
preprocessing=OneHotEncoder() 46 | ) 47 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_clustering_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.cluster import KMeans 5 | from sklearn.preprocessing import OneHotEncoder 6 | 7 | from sklearndf.clustering import KMeansDF 8 | from sklearndf.pipeline import ClusterPipelineDF 9 | from test.sklearndf.pipeline import make_simple_transformer 10 | 11 | 12 | def test_clustering_pipeline_df( 13 | iris_features: pd.DataFrame, iris_target_sr: pd.DataFrame 14 | ) -> None: 15 | cls_p_df = ClusterPipelineDF( 16 | clusterer=KMeansDF(n_clusters=4), 17 | preprocessing=make_simple_transformer( 18 | impute_median_columns=iris_features.select_dtypes( 19 | include=np.number 20 | ).columns, 21 | one_hot_encode_columns=iris_features.select_dtypes(include=object).columns, 22 | ), 23 | ) 24 | 25 | cls_p_df.fit(X=iris_features, y=iris_target_sr) 26 | cls_p_df.predict(X=iris_features) 27 | 28 | # test-type check within constructor: 29 | with pytest.raises(TypeError): 30 | # noinspection PyTypeChecker 31 | ClusterPipelineDF(clusterer=KMeans(n_clusters=4), preprocessing=OneHotEncoder()) 32 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_regression_pipeline_df.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from lightgbm import LGBMRegressor 7 | from sklearn.base import is_regressor 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | from sklearndf import RegressorDF 11 | from sklearndf.pipeline import RegressorPipelineDF 12 | from sklearndf.regression import RandomForestRegressorDF 13 | from sklearndf.regression.extra import LGBMRegressorDF 14 | from test.sklearndf.pipeline import make_simple_transformer 15 | 16 | 17 | @pytest.mark.parametrize( # type: ignore 18 | argnames="regressor_df_cls", argvalues=[RandomForestRegressorDF, LGBMRegressorDF] 19 | ) 20 | def test_regression_pipeline_df( 21 | diabetes_features: pd.DataFrame, 22 | diabetes_target_sr: pd.Series, 23 | regressor_df_cls: Type[RegressorDF], 24 | ) -> None: 25 | rpdf = RegressorPipelineDF( 26 | regressor=regressor_df_cls(), 27 | preprocessing=make_simple_transformer( 28 | impute_median_columns=diabetes_features.select_dtypes( 29 | include=np.number 30 | ).columns, 31 | one_hot_encode_columns=diabetes_features.select_dtypes( 32 | include=object 33 | ).columns, 34 | ), 35 | ) 36 | 37 | assert is_regressor(rpdf) 38 | 39 | rpdf.fit(X=diabetes_features, y=diabetes_target_sr) 40 | rpdf.predict(X=diabetes_features) 41 | 42 | # test type check within constructor 43 | with pytest.raises(TypeError): 44 | # noinspection PyTypeChecker 45 | RegressorPipelineDF(regressor=LGBMRegressor(), preprocessing=OneHotEncoder()) 46 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_base.py: -------------------------------------------------------------------------------- 1 | # inspired by: 2 | # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tests/test_base.py 3 | 4 | import re 5 | from typing import Any 6 | 7 | import numpy as np 8 | import pytest 9 | import scipy.sparse as sp 10 | import sklearn 11 | from numpy.testing import assert_array_equal 12 | 
from sklearn.base import BaseEstimator, is_classifier 13 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.utils import estimator_html_repr 17 | 18 | from pytools.expression import freeze, make_expression 19 | from pytools.expression.atomic import Id 20 | 21 | from sklearndf.classification import SVCDF, DecisionTreeClassifierDF 22 | from sklearndf.clustering.wrapper import KMeansBaseWrapperDF 23 | from sklearndf.pipeline import PipelineDF 24 | from sklearndf.regression import RandomForestRegressorDF 25 | from sklearndf.transformation import OneHotEncoderDF, SimpleImputerDF 26 | from sklearndf.transformation.wrapper import ImputerWrapperDF 27 | from sklearndf.wrapper import ( 28 | ClassifierWrapperDF, 29 | EstimatorWrapperDF, 30 | RegressorWrapperDF, 31 | ) 32 | 33 | 34 | class DummyEstimator( 35 | BaseEstimator, # type: ignore 36 | ): 37 | def __init__(self, l1: int = 0, empty: Any = None) -> None: 38 | self.l1 = l1 39 | self.empty = empty 40 | 41 | 42 | class DummyEstimator2( 43 | BaseEstimator, # type: ignore 44 | ): 45 | def __init__(self, a: Any = None, b: Any = None) -> None: 46 | self.a = a 47 | self.b = b 48 | 49 | 50 | class DummyEstimator3( 51 | BaseEstimator, # type: ignore 52 | ): 53 | def __init__(self, c: int = 0, d: Any = None) -> None: 54 | self.c = c 55 | self.d = d 56 | 57 | 58 | class DummyEstimatorDF(EstimatorWrapperDF[DummyEstimator], native=DummyEstimator): 59 | """A trivial estimator.""" 60 | 61 | 62 | class DummyEstimator2DF(EstimatorWrapperDF[DummyEstimator2], native=DummyEstimator2): 63 | """A trivial estimator.""" 64 | 65 | 66 | class DummyEstimator3DF(EstimatorWrapperDF[DummyEstimator3], native=DummyEstimator3): 67 | """A trivial estimator.""" 68 | 69 | 70 | def test_clone() -> None: 71 | # Tests that clone creates a correct deep copy. 72 | # We create an estimator, make a copy of its original state 73 | # (which, in this case, is the current state of the estimator), 74 | # and check that the obtained copy is a correct deep copy. 75 | 76 | encoder = OneHotEncoderDF(drop="first", sparse=False) 77 | new_encoder = encoder.clone() 78 | assert encoder is not new_encoder 79 | assert encoder.get_params() == new_encoder.get_params() 80 | 81 | encoder = OneHotEncoderDF(handle_unknown="ignore", sparse=False) 82 | new_encoder = sklearn.clone(encoder) 83 | 84 | assert encoder is not new_encoder 85 | 86 | 87 | def test_clone_2() -> None: 88 | # Tests that clone doesn't copy everything. 89 | # We first create an estimator, give it an own attribute, and 90 | # make a copy of its original state. Then we check that the copy doesn't 91 | # have the specific attribute we manually added to the initial estimator. 
92 | 93 | encoder = OneHotEncoderDF(drop="first", sparse=False) 94 | 95 | encoder.own_attribute = "test" 96 | new_encoder = encoder.clone() 97 | 98 | assert not hasattr(new_encoder, "own_attribute") 99 | 100 | 101 | def test_clone_empty_array() -> None: 102 | # Regression test for cloning estimators with empty arrays 103 | clf = DummyEstimatorDF(empty=np.array([])) 104 | clf2 = clf.clone() 105 | assert_array_equal(clf.empty, clf2.empty) 106 | 107 | clf = DummyEstimatorDF(empty=sp.csr_matrix(np.array([[0]]))) 108 | clf2 = clf.clone() 109 | assert_array_equal(clf.empty.data, clf2.empty.data) 110 | 111 | 112 | def test_clone_nan() -> None: 113 | # Regression test for cloning estimators with default parameter as np.nan 114 | clf = DummyEstimatorDF(empty=np.nan) 115 | clf2 = clf.clone() 116 | 117 | assert clf.empty is clf2.empty 118 | 119 | 120 | def test_clone_sparse_matrices() -> None: 121 | sparse_matrix_classes = [ 122 | getattr(sp, name) 123 | for name in dir(sp) 124 | if name.endswith("_matrix") and name != "_matrix" 125 | ] 126 | 127 | for cls in sparse_matrix_classes: 128 | sparse_matrix = cls(np.eye(5)) 129 | clf = DummyEstimatorDF(empty=sparse_matrix) 130 | clf_cloned = clf.clone() 131 | assert clf.empty.__class__ is clf_cloned.empty.__class__ 132 | assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray()) 133 | 134 | 135 | def test_clone_estimator_types() -> None: 136 | # Check that clone works for parameters that are types rather than 137 | # instances 138 | clf = DummyEstimatorDF(empty=DummyEstimator) 139 | clf2 = clf.clone() 140 | 141 | assert clf.empty is clf2.empty 142 | 143 | 144 | def test_repr() -> None: 145 | # Smoke test the repr of the base estimator. 146 | repr(DummyEstimatorDF()) 147 | 148 | estimator = DummyEstimator2DF( 149 | a=DummyEstimator3DF(c=None), b=DummyEstimator3DF(c=1, d=2) 150 | ) 151 | assert freeze(make_expression(estimator)) == freeze( 152 | Id.DummyEstimator2DF( 153 | a=Id.DummyEstimator3DF(c=None), b=Id.DummyEstimator3DF(c=1, d=2) 154 | ) 155 | ) 156 | assert repr(estimator) == ( 157 | "DummyEstimator2DF(a=DummyEstimator3DF(c=None), " 158 | "b=DummyEstimator3DF(c=1, d=2))" 159 | ) 160 | 161 | assert len(repr(DummyEstimator2DF(a=["long_params"] * 1000))) == 15021 162 | 163 | 164 | def test_str() -> None: 165 | # Smoke test the str of the base estimator 166 | my_estimator = DummyEstimatorDF() 167 | str(my_estimator) 168 | 169 | 170 | def test_html_repr() -> None: 171 | # store the original display config 172 | display_original = sklearn.get_config()["display"] 173 | 174 | # set the display config to use diagrams 175 | sklearn.set_config(display="diagram") 176 | 177 | try: 178 | pipeline_df = PipelineDF( 179 | [ 180 | ( 181 | "preprocess", 182 | PipelineDF( 183 | [ 184 | ("impute", SimpleImputerDF()), 185 | ] 186 | ), 187 | ), 188 | ("rf", RandomForestRegressorDF(n_estimators=120)), 189 | ] 190 | ) 191 | 192 | def _replace_ids(_html: str) -> str: 193 | # scikit-learn generates new ids on subsequent calls to estimator_html_repr, 194 | # so we replace them with a placeholder 195 | return re.sub( 196 | r'(?<=id-)\d+|(?:(?<=sk-)|(?<=id=")|(?<=for="))\w+(?:-\w+)*', "#", _html 197 | ) 198 | 199 | assert _replace_ids(pipeline_df._repr_html_()) == _replace_ids( 200 | estimator_html_repr(pipeline_df) 201 | ) 202 | 203 | finally: 204 | # reset the display config to its original value 205 | sklearn.set_config(display=display_original) 206 | pass 207 | 208 | 209 | def test_get_params() -> None: 210 | test = DummyEstimator2DF(DummyEstimator3DF(), 
DummyEstimator3DF()) 211 | 212 | assert "a__d" in test.get_params(deep=True) 213 | assert "a__d" not in test.get_params(deep=False) 214 | 215 | # noinspection PyTypeChecker 216 | test.set_params(a__d=2) 217 | assert test.a.d == 2 218 | with pytest.raises(ValueError): 219 | test.set_params(a__a=2) 220 | 221 | 222 | def test_is_classifier() -> None: 223 | svc = SVCDF() 224 | assert is_classifier(svc) 225 | assert is_classifier(GridSearchCV(svc, {"C": [0.1, 1]})) 226 | assert is_classifier(PipelineDF([("svc", svc)])) 227 | assert is_classifier(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) 228 | 229 | 230 | def test_set_params() -> None: 231 | # test nested estimator parameter setting 232 | clf = Pipeline([("svc", SVCDF())]) 233 | # non-existing parameter in svc 234 | with pytest.raises(ValueError): 235 | # noinspection PyTypeChecker 236 | clf.set_params(svc__stupid_param=True) 237 | # non-existing parameter of pipeline 238 | with pytest.raises(ValueError): 239 | # noinspection PyTypeChecker 240 | clf.set_params(svm__stupid_param=True) 241 | 242 | 243 | def test_set_params_updates_valid_params() -> None: 244 | # Check that set_params tries to set SVC().C, not 245 | # DecisionTreeClassifier().C 246 | gs = GridSearchCV(DecisionTreeClassifierDF(), {}) 247 | # noinspection PyTypeChecker 248 | gs.set_params(estimator=SVCDF(), estimator__C=42.0) 249 | assert gs.estimator.C == 42.0 250 | 251 | 252 | # noinspection PyUnusedLocal 253 | def test_native_class_validation() -> None: 254 | with pytest.raises( 255 | TypeError, 256 | match=( 257 | "native class RandomForestClassifier cannot be used with wrapper class " 258 | "MismatchedNativeClass1 because it does not implement RegressorMixin" 259 | ), 260 | ): 261 | 262 | class MismatchedNativeClass1( 263 | RegressorWrapperDF[RandomForestClassifier], native=RandomForestClassifier 264 | ): 265 | pass 266 | 267 | with pytest.raises( 268 | TypeError, 269 | match=( 270 | "native class RandomForestRegressor cannot be used with wrapper class " 271 | "MismatchedNativeClass2 because it does not implement ClassifierMixin" 272 | ), 273 | ): 274 | 275 | class MismatchedNativeClass2( 276 | ClassifierWrapperDF[RandomForestRegressor], native=RandomForestRegressor 277 | ): 278 | pass 279 | 280 | with pytest.raises( 281 | TypeError, 282 | match=( 283 | "native class RandomForestRegressor cannot be used with wrapper class " 284 | "MismatchedNativeClass3 because it does not implement ClusterMixin" 285 | ), 286 | ): 287 | 288 | class MismatchedNativeClass3( 289 | KMeansBaseWrapperDF[RandomForestRegressor], native=RandomForestRegressor 290 | ): 291 | pass 292 | 293 | with pytest.raises( 294 | TypeError, 295 | match=( 296 | "native class RandomForestRegressor cannot be used with wrapper class " 297 | "MismatchedNativeClass4 because it does not implement TransformerMixin" 298 | ), 299 | ): 300 | 301 | class MismatchedNativeClass4( 302 | ImputerWrapperDF[Any], native=RandomForestRegressor 303 | ): 304 | pass 305 | 306 | with pytest.raises( 307 | TypeError, 308 | match=( 309 | "native class RandomForestRegressor cannot be used with wrapper class " 310 | "MismatchedNativeClass5 because it does not implement Pipeline" 311 | ), 312 | ): 313 | 314 | class MismatchedNativeClass5(PipelineDF, native=RandomForestRegressor): 315 | pass 316 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_classification.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 
2 | from typing import Any, Dict, Type 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.base import is_classifier 8 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 9 | 10 | import sklearndf.classification as classification 11 | from sklearndf import ClassifierDF, __sklearn_1_2__, __sklearn_version__ 12 | from test.sklearndf import check_expected_not_fitted_error, iterate_classes 13 | 14 | CLASSIFIERS_TO_TEST = iterate_classes( 15 | from_modules=classification, 16 | matching=r".*DF", 17 | excluding=[ClassifierDF.__name__, r".*WrapperDF", r"^_"], 18 | ) 19 | 20 | 21 | def test_classifier_count() -> None: 22 | n = len(CLASSIFIERS_TO_TEST) 23 | 24 | print(f"Testing {n} classifiers.") 25 | assert n == 41 26 | 27 | 28 | if __sklearn_version__ < __sklearn_1_2__: 29 | BASE_ESTIMATOR = "base_estimator" 30 | else: 31 | BASE_ESTIMATOR = "estimator" 32 | 33 | 34 | CLASSIFIER_INIT_PARAMETERS: Dict[str, Dict[str, Any]] = { 35 | "CalibratedClassifierCVDF": { 36 | BASE_ESTIMATOR: classification.RandomForestClassifierDF() 37 | }, 38 | "ClassifierChainDF": {"base_estimator": classification.RandomForestClassifierDF()}, 39 | "MultiOutputClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 40 | "MultiOutputClassifierDF_partial_fit": {"estimator": classification.PerceptronDF()}, 41 | "OneVsOneClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 42 | "OneVsRestClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 43 | "OutputCodeClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 44 | "VotingClassifierDF": { 45 | "estimators": [ 46 | ("rfc", classification.RandomForestClassifierDF()), 47 | ("svmc", classification.SVCDF(probability=True)), 48 | ], 49 | "voting": "soft", 50 | }, 51 | "StackingClassifierDF": { 52 | "estimators": ( 53 | ("Forest", classification.RandomForestClassifierDF(max_depth=5)), 54 | ("Logit", classification.LogisticRegressionCVDF()), 55 | ("AdaBoost", classification.AdaBoostClassifierDF()), 56 | ) 57 | }, 58 | } 59 | 60 | 61 | CLASSIFIERS_PARTIAL_FIT = [ 62 | classification.BernoulliNBDF, 63 | classification.MultinomialNBDF, 64 | classification.PerceptronDF, 65 | classification.SGDClassifierDF, 66 | classification.PassiveAggressiveClassifierDF, 67 | classification.GaussianNBDF, 68 | classification.ComplementNBDF, 69 | classification.MultiOutputClassifierDF, 70 | classification.CategoricalNBDF, 71 | ] 72 | 73 | 74 | @pytest.mark.parametrize( # type: ignore 75 | argnames="sklearndf_cls", argvalues=CLASSIFIERS_TO_TEST 76 | ) 77 | def test_wrapped_fit_predict( 78 | sklearndf_cls: Type[ClassifierDF], 79 | iris_features: pd.DataFrame, 80 | iris_target_sr: pd.Series, 81 | iris_targets_df: pd.DataFrame, 82 | iris_targets_binary_df: pd.DataFrame, 83 | ) -> None: 84 | """Test fit & predict & predict[_log]_proba of wrapped sklearn classifiers""" 85 | # noinspection PyArgumentList 86 | parameters: Dict[str, Any] = CLASSIFIER_INIT_PARAMETERS.get( 87 | sklearndf_cls.__name__, {} 88 | ) 89 | # noinspection PyArgumentList 90 | classifier: ClassifierDF = sklearndf_cls(**parameters) 91 | 92 | assert is_classifier(classifier) 93 | 94 | is_chain = isinstance(classifier.native_estimator, ClassifierChain) 95 | 96 | is_multi_output = isinstance(classifier.native_estimator, MultiOutputClassifier) 97 | check_expected_not_fitted_error(estimator=classifier) 98 | 99 | if is_chain: 100 | # for chain classifiers, classes must be numerical so the preceding 101 | # classification can 
act as input to the next classification 102 | classes = set(range(iris_targets_binary_df.shape[1])) 103 | classifier.fit(X=iris_features, y=iris_targets_binary_df) 104 | elif is_multi_output: 105 | classes = set( 106 | chain( 107 | *( 108 | list(iris_targets_df.iloc[:, col].unique()) 109 | for col in range(iris_targets_df.shape[1]) 110 | ) 111 | ) 112 | ) 113 | classifier.fit(X=iris_features, y=iris_targets_df) 114 | else: 115 | classes = set(iris_target_sr.unique()) 116 | classifier.fit(X=iris_features, y=iris_target_sr) 117 | 118 | predictions = classifier.predict(X=iris_features) 119 | 120 | # test predictions data-type, length and values 121 | assert isinstance( 122 | predictions, pd.DataFrame if is_multi_output or is_chain else pd.Series 123 | ) 124 | assert len(predictions) == len(iris_target_sr) 125 | assert np.all(predictions.isin(classes)) 126 | 127 | # test predict_proba & predict_log_proba: 128 | for method_name in ["predict_proba", "predict_log_proba"]: 129 | method = getattr(classifier, method_name) 130 | 131 | if hasattr(classifier.native_estimator, method_name): 132 | predictions = method(X=iris_features) 133 | 134 | if is_multi_output: 135 | assert isinstance(predictions, list) 136 | assert classifier.output_names_ == iris_targets_df.columns.tolist() 137 | assert classifier.n_outputs_ == len(predictions) 138 | else: 139 | if is_chain: 140 | assert ( 141 | classifier.output_names_ 142 | == iris_targets_binary_df.columns.tolist() 143 | ) 144 | assert classifier.n_outputs_ == predictions.shape[1] 145 | else: 146 | assert classifier.output_names_ == [iris_target_sr.name] 147 | assert classifier.n_outputs_ == 1 148 | 149 | predictions = [predictions] 150 | 151 | for prediction in predictions: 152 | # test type and shape of predictions 153 | assert isinstance(prediction, pd.DataFrame) 154 | assert len(prediction) == len(iris_target_sr) 155 | assert prediction.shape == (len(iris_target_sr), len(classes)) 156 | # check correct labels are set as columns 157 | assert set(prediction.columns) == classes 158 | else: 159 | with pytest.raises(NotImplementedError): 160 | method(X=iris_features) 161 | 162 | 163 | @pytest.mark.parametrize( # type: ignore 164 | argnames="sklearndf_cls", argvalues=CLASSIFIERS_PARTIAL_FIT 165 | ) 166 | def test_wrapped_partial_fit( 167 | sklearndf_cls: Type[ClassifierDF], 168 | iris_features: pd.DataFrame, 169 | iris_target_sr: pd.Series, 170 | iris_targets_df: pd.DataFrame, 171 | ) -> None: 172 | # noinspection PyArgumentList 173 | classifier: ClassifierDF = sklearndf_cls( 174 | **CLASSIFIER_INIT_PARAMETERS.get(f"{sklearndf_cls.__name__}_partial_fit", {}) 175 | ) 176 | 177 | is_multi_output = isinstance(classifier.native_estimator, MultiOutputClassifier) 178 | if is_multi_output: 179 | classes = iris_targets_df.apply(lambda col: col.unique()).transpose().values 180 | iris_target = iris_targets_df 181 | else: 182 | classes = iris_target_sr.unique() 183 | iris_target = iris_target_sr 184 | 185 | with pytest.raises( 186 | ValueError, 187 | match="classes must be passed on the first call to partial_fit.", 188 | ): 189 | classifier.partial_fit(iris_features, iris_target) 190 | 191 | classifier.partial_fit(iris_features, iris_target, classes) 192 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_clustering.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import sklearndf.clustering 7 | from 
sklearndf import ClusterDF, __sklearn_1_1__, __sklearn_1_3__, __sklearn_version__ 8 | from sklearndf.clustering import FeatureAgglomerationDF 9 | from test.sklearndf import iterate_classes 10 | 11 | # noinspection PyTypeChecker 12 | CLUSTERERS_TO_TEST = iterate_classes( 13 | from_modules=sklearndf.clustering, 14 | matching=r".*DF", 15 | excluding=[ClusterDF.__name__, r".*WrapperDF", FeatureAgglomerationDF.__name__], 16 | ) 17 | # FeatureAgglomeration doesn't support `fit_predict` method 18 | CLUSTERERS_WITH_AGGLOMERATION = CLUSTERERS_TO_TEST + [FeatureAgglomerationDF] 19 | 20 | 21 | def test_clusterer_count() -> None: 22 | n = len(CLUSTERERS_TO_TEST) 23 | 24 | print(f"Testing {n} clusterers.") 25 | 26 | if __sklearn_version__ < __sklearn_1_1__: 27 | assert n == 9 28 | elif __sklearn_version__ < __sklearn_1_3__: 29 | assert n == 10 30 | else: 31 | assert n == 11 32 | 33 | 34 | @pytest.mark.parametrize( # type: ignore 35 | argnames="sklearn_clusterer_cls", argvalues=CLUSTERERS_TO_TEST 36 | ) 37 | def test_clusterer_fit_predict_call( 38 | iris_features: pd.DataFrame, sklearn_clusterer_cls: Type[ClusterDF] 39 | ) -> None: 40 | """Check if each sklearndf clusterer supports fit_predict method""" 41 | 42 | clusterer_instance = sklearn_clusterer_cls() 43 | 44 | assert not clusterer_instance.is_fitted 45 | result_prediction = clusterer_instance.fit_predict(iris_features) 46 | assert type(result_prediction) == pd.Series 47 | assert clusterer_instance.is_fitted 48 | 49 | 50 | @pytest.mark.parametrize( # type: ignore 51 | argnames="sklearn_clusterer_cls", argvalues=CLUSTERERS_WITH_AGGLOMERATION 52 | ) 53 | def test_clusterer_fit_call( 54 | iris_features: pd.DataFrame, sklearn_clusterer_cls: Type[ClusterDF] 55 | ) -> None: 56 | """Check if each sklearndf clusterer supports fit method""" 57 | 58 | clusterer_instance = sklearn_clusterer_cls() 59 | 60 | assert not clusterer_instance.is_fitted 61 | clusterer_instance.fit(iris_features) 62 | assert clusterer_instance.is_fitted 63 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_meta_estimators.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | import pytest 5 | from sklearn.base import is_classifier, is_regressor 6 | from sklearn.impute import SimpleImputer 7 | 8 | from sklearndf.classification import ( 9 | ClassifierChainDF, 10 | LogisticRegressionCVDF, 11 | LogisticRegressionDF, 12 | RandomForestClassifierDF, 13 | VotingClassifierDF, 14 | ) 15 | from sklearndf.pipeline import ClassifierPipelineDF, PipelineDF, RegressorPipelineDF 16 | from sklearndf.regression import ( 17 | ElasticNetDF, 18 | LinearRegressionDF, 19 | MultiOutputRegressorDF, 20 | RandomForestRegressorDF, 21 | RidgeCVDF, 22 | ) 23 | from sklearndf.transformation import ColumnTransformerDF, StandardScalerDF 24 | 25 | log = logging.getLogger(__name__) 26 | 27 | 28 | def test_meta_estimators() -> None: 29 | with pytest.warns( 30 | expected_warning=UserWarning, 31 | match=( 32 | "^the following attributes of VotingClassifierDF have been replaced with " 33 | "their native scikit-learn counterparts: estimators$" 34 | ), 35 | ): 36 | VotingClassifierDF(estimators=[("rf", RandomForestClassifierDF())]) 37 | 38 | with pytest.raises( 39 | TypeError, 40 | match=( 41 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 42 | "but got: ClassifierPipelineDF" 43 | ), 44 | ): 45 | VotingClassifierDF( 46 | estimators=[ 47 | ("rf", 
ClassifierPipelineDF(classifier=RandomForestClassifierDF())) 48 | ] 49 | ) 50 | 51 | with pytest.warns( 52 | expected_warning=UserWarning, 53 | match=( 54 | "^the following attributes of MultiOutputRegressorDF have been replaced " 55 | "with their native scikit-learn counterparts: estimator$" 56 | ), 57 | ): 58 | regressor = MultiOutputRegressorDF(estimator=RandomForestRegressorDF()) 59 | assert is_regressor(regressor) 60 | 61 | with pytest.raises( 62 | TypeError, 63 | match=( 64 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 65 | "but got: RegressorPipelineDF" 66 | ), 67 | ): 68 | MultiOutputRegressorDF( 69 | estimator=RegressorPipelineDF(regressor=RandomForestRegressorDF()) 70 | ) 71 | 72 | with pytest.warns( 73 | expected_warning=UserWarning, 74 | match=( 75 | "^the following attributes of ClassifierChainDF have been replaced " 76 | "with their native scikit-learn counterparts: base_estimator$" 77 | ), 78 | ): 79 | classifier = ClassifierChainDF(base_estimator=RandomForestClassifierDF()) 80 | assert is_classifier(classifier) 81 | 82 | with pytest.raises( 83 | TypeError, 84 | match=( 85 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 86 | "but got: SimpleImputer" 87 | ), 88 | ): 89 | ClassifierChainDF(base_estimator=SimpleImputer()) 90 | 91 | 92 | def test_stacking_regressor( 93 | diabetes_features: pd.DataFrame, diabetes_target_sr: pd.Series 94 | ) -> None: 95 | from sklearndf.regression import StackingRegressorDF 96 | 97 | # basic building blocks 98 | model1 = LinearRegressionDF() 99 | model2 = ElasticNetDF() 100 | feature_names = list(diabetes_features.columns) 101 | preprocessing = ColumnTransformerDF( 102 | [ 103 | ("scaled", StandardScalerDF(), feature_names[1:]), 104 | ("keep", "passthrough", feature_names[:1]), 105 | ] 106 | ) 107 | print(preprocessing) 108 | 109 | # Pipeline with stack works 110 | pipeline = PipelineDF( 111 | [ 112 | ("preprocessing", preprocessing), 113 | ( 114 | "stack", 115 | StackingRegressorDF( 116 | [ 117 | ("model1", model1), 118 | ("model2", model2), 119 | ] 120 | ), 121 | ), 122 | ] 123 | ) 124 | 125 | assert is_regressor(pipeline) 126 | 127 | pipeline.fit(diabetes_features, diabetes_target_sr) 128 | print(pipeline.predict(diabetes_features)) 129 | 130 | # Stack of Pipelines doesn't 131 | stack_of_pipelines = StackingRegressorDF( 132 | estimators=[ 133 | ( 134 | "pipeline1", 135 | PipelineDF([("preprocessing", preprocessing), ("model1", model1)]), 136 | ), 137 | ( 138 | "pipeline2", 139 | PipelineDF([("preprocessing", preprocessing), ("model2", model2)]), 140 | ), 141 | ("ignore", "drop"), 142 | ], 143 | final_estimator=RidgeCVDF(), 144 | ) 145 | 146 | assert is_regressor(stack_of_pipelines) 147 | 148 | stack_of_pipelines.fit(diabetes_features, diabetes_target_sr) 149 | 150 | pred = stack_of_pipelines.predict(diabetes_features) 151 | assert isinstance(pred, pd.Series) 152 | 153 | assert not stack_of_pipelines.final_estimator.is_fitted 154 | final_estimator_fitted = stack_of_pipelines.final_estimator_ 155 | assert final_estimator_fitted.feature_names_in_.to_list() == [ 156 | "pipeline1", 157 | "pipeline2", 158 | ] 159 | 160 | 161 | def test_stacking_classifier( 162 | iris_features: pd.DataFrame, iris_target_sr: pd.Series 163 | ) -> None: 164 | from sklearndf.classification import StackingClassifierDF 165 | 166 | # basic building blocks 167 | model1 = LogisticRegressionCVDF() 168 | model2 = RandomForestClassifierDF(max_depth=5) 169 | feature_names = iris_features.columns.to_list() 170 | 
preprocessing = ColumnTransformerDF( 171 | [ 172 | ("scaled", StandardScalerDF(), feature_names[1:]), 173 | ("keep", "passthrough", feature_names[:1]), 174 | ] 175 | ) 176 | 177 | # Pipeline with stack works 178 | pipeline = PipelineDF( 179 | [ 180 | ("preprocessing", preprocessing), 181 | ( 182 | "stack", 183 | StackingClassifierDF( 184 | [ 185 | ("model1", model1), 186 | ("model2", model2), 187 | ] 188 | ), 189 | ), 190 | ] 191 | ) 192 | 193 | assert is_classifier(pipeline) 194 | 195 | pipeline.fit(iris_features, iris_target_sr) 196 | 197 | # Stack of Pipelines doesn't 198 | stack_of_pipelines = StackingClassifierDF( 199 | estimators=[ 200 | ( 201 | "pipeline1", 202 | PipelineDF([("preprocessing", preprocessing), ("model1", model1)]), 203 | ), 204 | ( 205 | "pipeline2", 206 | PipelineDF([("preprocessing", preprocessing), ("model2", model2)]), 207 | ), 208 | ("ignore", "drop"), 209 | ], 210 | final_estimator=LogisticRegressionDF(), 211 | passthrough=True, 212 | ) 213 | 214 | assert is_classifier(pipeline) 215 | 216 | stack_of_pipelines.fit(iris_features, iris_target_sr) 217 | 218 | pred = stack_of_pipelines.predict_proba(iris_features) 219 | assert pred.columns.to_list() == ["setosa", "versicolor", "virginica"] 220 | 221 | assert not stack_of_pipelines.final_estimator.is_fitted 222 | final_estimator_fitted = stack_of_pipelines.final_estimator_ 223 | assert final_estimator_fitted.feature_names_in_.to_list() == [ 224 | "pipeline1_setosa", 225 | "pipeline1_versicolor", 226 | "pipeline1_virginica", 227 | "pipeline2_setosa", 228 | "pipeline2_versicolor", 229 | "pipeline2_virginica", 230 | "sepal length (cm)", 231 | "sepal width (cm)", 232 | "petal length (cm)", 233 | "petal width (cm)", 234 | ] 235 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_missing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | from sklearndf.wrapper import MissingEstimator 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | def test_missing() -> None: 11 | class MyMissingEstimator(MissingEstimator): 12 | pass 13 | 14 | with pytest.raises( 15 | RuntimeError, 16 | match=( 17 | "Estimator MyMissingEstimator is not available. " 18 | "Please install the package that implements it." 
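# note (added for clarity, not in the source file): MissingEstimator is the placeholder that sklearndf.wrapper substitutes for estimators whose optional backing package (for example arfs or boruta) is not installed; every subclass raises this RuntimeError on instantiation, which is also why test_extra.py filters its selectors with issubclass(cls.__wrapped__, MissingEstimator)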
19 | ), 20 | ): 21 | MyMissingEstimator(1, "2", a=2) 22 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_regression.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Type 2 | 3 | import pandas as pd 4 | import pytest 5 | from sklearn.base import BaseEstimator, is_regressor 6 | from sklearn.multioutput import MultiOutputRegressor, RegressorChain 7 | 8 | import sklearndf.regression 9 | from sklearndf import RegressorDF, TransformerDF 10 | from sklearndf.regression import ( 11 | SVRDF, 12 | IsotonicRegressionDF, 13 | LinearRegressionDF, 14 | MLPRegressorDF, 15 | MultiOutputRegressorDF, 16 | PassiveAggressiveRegressorDF, 17 | RandomForestRegressorDF, 18 | SGDRegressorDF, 19 | ) 20 | from sklearndf.wrapper import EstimatorWrapperDF 21 | from test.sklearndf import check_expected_not_fitted_error, iterate_classes 22 | 23 | # noinspection PyTypeChecker 24 | # ignore false alert about module type 25 | REGRESSORS_TO_TEST: List[Type[EstimatorWrapperDF[BaseEstimator]]] = iterate_classes( 26 | from_modules=sklearndf.regression, 27 | matching=r".*DF", 28 | excluding=[RegressorDF.__name__, TransformerDF.__name__, r".*WrapperDF"], 29 | ) 30 | 31 | 32 | def test_regressor_count() -> None: 33 | n = len(REGRESSORS_TO_TEST) 34 | 35 | print(f"Testing {n} regressors.") 36 | assert n == 55 37 | 38 | 39 | DEFAULT_REGRESSOR_PARAMETERS: Dict[str, Dict[str, Any]] = { 40 | "MultiOutputRegressorDF": dict(estimator=RandomForestRegressorDF()), 41 | "MultiOutputRegressorDF_partial_fit": dict(estimator=SGDRegressorDF()), 42 | "RegressorChainDF": dict(base_estimator=RandomForestRegressorDF()), 43 | "VotingRegressorDF": dict( 44 | estimators=[("rfr", RandomForestRegressorDF()), ("svr", SVRDF())] 45 | ), 46 | "StackingRegressorDF": dict( 47 | estimators=( 48 | ("Forest", RandomForestRegressorDF()), 49 | ("SVR", SVRDF()), 50 | ("Linear", LinearRegressionDF()), 51 | ) 52 | ), 53 | # the rank of Y is 1, so n_components needs to be 1 54 | "CCADF": dict(n_components=1), 55 | # the rank of Y is 1, so n_components needs to be 1 56 | "PLSCanonicalDF": dict(n_components=1), 57 | # use a solver that is still supported with scipy 1.11 58 | "QuantileRegressorDF": dict(solver="highs"), 59 | } 60 | 61 | REGRESSORS_PARTIAL_FIT = [ 62 | SGDRegressorDF, 63 | PassiveAggressiveRegressorDF, 64 | MultiOutputRegressorDF, 65 | MLPRegressorDF, 66 | ] 67 | 68 | 69 | @pytest.mark.parametrize( # type: ignore 70 | argnames="sklearndf_cls", argvalues=REGRESSORS_TO_TEST 71 | ) 72 | def test_wrapped_fit_predict( 73 | sklearndf_cls: Type[RegressorDF], 74 | diabetes_features: pd.DataFrame, 75 | diabetes_target_sr: pd.Series, 76 | diabetes_target_df: pd.DataFrame, 77 | ) -> None: 78 | """Test fit & predict of wrapped sklearn regressors""" 79 | parameters: Dict[str, Any] = DEFAULT_REGRESSOR_PARAMETERS.get( 80 | sklearndf_cls.__name__, {} 81 | ) 82 | 83 | # noinspection PyArgumentList 84 | regressor: RegressorDF = sklearndf_cls(**parameters) 85 | 86 | assert is_regressor(regressor) 87 | 88 | check_expected_not_fitted_error(estimator=regressor) 89 | 90 | if ( 91 | type(regressor).__name__.startswith("Multi") 92 | or isinstance(regressor.native_estimator, MultiOutputRegressor) 93 | or isinstance(regressor.native_estimator, RegressorChain) 94 | ): 95 | regressor.fit(X=diabetes_features, y=diabetes_target_df) 96 | 97 | else: 98 | if isinstance(regressor, IsotonicRegressionDF): 99 | # fit will fail when we have more than one feature 
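# note (added for clarity, not in the source file): IsotonicRegression fits a one-dimensional monotonic function, so scikit-learn raises a ValueError whenever X has more than one feature column; the test below therefore keeps only the "bmi" column and fits again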
100 | with pytest.raises(ValueError): 101 | regressor.fit(X=diabetes_features, y=diabetes_target_sr) 102 | # eliminate all features except one then continue testing 103 | diabetes_features = diabetes_features.loc[:, "bmi"] 104 | 105 | regressor.fit(X=diabetes_features, y=diabetes_target_sr) 106 | 107 | predictions = regressor.predict(X=diabetes_features) 108 | 109 | # test predictions data-type, length and values 110 | assert isinstance(predictions, (pd.Series, pd.DataFrame)) 111 | assert len(predictions) == len(diabetes_target_sr) 112 | 113 | 114 | @pytest.mark.parametrize( # type: ignore 115 | argnames="sklearndf_cls", argvalues=REGRESSORS_PARTIAL_FIT 116 | ) 117 | def test_wrapped_partial_fit( 118 | sklearndf_cls: Type[RegressorDF], 119 | diabetes_features: pd.DataFrame, 120 | diabetes_target_sr: pd.Series, 121 | diabetes_target_df: pd.DataFrame, 122 | ) -> None: 123 | # noinspection PyArgumentList 124 | regressor = sklearndf_cls( 125 | **DEFAULT_REGRESSOR_PARAMETERS.get(f"{sklearndf_cls.__name__}_partial_fit", {}) 126 | ) 127 | 128 | is_multi_output = isinstance(regressor.native_estimator, MultiOutputRegressor) 129 | diabetes_target = diabetes_target_df if is_multi_output else diabetes_target_sr 130 | 131 | # noinspection PyUnresolvedReferences 132 | regressor.partial_fit(diabetes_features, diabetes_target) 133 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_sklearn_coverage.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from types import ModuleType 3 | from typing import Dict, Iterable, List, Optional, Type, TypeVar, Union 4 | 5 | import pytest 6 | import sklearn 7 | from sklearn.base import ( 8 | BaseEstimator, 9 | ClassifierMixin, 10 | ClusterMixin, 11 | RegressorMixin, 12 | TransformerMixin, 13 | ) 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.utils.metaestimators import _BaseComposition 16 | 17 | import sklearndf.classification 18 | import sklearndf.clustering 19 | import sklearndf.pipeline 20 | import sklearndf.regression 21 | import sklearndf.transformation 22 | from ..conftest import UNSUPPORTED_SKLEARN_PACKAGES 23 | from . 
import find_all_submodules, iterate_classes, sklearn_delegate_classes 24 | from sklearndf import EstimatorDF 25 | 26 | T = TypeVar("T") 27 | 28 | 29 | GENERAL_COVERAGE_EXCLUSIONS = { 30 | # exclude all private classes: 31 | r"^_", 32 | # exclude all Base classes: 33 | r"^Base[A-Z]", 34 | # exclude all Mixin classes: 35 | r".*Mixin$", 36 | } 37 | 38 | CLASSIFIER_COVERAGE_EXCLUSIONS = { 39 | *GENERAL_COVERAGE_EXCLUSIONS, 40 | # Base classes and Mixins not following the convention 41 | "ForestClassifier", 42 | "_IdentityClassifier", 43 | } 44 | 45 | 46 | REGRESSOR_COVERAGE_EXCLUSIONS = { 47 | *GENERAL_COVERAGE_EXCLUSIONS, 48 | # Base classes and mix-ins 49 | "ForestRegressor", 50 | "GeneralizedLinearRegressor", 51 | # Private classes 52 | "_SigmoidCalibration", 53 | "_PLS", 54 | } 55 | 56 | 57 | TRANSFORMER_COVERAGE_EXCLUSIONS = { 58 | *GENERAL_COVERAGE_EXCLUSIONS, 59 | # class "Imputer" was deprecated in 0.20 and removed in 0.22 60 | "Imputer", 61 | # class "AgglomerationTransform" is just a mix-in class and 62 | # isn't meant to be used directly 63 | "AgglomerationTransform", 64 | } 65 | 66 | 67 | PIPELINE_COVERAGE_EXCLUSIONS = GENERAL_COVERAGE_EXCLUSIONS 68 | 69 | 70 | CLUSTERER_COVERAGE_EXCLUSIONS = { 71 | *GENERAL_COVERAGE_EXCLUSIONS, 72 | } 73 | 74 | 75 | UNSUPPORTED_SKLEARN_CLASSES = { 76 | sklearn_class.__name__ 77 | for sklearn_class in iterate_classes( 78 | from_modules=itertools.chain.from_iterable( 79 | (p, *find_all_submodules(p)) for p in UNSUPPORTED_SKLEARN_PACKAGES 80 | ), 81 | matching=".*", 82 | ) 83 | } 84 | 85 | 86 | def _find_sklearn_classes_to_cover( 87 | from_modules: Union[ModuleType, Iterable[ModuleType]], 88 | subclass_of: Type[T], 89 | excluding: Optional[Union[str, Iterable[str]]] = None, 90 | ) -> List[Type[T]]: 91 | return [ 92 | cls 93 | for cls in iterate_classes( 94 | from_modules=from_modules, matching=".*", excluding=excluding 95 | ) 96 | if issubclass(cls, subclass_of) 97 | ] 98 | 99 | 100 | def sklearn_classifier_classes() -> List[type]: 101 | return _find_sklearn_classes_to_cover( 102 | from_modules=find_all_submodules(sklearn), 103 | subclass_of=ClassifierMixin, 104 | excluding=CLASSIFIER_COVERAGE_EXCLUSIONS, 105 | ) 106 | 107 | 108 | def sklearn_regressor_classes() -> List[type]: 109 | return _find_sklearn_classes_to_cover( 110 | from_modules=find_all_submodules(sklearn), 111 | subclass_of=RegressorMixin, 112 | excluding=REGRESSOR_COVERAGE_EXCLUSIONS, 113 | ) 114 | 115 | 116 | def sklearn_pipeline_classes() -> List[type]: 117 | pipeline_modules = find_all_submodules(sklearn.pipeline) 118 | pipeline_modules.add(sklearn.pipeline) 119 | 120 | return _find_sklearn_classes_to_cover( 121 | from_modules=pipeline_modules, 122 | subclass_of=_BaseComposition, 123 | excluding=PIPELINE_COVERAGE_EXCLUSIONS, 124 | ) 125 | 126 | 127 | def sklearn_transformer_classes() -> List[type]: 128 | """Return all classes that are 'just' transformers, not learners or pipelines.""" 129 | transformer_mixin_classes = [ 130 | cls 131 | for cls in iterate_classes( 132 | from_modules=find_all_submodules(sklearn), 133 | matching=".*", 134 | excluding=TRANSFORMER_COVERAGE_EXCLUSIONS, 135 | ) 136 | if issubclass(cls, TransformerMixin) 137 | ] 138 | 139 | transformer_classes = list( 140 | set(transformer_mixin_classes) 141 | .difference(sklearn_classifier_classes()) 142 | .difference(sklearn_regressor_classes()) 143 | .difference(sklearn_pipeline_classes()) 144 | .difference(sklearn_clusterer_classes()) 145 | ) 146 | 147 | return transformer_classes 148 | 149 | 150 | def 
sklearn_clusterer_classes() -> List[type]: 151 | return _find_sklearn_classes_to_cover( 152 | from_modules=find_all_submodules(sklearn), 153 | subclass_of=ClusterMixin, 154 | excluding=CLUSTERER_COVERAGE_EXCLUSIONS, 155 | ) 156 | 157 | 158 | def _check_unexpected_sklearn_class(cls: type) -> None: 159 | f_cls_name = f"{cls.__module__}.{cls.__name__}" 160 | if cls.__name__ in UNSUPPORTED_SKLEARN_CLASSES: 161 | pytest.skip(f"Class {f_cls_name} is not wrapped but marked as unsupported") 162 | else: 163 | raise ValueError(f"Class {f_cls_name} is not wrapped") 164 | 165 | 166 | @pytest.mark.parametrize( # type: ignore 167 | argnames="sklearn_classifier_cls", argvalues=sklearn_classifier_classes() 168 | ) 169 | def test_classifier_coverage(sklearn_classifier_cls: Type[ClassifierMixin]) -> None: 170 | """Check if each sklearn classifier has a wrapped sklearndf counterpart.""" 171 | sklearn_classes: Dict[ 172 | Type[BaseEstimator], Type[EstimatorDF] 173 | ] = sklearn_delegate_classes(sklearndf.classification) 174 | 175 | if sklearn_classifier_cls not in sklearn_classes: 176 | _check_unexpected_sklearn_class(sklearn_classifier_cls) 177 | 178 | 179 | @pytest.mark.parametrize( # type: ignore 180 | argnames="sklearn_regressor_cls", argvalues=sklearn_regressor_classes() 181 | ) 182 | def test_regressor_coverage(sklearn_regressor_cls: Type[RegressorMixin]) -> None: 183 | """Check if each sklearn regressor has a wrapped sklearndf counterpart.""" 184 | sklearn_classes: Dict[ 185 | Type[BaseEstimator], Type[EstimatorDF] 186 | ] = sklearn_delegate_classes(sklearndf.regression) 187 | 188 | if sklearn_regressor_cls not in sklearn_classes: 189 | _check_unexpected_sklearn_class(sklearn_regressor_cls) 190 | 191 | 192 | @pytest.mark.parametrize( # type: ignore 193 | argnames="sklearn_transformer_cls", argvalues=sklearn_transformer_classes() 194 | ) 195 | def test_transformer_coverage(sklearn_transformer_cls: Type[TransformerMixin]) -> None: 196 | """Check if each sklearn transformer has a wrapped sklearndf counterpart.""" 197 | 198 | sklearn_classes: Dict[ 199 | Type[BaseEstimator], Type[EstimatorDF] 200 | ] = sklearn_delegate_classes(sklearndf.transformation) 201 | 202 | if sklearn_transformer_cls not in sklearn_classes: 203 | _check_unexpected_sklearn_class(sklearn_transformer_cls) 204 | 205 | 206 | @pytest.mark.parametrize( # type: ignore 207 | argnames="sklearn_pipeline_cls", argvalues=sklearn_pipeline_classes() 208 | ) 209 | def test_pipeline_coverage(sklearn_pipeline_cls: Type[Pipeline]) -> None: 210 | """Check if each sklearn pipeline estimator has 211 | a wrapped sklearndf counterpart.""" 212 | 213 | # noinspection PyTypeChecker 214 | sklearn_classes = sklearn_delegate_classes(sklearndf.pipeline) 215 | 216 | if sklearn_pipeline_cls not in sklearn_classes: 217 | _check_unexpected_sklearn_class(sklearn_pipeline_cls) 218 | 219 | 220 | @pytest.mark.parametrize( # type: ignore 221 | argnames="sklearn_clusterer_cls", argvalues=sklearn_clusterer_classes() 222 | ) 223 | def test_clusterer_coverage(sklearn_clusterer_cls: Type[ClusterMixin]) -> None: 224 | """Check if each sklearn clusterer has a wrapped sklearndf counterpart.""" 225 | sklearn_classes: Dict[ 226 | Type[BaseEstimator], Type[EstimatorDF] 227 | ] = sklearn_delegate_classes(sklearndf.clustering) 228 | 229 | if sklearn_clusterer_cls not in sklearn_classes: 230 | _check_unexpected_sklearn_class(sklearn_clusterer_cls) 231 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/test/test/sklearndf/transformation/__init__.py -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_extra.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Optional, Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from lightgbm import LGBMRegressor 7 | from packaging.version import Version 8 | 9 | from sklearndf import TransformerDF 10 | from sklearndf.pipeline import PipelineDF 11 | from sklearndf.regression import RandomForestRegressorDF 12 | from sklearndf.regression.extra import LGBMRegressorDF 13 | from sklearndf.transformation import SimpleImputerDF 14 | from sklearndf.transformation.extra import BoostAGrootaDF, BorutaDF, GrootCVDF, LeshyDF 15 | from sklearndf.wrapper import MissingEstimator 16 | 17 | # get the version of the arfs package 18 | __arfs_version__: Optional[Version] 19 | try: 20 | import arfs 21 | 22 | # the package is installed: record its version 23 | __arfs_version__ = Version(arfs.__version__) 24 | except ImportError: 25 | __arfs_version__ = None 26 | 27 | __arfs_1_1__ = Version("1.1") 28 | 29 | # set up regressors for use in the feature selection tests 30 | 31 | regressor_params = dict(max_depth=5, n_jobs=-3, random_state=42, n_estimators=100) 32 | lgbm_regressor = LGBMRegressor(**regressor_params) 33 | lgbm_regressor_df = LGBMRegressorDF(**regressor_params) 34 | 35 | parametrize_feature_selector_cls: Callable[ 36 | [Callable[..., None]], Callable[..., None] 37 | ] = pytest.mark.parametrize( 38 | # the class/parameter combinations to test for feature selection 39 | argnames=["feature_selector_cls", "feature_selector_params"], 40 | argvalues=[ 41 | (cls, params) 42 | for cls, params in [ 43 | # Boruta selector 44 | ( 45 | BorutaDF, 46 | dict( 47 | estimator=RandomForestRegressorDF( 48 | max_depth=5, n_jobs=-3, random_state=42, n_estimators=100 49 | ) 50 | ), 51 | ), 52 | # Various ARFS selectors 53 | (LeshyDF, dict(estimator=lgbm_regressor, random_state=42, perc=90)), 54 | (LeshyDF, dict(estimator=lgbm_regressor_df, random_state=42, perc=90)), 55 | ( 56 | BoostAGrootaDF, 57 | dict(est=lgbm_regressor, cutoff=1.1) 58 | if __arfs_version__ is None or __arfs_version__ < __arfs_1_1__ 59 | else dict(estimator=lgbm_regressor, cutoff=1.1), 60 | ), 61 | (GrootCVDF, dict()), 62 | ] 63 | if not issubclass(cls.__wrapped__, MissingEstimator) 64 | ], 65 | ) 66 | 67 | 68 | # 69 | # Test the feature selection classes 70 | # 71 | 72 | 73 | @parametrize_feature_selector_cls 74 | def test_feature_selection_df( 75 | feature_selector_cls: Type[TransformerDF], feature_selector_params: Dict[str, Any] 76 | ) -> None: 77 | """ 78 | Test feature selection using the Boruta or ARFS package on a simple synthetic 79 | dataset. 80 | 81 | :param feature_selector_cls: The feature selector class to test. 82 | :param feature_selector_params: The parameters to use for the feature selector.
83 | """ 84 | 85 | df = pd.DataFrame(data=np.random.randn(100, 5), columns=list("abcde")) 86 | x = df.iloc[:, :-1] 87 | y = df.iloc[:, -1] 88 | 89 | feature_selector = feature_selector_cls(**feature_selector_params) 90 | feature_selector.fit(x, y) 91 | assert set(feature_selector.feature_names_out_) <= {"a", "b", "c", "d", "e"} 92 | 93 | 94 | @parametrize_feature_selector_cls 95 | def test_feature_selection_pipeline_df( 96 | feature_selector_cls: Type[TransformerDF], 97 | feature_selector_params: Dict[str, Any], 98 | diabetes_df: pd.DataFrame, 99 | diabetes_target: str, 100 | ) -> None: 101 | """ 102 | Test feature selection using the Boruta or ARFS package on the diabetes 103 | dataset. 104 | 105 | :param feature_selector_cls: The feature selector class to test. 106 | :param feature_selector_params: The parameters to use for the feature selector. 107 | :param diabetes_df: The diabetes dataset. 108 | :param diabetes_target: The diabetes target column. 109 | """ 110 | 111 | feature_selector = feature_selector_cls(**feature_selector_params) 112 | 113 | diabetes_df = diabetes_df.sample(frac=0.5, random_state=42) 114 | 115 | feature_selection_pipeline = PipelineDF( 116 | steps=[ 117 | ( 118 | "preprocess", 119 | PipelineDF( 120 | steps=[ 121 | ("imputer", SimpleImputerDF()), 122 | ] 123 | ), 124 | ), 125 | ("selector", feature_selector), 126 | ] 127 | ) 128 | 129 | x = diabetes_df.drop(columns=diabetes_target) 130 | y = diabetes_df.loc[:, diabetes_target] 131 | 132 | feature_selection_pipeline.fit(x, y) 133 | 134 | selected_features = set(feature_selection_pipeline.feature_names_out_) 135 | try: 136 | assert selected_features == set(feature_selector.selected_features_) 137 | except AttributeError: 138 | pass 139 | 140 | assert {"bmi", "bp", "s5"}.issubset( 141 | selected_features 142 | ), "key features have been selected" 143 | 144 | assert len(selected_features) <= 5, "no more than 5 features were selected" 145 | 146 | assert (selected_features - {"bmi", "bp", "s5"}).issubset( 147 | {"sex", "s1", "s2", "s3", "s6"} 148 | ), "additional selected features were not completely irrelevant" 149 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_imputers.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from typing import Type 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | import sklearndf.transformation 10 | from sklearndf import TransformerDF 11 | from test.sklearndf import iterate_classes 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.DEBUG) 15 | 16 | IMPUTERS_TO_TEST = iterate_classes( 17 | from_modules=sklearndf.transformation, matching=r".*Imputer.*DF", excluding=[] 18 | ) 19 | 20 | 21 | @pytest.mark.parametrize( # type: ignore 22 | argnames=["imputer_cls", "add_indicator"], 23 | argvalues=itertools.product(IMPUTERS_TO_TEST, [True, False]), 24 | ) 25 | def test_imputer( 26 | imputer_cls: Type[TransformerDF], 27 | add_indicator: bool, 28 | ) -> None: 29 | """ 30 | Test imputer classes using the combinations of arguments from 31 | ``@pytest.mark.parametrize`` 32 | 33 | :param imputer_cls: the imputer class to test 34 | :param add_indicator: whether to add an indicator column 35 | :return: 36 | """ 37 | imputer_df = imputer_cls(add_indicator=add_indicator) 38 | imputer_cls_orig = type(imputer_df.native_estimator) 39 | 40 | test_data_x = pd.DataFrame( 41 | data=[[7, 2, 3], [4, np.nan, 6],
[10, 5, 9]], columns=["a", "b", "c"] 42 | ) 43 | test_data_x_with_all_nan = pd.DataFrame( 44 | data=[[7, np.nan, 3], [4, np.nan, 6], [np.nan, np.nan, np.nan]], 45 | columns=["a", "b", "c"], 46 | ) 47 | test_data_y = pd.DataFrame( 48 | data=[[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]], columns=["a", "b", "c"] 49 | ) 50 | 51 | # noinspection PyArgumentList 52 | imputer_orig = imputer_cls_orig(add_indicator=add_indicator) 53 | # noinspection PyUnresolvedReferences 54 | imputer_orig.fit(test_data_x.values) 55 | # noinspection PyUnresolvedReferences 56 | y_transformed = imputer_orig.transform(test_data_y) 57 | 58 | imputer_df.fit(test_data_x) 59 | y_transformed_df = imputer_df.transform(test_data_y) 60 | 61 | assert np.array_equal( 62 | np.round(y_transformed, 4), np.round(y_transformed_df.values, 4) 63 | ), ( 64 | f"Different imputation results! " 65 | f"sklearn:{y_transformed} " 66 | f"sklearndf: {y_transformed_df.values}" 67 | ) 68 | 69 | # test correct imputation (and returned column labels) 70 | # for the case when a full input series is NaN 71 | # noinspection PyUnresolvedReferences 72 | imputer_orig.fit(test_data_x_with_all_nan.values) 73 | # noinspection PyUnresolvedReferences 74 | y_transformed = imputer_orig.transform(test_data_y) 75 | 76 | imputer_df.fit(test_data_x_with_all_nan) 77 | y_transformed_df = imputer_df.transform(test_data_y) 78 | 79 | assert np.array_equal( 80 | np.round(y_transformed, 4), np.round(y_transformed_df.values, 4) 81 | ), ( 82 | f"Different imputation results! " 83 | f"sklearn:{y_transformed} " 84 | f"sklearndf: {y_transformed_df.values}" 85 | ) 86 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_sparse.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_index_equal, assert_series_equal 4 | 5 | from sklearndf._util import sparse_frame_density 6 | from sklearndf.pipeline import FeatureUnionDF, PipelineDF 7 | from sklearndf.transformation import CountVectorizerDF, TfidfTransformerDF 8 | 9 | 10 | def test_tfidf() -> None: 11 | # expected results 12 | 13 | word_feature_names = ( 14 | ["and", "document", "first", "here", "is", "it"] 15 | + ["last", "one", "or", "second", "the", "third", "this"] 16 | # single-word features 17 | ) 18 | bigram_feature_names = ( 19 | ["and the", "first document", "here is", "is it", "is the", "is this", "it the"] 20 | + ["last document", "or is", "second document", "the first", "the last"] 21 | + ["the second", "the third", "third one", "this the"] 22 | ) 23 | 24 | # create a simple toy corpus, inspired by scikit-learn's documentation 25 | 26 | corpus = pd.Series( 27 | [ 28 | "Here is the first document.", 29 | "Here is the second document.", 30 | "And the third one.", 31 | "Is this the first document?", 32 | "The last document?", 33 | "Or is it the second document?", 34 | ] 35 | ) 36 | corpus_named = corpus.rename("document") 37 | 38 | # count the words for every document in the corpus 39 | 40 | word_counter = CountVectorizerDF() 41 | 42 | with pytest.raises( 43 | ValueError, match="the name of the series passed as arg X must not be None$" 44 | ): 45 | word_counter.fit_transform(corpus) 46 | 47 | word_counts_sparse_df = word_counter.fit_transform(corpus_named) 48 | 49 | assert word_counter.feature_names_out_.to_list() == word_feature_names 50 | assert all( 51 | isinstance(dtype, pd.SparseDtype) for dtype in word_counts_sparse_df.dtypes 52 | ) 53 
| 54 | # compute the tf-idf values for every word in every document 55 | 56 | tfidf = TfidfTransformerDF() 57 | x_tfidf = tfidf.fit_transform(word_counts_sparse_df) 58 | 59 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in x_tfidf.dtypes) 60 | assert_index_equal(tfidf.feature_names_out_, word_counts_sparse_df.columns) 61 | assert_index_equal(tfidf.feature_names_out_, x_tfidf.columns) 62 | assert sparse_frame_density(x_tfidf) == pytest.approx(0.3589744) 63 | 64 | # count the bigrams for every document in the corpus 65 | 66 | bigram_counter = CountVectorizerDF(analyzer="word", ngram_range=(2, 2)) 67 | x2 = bigram_counter.fit_transform(corpus_named) 68 | assert bigram_counter.feature_names_out_.to_list() == bigram_feature_names 69 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in x2.dtypes) 70 | 71 | # create a pipeline that combines the word and bigram counter 72 | # and computes the tf-idf values for every word and bigram 73 | 74 | vectorize = FeatureUnionDF( 75 | [ 76 | ("words", word_counter), 77 | ("bigrams", bigram_counter), 78 | ] 79 | ) 80 | pipeline = PipelineDF( 81 | [ 82 | ("vectorize", vectorize), 83 | ("tfidf", tfidf), 84 | ] 85 | ) 86 | 87 | tfidf = pipeline.fit_transform(corpus_named) 88 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in tfidf.dtypes) 89 | assert_series_equal( 90 | pipeline.feature_names_original_, 91 | pd.Series( 92 | index=pd.Index( 93 | [f"words__{name}" for name in word_feature_names] 94 | + [f"bigrams__{name}" for name in bigram_feature_names], 95 | name="feature", 96 | ), 97 | data="document", # all features share the same input column, "document" 98 | name="feature_original", 99 | ), 100 | ) 101 | -------------------------------------------------------------------------------- /test/test/test_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test docstrings. 3 | """ 4 | 5 | from pytools.api import DocValidator 6 | 7 | 8 | def test_doc() -> None: 9 | assert DocValidator( 10 | root_dir="src", 11 | exclude_from_parameter_validation=( 12 | r"sklearndf\.(?:" 13 | + "|".join( 14 | f"(?:{pattern})" 15 | for pattern in ( 16 | # generated classes, except in the '.extra' subpackages 17 | r"(?:classification|clustering|regression|transformation)" 18 | r"\.(?!extra\.).*", 19 | # LGBM estimators in the '.extra' packages 20 | r"(?:classification|regression)\.extra\.LGBM.*", 21 | # XGBoost estimators in the '.extra' packages 22 | r"(?:classification|regression)\.extra\.XGB.*", 23 | # BorutaPy package 24 | r"transformation\.extra\.BorutaDF", 25 | # ARFS package 26 | r"transformation\.extra\.BoostAGrootaDF", 27 | r"transformation\.extra\.GrootCVDF", 28 | r"transformation\.extra\.LeshyDF", 29 | # scikit-learn pipeline classes 30 | r"pipeline\.(PipelineDF|FeatureUnionDF).*", 31 | # sparse frames version of FeatureUnion 32 | r"pipeline\.wrapper\.FeatureUnion\.", 33 | ) 34 | ) 35 | + ")" 36 | ), 37 | ).validate_doc(), "docstrings and type hints are valid" 38 | -------------------------------------------------------------------------------- /tmp/README.md: -------------------------------------------------------------------------------- 1 | This folder is for temporary files. It is not managed by git. 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py3, 3 | py37, 4 | py38, 5 | py39 6 | skip_missing_interpreters = true 7 | isolated_build = true 8 | minversion = 3.7 9 | distshare= {toxinidir}/dist/tox 10 | 11 | [testenv] 12 | changedir = . 13 | passenv = * 14 | setenv = 15 | PYTHONPATH = {toxinidir}{:}{toxinidir}/test 16 | PIP_DISABLE_PIP_VERSION_CHECK = 1 17 | PIP_EXTRA_INDEX_URL={env:FACET_PATH_URI}/pytools/dist/tox/simple 18 | 19 | # We change the install command to build packages from source that depend on numpy's 20 | # binary API. 21 | # This is necessary to prevent the notorious "RuntimeError: module compiled against API 22 | # version 0x… but this version of numpy is 0x…" error. 23 | install_command = 24 | python -m pip install {opts} {packages} --no-binary '{env:FACET_NO_BINARY}' 25 | 26 | extras = 27 | testing 28 | 29 | commands = 30 | # print all installed packages to stdout 31 | python -m pip freeze 32 | # run the tests 33 | pytest test/ -s 34 | 35 | [testenv:{py3,py37,py38,py39}-custom-deps] 36 | deps = 37 | # install custom dependencies 38 | gamma-pytools{env:FACET_V_GAMMA_PYTOOLS} 39 | joblib{env:FACET_V_JOBLIB} 40 | matplotlib{env:FACET_V_MATPLOTLIB} 41 | numpy{env:FACET_V_NUMPY} 42 | pandas{env:FACET_V_PANDAS} 43 | scikit-learn{env:FACET_V_SCIKIT_LEARN} 44 | scipy{env:FACET_V_SCIPY} 45 | typing_inspect{env:FACET_V_TYPING_INSPECT} 46 | # optional dependencies, for testing only 47 | arfs{env:FACET_V_ARFS} 48 | boruta{env:FACET_V_BORUTA} 49 | lightgbm{env:FACET_V_LIGHTGBM} 50 | xgboost{env:FACET_V_XGBOOST} 51 | 52 | [flake8] 53 | 54 | max-line-length = 88 55 | 56 | show-source = true 57 | 58 | ignore = 59 | W504, # line break after binary operator 60 | E402, # module level import not at top of file 61 | E731, # do not assign a lambda expression, use a def 62 | E741, # ignore not easy to read variables like i l I etc 63 | C408, # Unnecessary (dict/list/tuple) call - rewrite as a literal 64 | S001, # found modulo formatter (incorrect picks up mod operations) 65 | 66 | # Ignores below are added to prevent conflicts with Black formatter 67 | E231, # Missing whitespace after ',', ';', or ':' 68 | E203, # space before : 69 | W503, # line break before binary operator 70 | 71 | per-file-ignores = 72 | __init__.py: F401, F403, F405 73 | 74 | exclude = 75 | .eggs/*.py, 76 | venv/*, 77 | .venv/*, 78 | .git/* 79 | 80 | [coverage:report] 81 | ignore_errors = False 82 | show_missing = True 83 | 84 | [isort] 85 | profile=black 86 | src_paths=src,test 87 | known_local_folder=sklearndf,test 88 | known_first_party=pytools 89 | known_third_party=numpy,pandas,joblib,sklearn,matplotlib 90 | 91 | [pytest] 92 | addopts = 93 | --cov-report=html:coverage_html 94 | --cov-report=xml:coverage.xml 95 | --cov-config=setup.cfg 96 | --cov-report=term-missing:skip-covered 97 | --no-cov-on-fail 98 | testpaths= test/test/ 99 | log_cli_level=ERROR 100 | cache_dir=.pytest_cache --------------------------------------------------------------------------------
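# A minimal usage sketch, not part of the repository: every test above relies
# on the same sklearndf contract, namely that wrapped estimators accept and
# return pandas objects rather than numpy arrays. Loading the iris data through
# scikit-learn's load_iris(as_frame=True) is an assumption made for
# illustration only.
import pandas as pd
from sklearn.datasets import load_iris

from sklearndf.classification import RandomForestClassifierDF

iris = load_iris(as_frame=True)
X: pd.DataFrame = iris.data
y: pd.Series = iris.target.rename("species")

clf = RandomForestClassifierDF(n_estimators=10, random_state=42)
clf.fit(X, y)

pred = clf.predict(X)  # a pd.Series sharing X's index, not an ndarray
proba = clf.predict_proba(X)  # a pd.DataFrame with one column per class label

assert isinstance(pred, pd.Series)
assert set(proba.columns) == set(y.unique())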