├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .idea ├── gamma-sklearndf.iml └── sklearndf.iml ├── .pre-commit-config.yaml ├── LICENSE ├── README.rst ├── RELEASE_NOTES.rst ├── azure-pipelines.yml ├── condabuild └── meta.yaml ├── config ├── spelling.dic └── test_config.yml ├── dev-setup.sh ├── environment.yml ├── make.py ├── mypy.ini ├── pypi_description.rst ├── pyproject.toml ├── sphinx ├── .gitignore ├── auxiliary │ └── Titanic_getting_started_example.ipynb ├── make.py └── source │ ├── _images │ ├── gamma_sklearndf_logo.png │ ├── sklearndf-class-hierarchy.graffle │ │ └── data.plist │ ├── sklearndf-class-hierarchy.svg │ └── sklearndf_logo.png │ ├── api_landing.rst │ ├── conf.py │ ├── contribution_guide.rst │ ├── faqs.rst │ ├── index.rst │ ├── tutorial │ └── sklearndf_tutorial.ipynb │ └── tutorials.rst ├── src └── sklearndf │ ├── __init__.py │ ├── _sklearn_version.py │ ├── _sklearndf.py │ ├── _util.py │ ├── classification │ ├── __init__.py │ ├── _classification.py │ ├── _classification_v0_22.py │ ├── _classification_v0_23.py │ ├── _classification_v1_0.py │ ├── extra │ │ ├── __init__.py │ │ └── _extra.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── clustering │ ├── __init__.py │ ├── _clustering.py │ ├── _clustering_v1_1.py │ ├── _clustering_v1_3.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── pipeline │ ├── __init__.py │ ├── _learner_pipeline.py │ ├── _pipeline.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── py.typed │ ├── regression │ ├── __init__.py │ ├── _regression.py │ ├── _regression_v0_22.py │ ├── _regression_v0_23.py │ ├── _regression_v1_0.py │ ├── extra │ │ ├── __init__.py │ │ └── _extra.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ ├── transformation │ ├── __init__.py │ ├── _transformation.py │ ├── _transformation_v0_22.py │ ├── _transformation_v0_24.py │ ├── _transformation_v1_0.py │ ├── _transformation_v1_1.py │ ├── _transformation_v1_3.py │ ├── extra │ │ ├── __init__.py │ │ ├── _extra.py │ │ └── wrapper │ │ │ ├── __init__.py │ │ │ └── _wrapper.py │ └── wrapper │ │ ├── __init__.py │ │ └── _wrapper.py │ └── wrapper │ ├── __init__.py │ ├── _missing.py │ ├── _wrapper.py │ ├── numpy │ ├── __init__.py │ └── _numpy.py │ └── stacking │ ├── __init__.py │ └── _stacking.py ├── test └── test │ ├── __init__.py │ ├── conftest.py │ ├── paths.py │ ├── sklearndf │ ├── __init__.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── test_classification_pipeline_df.py │ │ ├── test_clustering_pipeline.py │ │ ├── test_pipeline_df.py │ │ └── test_regression_pipeline_df.py │ ├── test_base.py │ ├── test_classification.py │ ├── test_clustering.py │ ├── test_meta_estimators.py │ ├── test_missing.py │ ├── test_regression.py │ ├── test_sklearn_coverage.py │ └── transformation │ │ ├── __init__.py │ │ ├── test_extra.py │ │ ├── test_imputers.py │ │ ├── test_sparse.py │ │ └── test_transformation.py │ └── test_docs.py ├── tmp └── README.md └── tox.ini /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. 
See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | ### JetBrains template 108 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 109 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 110 | 111 | # User-specific stuff 112 | .idea/**/workspace.xml 113 | .idea/**/tasks.xml 114 | .idea/**/dictionaries 115 | .idea/**/shelf 116 | 117 | # Sensitive or high-churn files 118 | .idea/**/dataSources/ 119 | .idea/**/dataSources.ids 120 | .idea/**/dataSources.local.xml 121 | .idea/**/sqlDataSources.xml 122 | .idea/**/dynamic.xml 123 | .idea/**/uiDesigner.xml 124 | .idea/**/dbnavigator.xml 125 | 126 | # Gradle 127 | .idea/**/gradle.xml 128 | .idea/**/libraries 129 | 130 | # CMake 131 | cmake-build-debug/ 132 | cmake-build-release/ 133 | 134 | # Mongo Explorer plugin 135 | .idea/**/mongoSettings.xml 136 | 137 | # File-based project format 138 | *.iws 139 | 140 | # IntelliJ 141 | out/ 142 | 143 | # mpeltonen/sbt-idea plugin 144 | .idea_modules/ 145 | 146 | # JIRA plugin 147 | atlassian-ide-plugin.xml 148 | 149 | # Cursive Clojure plugin 150 | .idea/replstate.xml 151 | 152 | # Crashlytics plugin (for Android Studio and IntelliJ) 153 | com_crashlytics_export_strings.xml 154 | crashlytics.properties 155 | crashlytics-build.properties 156 | fabric.properties 157 | 158 | # Editor-based Rest Client 159 | .idea/httpRequests 160 | ### TeX template 161 | ## Core latex/pdflatex auxiliary files: 162 | *.aux 163 | *.lof 164 | *.lot 165 | *.fls 166 | *.out 167 | *.toc 168 | *.fmt 169 | *.fot 170 | *.cb 171 | *.cb2 172 | .*.lb 173 | 174 | ## Intermediate documents: 175 | *.dvi 176 | *.xdv 177 | *-converted-to.* 178 | # these rules might exclude image files for figures etc. 
179 | # *.ps 180 | # *.eps 181 | # *.pdf 182 | 183 | ## Generated if empty string is given at "Please type another file name for output:" 184 | .pdf 185 | 186 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 187 | *.bbl 188 | *.bcf 189 | *.blg 190 | *-blx.aux 191 | *-blx.bib 192 | *.run.xml 193 | 194 | ## Build tool auxiliary files: 195 | *.fdb_latexmk 196 | *.synctex 197 | *.synctex(busy) 198 | *.synctex.gz 199 | *.synctex.gz(busy) 200 | *.pdfsync 201 | 202 | ## Build tool directories for auxiliary files 203 | # latexrun 204 | latex.out/ 205 | 206 | ## Auxiliary and intermediate files from other packages: 207 | # algorithms 208 | *.alg 209 | *.loa 210 | 211 | # achemso 212 | acs-*.bib 213 | 214 | # amsthm 215 | *.thm 216 | 217 | # beamer 218 | *.nav 219 | *.pre 220 | *.snm 221 | *.vrb 222 | 223 | # changes 224 | *.soc 225 | 226 | # cprotect 227 | *.cpt 228 | 229 | # elsarticle (documentclass of Elsevier journals) 230 | *.spl 231 | 232 | # endnotes 233 | *.ent 234 | 235 | # fixme 236 | *.lox 237 | 238 | # feynmf/feynmp 239 | *.mf 240 | *.mp 241 | *.t[1-9] 242 | *.t[1-9][0-9] 243 | *.tfm 244 | 245 | #(r)(e)ledmac/(r)(e)ledpar 246 | *.end 247 | *.?end 248 | *.[1-9] 249 | *.[1-9][0-9] 250 | *.[1-9][0-9][0-9] 251 | *.[1-9]R 252 | *.[1-9][0-9]R 253 | *.[1-9][0-9][0-9]R 254 | *.eledsec[1-9] 255 | *.eledsec[1-9]R 256 | *.eledsec[1-9][0-9] 257 | *.eledsec[1-9][0-9]R 258 | *.eledsec[1-9][0-9][0-9] 259 | *.eledsec[1-9][0-9][0-9]R 260 | 261 | # glossaries 262 | *.acn 263 | *.acr 264 | *.glg 265 | *.glo 266 | *.gls 267 | *.glsdefs 268 | 269 | # gnuplottex 270 | *-gnuplottex-* 271 | 272 | # gregoriotex 273 | *.gaux 274 | *.gtex 275 | 276 | # htlatex 277 | *.4ct 278 | *.4tc 279 | *.idv 280 | *.lg 281 | *.trc 282 | *.xref 283 | 284 | # hyperref 285 | *.brf 286 | 287 | # knitr 288 | *-concordance.tex 289 | # TODO Comment the next line if you want to keep your tikz graphics files 290 | *.tikz 291 | *-tikzDictionary 292 | 293 | # listings 294 | *.lol 295 | 296 | # makeidx 297 | *.idx 298 | *.ilg 299 | *.ind 300 | *.ist 301 | 302 | # minitoc 303 | *.maf 304 | *.mlf 305 | *.mlt 306 | *.mtc[0-9]* 307 | *.slf[0-9]* 308 | *.slt[0-9]* 309 | *.stc[0-9]* 310 | 311 | # minted 312 | _minted* 313 | *.pyg 314 | 315 | # morewrites 316 | *.mw 317 | 318 | # nomencl 319 | *.nlg 320 | *.nlo 321 | *.nls 322 | 323 | # pax 324 | *.pax 325 | 326 | # pdfpcnotes 327 | *.pdfpc 328 | 329 | # sagetex 330 | *.sagetex.sage 331 | *.sagetex.py 332 | *.sagetex.scmd 333 | 334 | # scrwfile 335 | *.wrt 336 | 337 | # sympy 338 | *.sout 339 | *.sympy 340 | sympy-plots-for-*.tex/ 341 | 342 | # pdfcomment 343 | *.upa 344 | *.upb 345 | 346 | # pythontex 347 | *.pytxcode 348 | pythontex-files-*/ 349 | 350 | # thmtools 351 | *.loe 352 | 353 | # TikZ & PGF 354 | *.dpth 355 | *.md5 356 | *.auxlock 357 | 358 | # todonotes 359 | *.tdo 360 | 361 | # easy-todo 362 | *.lod 363 | 364 | # xmpincl 365 | *.xmpi 366 | 367 | # xindy 368 | *.xdy 369 | 370 | # xypic precompiled matrices 371 | *.xyc 372 | 373 | # endfloat 374 | *.ttt 375 | *.fff 376 | 377 | # Latexian 378 | TSWLatexianTemp* 379 | 380 | ## Editors: 381 | # WinEdt 382 | *.bak 383 | *.sav 384 | 385 | # Texpad 386 | .texpadtmp 387 | 388 | # Kile 389 | *.backup 390 | 391 | # KBibTeX 392 | *~[0-9]* 393 | 394 | *.el 395 | 396 | # expex forward references with \gathertags 397 | *-tags.tex 398 | 399 | # standalone packages 400 | *.sta 401 | 402 | .DS_Store 403 | 404 | # 405 | # project specific 406 | # 407 | /tmp 408 | !/tmp/README.md 409 | 410 | # exclude docs while they are not yet stable 411 
| /docs/** 412 | !/docs/README.md 413 | 414 | # exclude notebooks directory: this is generated during build 415 | /notebooks/ 416 | 417 | # OmniGraffle previews 418 | **/*.graffle/preview.jpeg 419 | -------------------------------------------------------------------------------- /.idea/gamma-sklearndf.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 16 | -------------------------------------------------------------------------------- /.idea/sklearndf.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 27 | 28 | 30 | 31 | 33 | 34 | 36 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | 7 | - repo: https://github.com/psf/black 8 | rev: 23.10.1 9 | hooks: 10 | - id: black 11 | language: python_venv 12 | language_version: python39 13 | 14 | - repo: https://github.com/pycqa/flake8 15 | rev: 5.0.4 16 | hooks: 17 | - id: flake8 18 | name: flake8 19 | entry: flake8 --config tox.ini 20 | language: python_venv 21 | language_version: python39 22 | additional_dependencies: 23 | - flake8-comprehensions ~= 3.10 24 | types: [ python ] 25 | 26 | - repo: https://github.com/pre-commit/pre-commit-hooks 27 | rev: v4.3.0 28 | hooks: 29 | - id: check-added-large-files 30 | - id: check-json 31 | - id: check-xml 32 | - id: check-yaml 33 | language: python_venv 34 | exclude: condabuild/meta.yaml 35 | 36 | - repo: https://github.com/pre-commit/mirrors-mypy 37 | rev: v1.2.0 38 | hooks: 39 | - id: mypy 40 | files: src|sphinx|test 41 | language: python_venv 42 | language_version: python39 43 | additional_dependencies: 44 | - numpy~=1.24 45 | - gamma-pytools~=2.1 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020-2021 Boston Consulting Group 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /RELEASE_NOTES.rst: -------------------------------------------------------------------------------- 1 | Release Notes 2 | ============= 3 | 4 | .. |lightgbm| replace:: :external+lightgbm:doc:`lightgbm ` 5 | .. |xgboost| replace:: :external+xgboost:doc:`xgboost ` 6 | .. |mypy| replace:: :external+mypy:doc:`mypy ` 7 | .. |nbsp| unicode:: 0xA0 8 | :trim: 9 | 10 | *sklearndf* 2.3 11 | --------------- 12 | 13 | 14 | 2.3.0 15 | ~~~~~ 16 | 17 | *sklearndf* 2.3 adds support for 18 | `scikit-learn 1.3 `_ 19 | and drops support for *scikit-learn* |nbsp| 0.24. 20 | 21 | - API: add DF wrapper class :class:`.HDBSCANDF` for native estimator 22 | :class:`~sklearn.cluster.HDBSCAN` 23 | - API: add DF wrapper class :class:`.TargetEncoderDF` for native estimator 24 | :class:`~sklearn.preprocessing.TargetEncoder` 25 | 26 | 27 | *sklearndf* 2.2 28 | --------------- 29 | 30 | *sklearndf* 2.2 adds support for 31 | `scikit-learn 1.2 `_, and enhances the EstimatorDF 32 | API. 33 | 34 | 35 | 2.2.1 36 | ~~~~~ 37 | 38 | - VIZ: use *scikit-learn*'s native HTML representation of estimators, if available 39 | 40 | 41 | 2.2.0 42 | ~~~~~ 43 | 44 | *sklearndf* 2.2 adds support for 45 | `scikit-learn 1.2 `_. 46 | It drops support for *scikit-learn* |nbsp| 0.23 and earlier due to incomplete 47 | support of sparse output (see below). 48 | 49 | - API: DF estimators now support native estimators using sparse matrices as input or 50 | output, and automatically convert them to or from sparse :class:`~pandas.DataFrame` 51 | objects 52 | - API: new property :attr:`.EstimatorDF.output_names_` to get the names of the output 53 | columns the estimator was fitted with 54 | - API: new method :meth:`.LearnerPipelineDF.preprocess` to apply the preprocessing step 55 | to a data frame 56 | - API: remove properties ``feature_names_out_`` and ``feature_names_original_`` from 57 | class :class:`.LearnerPipelineDF` 58 | - API: :class:`~pandas.Index` instances obtained from 59 | :attr:`.EstimatorDF.feature_names_in_` and :attr:`.TransformerDF.feature_names_out_` 60 | are now named ``"feature"`` instead of ``"feature_in"`` and ``"feature_out"``, 61 | respectively, and :class:`~pandas.Series` instances obtained from 62 | :attr:`.TransformerDF.feature_names_original_` are now named ``"feature_original"`` 63 | instead of ``"feature_in"``, and their indices are now named ``"feature"`` instead 64 | of ``"feature_out"``; this is to separate the semantics of the originating property 65 | from the column index, which may be used in other contexts 66 | 67 | 68 | 69 | *sklearndf* 2.1 70 | --------------- 71 | 72 | *sklearndf* 2.1 adds support for 73 | `scikit-learn 1.1 `_. 74 | 75 | 76 | 2.1.1 77 | ~~~~~ 78 | 79 | This is a maintenance release to catch up with *sklearndf* |nbsp| 2.0.2. 
80 | 81 | 82 | 2.1.0 83 | ~~~~~ 84 | 85 | - API: new clusterer :class:`.BisectingKMeansDF` 86 | - API: new transformer :class:`.MiniBatchNMFDF` 87 | - API: new transformer :class:`.RandomTreesEmbeddingDF`; note that class 88 | :class:`~sklearn.ensemble.RandomTreesEmbedding` existed previously in *scikit-learn*, 89 | but is based on :class:`~sklearn.base.TransformerMixin` only as of 90 | *scikit-learn* |nbsp| 1.1 91 | - API: support parameters ``max_categories`` and ``min_frequency`` of 92 | :class:`.OneHotEncoderDF`, introduced in *scikit-learn* |nbsp| 1.1 93 | - API: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF` 94 | - API: support ``"passthrough"`` as a transformer in :class:`.FeatureUnionDF` 95 | - API: remove ``GeneralizedLinearRegressorDF`` since the underlying native estimator is 96 | a base class and not intended to be used as a regressor of its own 97 | 98 | 99 | *sklearndf* 2.0 100 | --------------- 101 | 102 | *sklearndf* 2.0 adds support for 103 | `scikit-learn 1.0 `_, 104 | adds data frame support for clusterers along with additional API enhancements and 105 | improvements, and is now subject to static type checking with |mypy|. 106 | 107 | 108 | 2.0.2 109 | ~~~~~ 110 | 111 | - BUILD: add support for :mod:`pandas` 2.0 and above 112 | - FIX: property :attr:`.PCADF.n_components_` now returns the value of 113 | :attr:`~sklearndf.decomposition.PCA.n_components_`, not 114 | :attr:`~sklearndf.decomposition.PCA.n_components` 115 | - FIX: detect missing and extra columns when validating data frames resulting from 116 | transforms, even when the total column count is correct 117 | 118 | 119 | 2.0.1 120 | ~~~~~ 121 | 122 | - API: upon declaration of new wrapper classes, automatically validate that their 123 | associated native estimators are compatible with the wrapper class 124 | - API: new public constants ``DROP`` and ``PASSTHROUGH`` in 125 | :class:`.ColumnTransformerDF` 126 | - FIX: base :class:`.LGBMClassifierDF` and :class:`.XGBClassifierDF` on 127 | the correct wrapper class :class:`.ClassifierWrapperDF` 128 | - FIX: support array-like values for parameter ``drop`` of :class:`.OneHotEncoderDF` 129 | - FIX: various minor tweaks and stability improvements 130 | 131 | 132 | 2.0.0 133 | ~~~~~ 134 | 135 | - API: :class:`.ClassifierDF` and :class:`.RegressorDF` get a new base class 136 | :class:`.SupervisedLearnerDF`, which in turn is based on :class:`.LearnerDF`; 137 | :class:`.SupervisedLearnerDF` implements method :meth:`~.SupervisedLearnerDF.score`, 138 | which is no longer implemented by :class:`.LearnerDF` 139 | - API: new class :class:`.ClusterDF`, based on :class:`.LearnerDF` 140 | - API: class :class:`.EstimatorDF` now implements the 141 | :class:`~pytools.expression.HasExpressionRepr` mix-in, rendering estimator 142 | representations as :class:`~pytools.expression.Expression` objects to enable better 143 | formatting 144 | - API: added data frame support for method 145 | :meth:`~.PartialFitRegressorWrapperDF.partial_fit` 146 | - API: removed ``OutlierRemoverDF`` 147 | - API: removed dependency on package |lightgbm|: :class:`.LGBMClassifierDF` and 148 | :class:`.LGBMRegressorDF` are still available if |lightgbm| is installed 149 | - API: added support for |xgboost|: :class:`.XGBRegressorDF` and 150 | :class:`.XGBClassifierDF` are available if |xgboost| is installed 151 | - API: DF wrapper classes are now created using proper class declarations to better 152 | conform with Python type conventions checked by |mypy|; 153 | see
:mod:`sklearndf.wrapper` for details 154 | - API: remove functions ``make_df_estimator``, ``make_df_classifier``, 155 | ``make_df_regressor``, and ``make_df_transformer`` which are now obsolete 156 | - API: move some classes in :mod:`sklearndf.wrapper` to sub-packages 157 | :mod:`sklearndf.wrapper.stacking` and :mod:`sklearndf.wrapper.numpy` to improve 158 | package navigability and to achieve better de-coupling of the underlying code; 159 | this change also moves :class:`~.StackingClassifierWrapperDF` and 160 | :class:`~.StackingRegressorWrapperDF` to package :mod:`sklearndf.wrapper.stacking` 161 | 162 | 163 | *sklearndf* 1.2 164 | --------------- 165 | 166 | This release adds support for `scikit-learn 0.24 `_. 167 | 168 | 169 | 1.2.3 170 | ~~~~~ 171 | 172 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.3. 173 | 174 | 175 | 1.2.2 176 | ~~~~~ 177 | 178 | This release makes small API tweaks, and catches up with *sklearndf* |nbsp| 1.1.2. 179 | 180 | - API: make type hints more specific in signatures for 181 | :func:`.make_df_transformer`, :func:`.make_df_classifier`, and 182 | :func:`.make_df_regressor` 183 | 184 | 185 | 1.2.1 186 | ~~~~~ 187 | 188 | This is a maintenance release to catch up with *sklearndf* |nbsp| 1.1.1. 189 | 190 | 191 | 1.2.0 192 | ~~~~~ 193 | 194 | - API: add `DF` adaptations for classes introduced by *scikit-learn* |nbsp| 0.24: 195 | :class:`.PolynomialCountSketchDF` and :class:`.SequentialFeatureSelectorDF` 196 | 197 | 198 | *sklearndf* 1.1 199 | --------------- 200 | 201 | 1.1.3 202 | ~~~~~ 203 | 204 | This release relaxes package dependencies to support any `numpy` version `1.x` from 205 | 1.16. 206 | 207 | 208 | 1.1.2 209 | ~~~~~ 210 | 211 | This release improves compatibility with `scikit-learn` and fixes bugs. 212 | 213 | - API: add full support for the 214 | `_estimator_type `__ 215 | attribute 216 | - FIX: do not reset transformers when calling :meth:`.TransformerDF.inverse_transform` 217 | - FIX: accept `"passthrough"` as value for arg `remainder` of 218 | :class:`.ColumnTransformerDF` 219 | 220 | 221 | 1.1.1 222 | ~~~~~ 223 | 224 | This release addresses compatibility issues with meta-estimators. 225 | 226 | - FIX: support complex DF estimators inside :class:`.StackingEstimatorDF` 227 | - FIX: raise an exception if a base estimator is not supported by one of `sklearndf`'s 228 | implementations for DF meta-estimators 229 | 230 | 231 | 1.1.0 232 | ~~~~~ 233 | 234 | This release exposes the `wrapper` API used to generate augmented DF estimators from 235 | native `scikit-learn` estimators. 236 | 237 | - API: expose the :class:`.EstimatorWrapperDF` class hierarchy through the new 238 | :mod:`sklearndf.wrapper` package 239 | - API: create new `scikit-learn` wrapper classes with the new functions 240 | :func:`.make_df_estimator`, :func:`.make_df_classifier`, :func:`.make_df_regressor`, 241 | and :func:`.make_df_transformer` 242 | 243 | 244 | *sklearndf* 1.0 245 | --------------- 246 | 247 | 1.0.2 248 | ~~~~~ 249 | 250 | This is a maintenance release focusing on enhancements to the CI/CD pipeline and bug 251 | fixes. 
252 | 253 | - FIX: correctly mirror ``__init__`` signatures of native estimators to their 254 | corresponding DF estimators 255 | - FIX: do not mirror native estimator class attributes and protected members to 256 | DF estimators 257 | - FIX: support ``"passthrough"`` transformer in :class:`.ColumnTransformerDF` 258 | - FIX: support ``drop`` parameter in :class:`.OneHotEncoderDF` 259 | - BUILD: add support for `numpy` |nbsp| 1.20 260 | - BUILD: updates and changes to the CI/CD pipeline 261 | 262 | 263 | 1.0.1 264 | ~~~~~ 265 | 266 | Initial release. 267 | -------------------------------------------------------------------------------- /condabuild/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: sklearndf 3 | version: {{ environ.get('FACET_BUILD_SKLEARNDF_VERSION') }} 4 | 5 | source: 6 | git_url: ../ 7 | 8 | build: 9 | noarch: python 10 | script: "flit install --deps none" 11 | 12 | requirements: 13 | host: 14 | - pip>=20.* 15 | - python {{ environ.get('FACET_V_PYTHON', '=3.8.*') }} 16 | - numpy {{ environ.get('FACET_V_NUMPY', '>=1.11.*') }} 17 | - flit>=3.0.* 18 | - packaging>=20 19 | run: 20 | - gamma-pytools {{ environ.get('FACET_V_GAMMA_PYTOOLS') }} 21 | - numpy {{ environ.get('FACET_V_NUMPY') }} 22 | - packaging {{ environ.get('FACET_V_PACKAGING') }} 23 | - pandas {{ environ.get('FACET_V_PANDAS') }} 24 | - python {{ environ.get('FACET_V_PYTHON') }} 25 | - scikit-learn {{ environ.get('FACET_V_SCIKIT_LEARN') }} 26 | - scipy {{ environ.get('FACET_V_SCIPY') }} 27 | test: 28 | imports: 29 | - sklearndf 30 | - sklearndf.classification 31 | - sklearndf.classification.extra 32 | - sklearndf.pipeline 33 | - sklearndf.regression 34 | - sklearndf.regression.extra 35 | - sklearndf.transformation 36 | - sklearndf.transformation.extra 37 | requires: 38 | - pytest ~= 7.1 39 | # we need pip to install arfs 40 | - pip # {{ '[False]' if not environ.get('FACET_V_ARFS') }} 41 | # optional libraries of sklearndf, needed for testing 42 | - boruta_py {{ environ.get('FACET_V_BORUTA', '[False]') }} 43 | - xgboost {{ environ.get('FACET_V_XGBOOST', '[False]') }} 44 | # we always need lightgbm for testing; version spec is optional 45 | - lightgbm {{ environ.get('FACET_V_LIGHTGBM', '') }} 46 | # additional requirements of gamma-pytools 47 | - joblib {{ environ.get('FACET_V_JOBLIB', '[False]') }} 48 | - matplotlib-base {{ environ.get('FACET_V_MATPLOTLIB', '[False]') }} 49 | - typing_inspect {{ environ.get('FACET_V_TYPING_INSPECT', '[False]') }} 50 | commands: 51 | - conda list 52 | - python -c 'import sklearndf; 53 | import os; 54 | assert sklearndf.__version__ == os.environ["PKG_VERSION"]' 55 | # optional PyPi package ARFS needed for testing 56 | {% if environ.get('FACET_V_ARFS') -%} 57 | - pip install 'arfs{{ environ.get("FACET_V_ARFS") }}' 58 | {%- endif %} 59 | # run the test suite 60 | - cd "${FACET_PATH}/sklearndf" 61 | - pytest -vs test 62 | 63 | about: 64 | home: https://github.com/BCG-X-Official/sklearndf 65 | license: Apache Software License v2.0 66 | license_file: LICENSE 67 | description: | 68 | sklearndf is an open source library designed to address a common need with 69 | scikit-learn: the outputs of transformers are numpy arrays, even when the input 70 | is a data frame. However, to inspect a model it is essential to keep track of 71 | the feature names. 
72 | dev_url: https://github.com/BCG-X-Official/sklearndf 73 | doc_url: https://bcg-x-official.github.io/sklearndf/ 74 | doc_source_url: https://github.com/BCG-X-Official/sklearndf/blob/develop/README.rst -------------------------------------------------------------------------------- /config/spelling.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/config/spelling.dic -------------------------------------------------------------------------------- /config/test_config.yml: -------------------------------------------------------------------------------- 1 | - inputfile: 2 | delimiter: "|" 3 | header: infer 4 | date_column_name : Date 5 | yield_column_name : Yield 6 | decimal: "," -------------------------------------------------------------------------------- /dev-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | conda env create -f environment.yml 3 | conda activate sklearndf-develop 4 | pre-commit install -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: sklearndf-develop 2 | channels: 3 | - conda-forge 4 | - bcg_gamma 5 | dependencies: 6 | # run 7 | - boruta_py ~= 0.3 8 | - gamma-pytools ~= 2.1 9 | - joblib ~= 1.2 10 | - lightgbm ~= 3.3 11 | - matplotlib ~= 3.7 12 | - numpy ~= 1.24 13 | - pandas ~= 2.0 14 | - pip ~= 23.3 15 | - python ~= 3.9 16 | - scikit-learn ~= 1.2.0 17 | - scipy ~= 1.11 18 | - xgboost ~= 1.7 19 | - pip: 20 | - arfs ~= 1.1 21 | # test 22 | - pytest ~= 7.2.1 23 | - pytest-cov ~= 2.12.1 24 | # sphinx 25 | - nbsphinx ~= 0.8.9 26 | - sphinx ~= 4.5.0 27 | - sphinx-autodoc-typehints ~= 1.19.2 28 | - pydata-sphinx-theme ~= 0.8.1 29 | # notebooks 30 | - ipywidgets ~= 8.1 31 | - jupyterlab ~= 3.6 32 | - openpyxl ~= 3.1 33 | - seaborn ~= 0.13 34 | - tableone ~= 0.7 35 | -------------------------------------------------------------------------------- /make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | call the Python make file for the common conda build process residing in 'pytools' 4 | """ 5 | 6 | import os 7 | import sys 8 | 9 | SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) 10 | PYTOOLS_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, "pytools")) 11 | sys.path.insert(0, PYTOOLS_DIR) 12 | 13 | # noinspection PyUnresolvedReferences 14 | from make import run_make 15 | 16 | run_make() 17 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = True 3 | show_error_codes = True 4 | 5 | [mypy-arfs.*] 6 | ; TODO remove once PEP 561 is supported 7 | ignore_missing_imports = True 8 | 9 | [mypy-boruta.*] 10 | ; TODO remove once PEP 561 is supported 11 | ignore_missing_imports = True 12 | 13 | [mypy-lightgbm.*] 14 | ; TODO remove once PEP 561 is supported 15 | ignore_missing_imports = True 16 | 17 | [mypy-packaging.*] 18 | ; TODO remove once PEP 561 is supported 19 | ignore_missing_imports = True 20 | 21 | [mypy-pandas.*] 22 | ; TODO remove once PEP 561 is supported 23 | ignore_missing_imports = True 24 | 25 | [mypy-scipy.*] 26 | ; TODO remove once PEP 561 is supported 27 | ignore_missing_imports = True 28 | 29 | 
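; Note: each override in this file addresses the same limitation: the package
; is untyped, i.e. it ships neither type stubs nor a py.typed marker (PEP 561),
; so under strict mode mypy would flag its imports as missing library stubs.
; ignore_missing_imports silences only that diagnostic for the named package;
; all other strict checks remain in force.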
[mypy-sklearn.*] 30 | ; TODO remove once PEP 561 is supported 31 | ignore_missing_imports = True 32 | 33 | [mypy-xgboost.*] 34 | ; TODO remove once PEP 561 is supported 35 | ignore_missing_imports = True 36 | -------------------------------------------------------------------------------- /pypi_description.rst: -------------------------------------------------------------------------------- 1 | *sklearndf* is an open source library designed to address a common need with 2 | `scikit-learn `__: the outputs of 3 | transformers are numpy arrays, even when the input is a 4 | data frame. However, to inspect a model it is essential to keep track of the 5 | feature names. 6 | 7 | To this end, *sklearndf* enhances scikit-learn's estimators as follows: 8 | 9 | - **Preserve data frame structure**: 10 | Return data frames as results of transformations, preserving feature names as the column index. 11 | - **Feature name tracing**: 12 | Add additional estimator properties to enable tracing a feature name back to its original input feature; this is especially useful for transformers that create new features (e.g., one-hot encode), and for pipelines that include such transformers. 13 | - **Easy use**: 14 | Simply append DF at the end of your usual scikit-learn class names to get enhanced data frame support! 15 | 16 | .. Begin-Badges 17 | 18 | |pypi| |conda| |python_versions| |code_style| |made_with_sphinx_doc| |License_badge| 19 | 20 | .. End-Badges 21 | 22 | License 23 | --------------------------- 24 | 25 | *sklearndf* is licensed under Apache 2.0 as described in the 26 | `LICENSE `_ file. 27 | 28 | .. Begin-Badges 29 | 30 | .. |conda| image:: https://anaconda.org/bcg_gamma/sklearndf/badges/version.svg 31 | :target: https://anaconda.org/BCG_Gamma/sklearndf 32 | 33 | .. |pypi| image:: https://badge.fury.io/py/sklearndf.svg 34 | :target: https://pypi.org/project/sklearndf/ 35 | 36 | .. |python_versions| image:: https://img.shields.io/badge/python-3.7|3.8|3.9-blue.svg 37 | :target: https://www.python.org/downloads/release/python-380/ 38 | 39 | .. |code_style| image:: https://img.shields.io/badge/code%20style-black-000000.svg 40 | :target: https://github.com/psf/black 41 | 42 | .. |made_with_sphinx_doc| image:: https://img.shields.io/badge/Made%20with-Sphinx-1f425f.svg 43 | :target: https://bcg-x-official.github.io/sklearndf/index.html 44 | 45 | .. |license_badge| image:: https://img.shields.io/badge/License-Apache%202.0-olivegreen.svg 46 | :target: https://opensource.org/licenses/Apache-2.0 47 | 48 | .. 
End-Badges -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [tool.flit.sdist] 6 | exclude = [".idea", "tmp", "dist", ".tox", ".pytest_cache"] 7 | 8 | [tool.flit.metadata] 9 | module = "sklearndf" 10 | author = "Boston Consulting Group (BCG)" 11 | home-page = "https://github.com/BCG-X-Official/sklearndf" 12 | description-file = "pypi_description.rst" 13 | dist-name = "sklearndf" 14 | license = "Apache Software License v2.0" 15 | 16 | requires = [ 17 | "gamma-pytools ~=2.1", 18 | "numpy >=1.21,<2a", # cannot use ~= due to conda bug 19 | "packaging >=20", 20 | "pandas >=1", 21 | "scikit-learn >=1,<1.4a", 22 | "scipy ~=1.6", 23 | ] 24 | 25 | requires-python = ">=3.7,<4a" 26 | 27 | classifiers = [ 28 | "Development Status :: 5 - Production/Stable", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: MacOS", 32 | "Operating System :: Microsoft :: Windows", 33 | "Operating System :: POSIX :: Linux", 34 | "Operating System :: Unix", 35 | "Programming Language :: Python", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Programming Language :: Python :: 3.9", 40 | "Topic :: Scientific/Engineering", 41 | ] 42 | 43 | [tool.flit.metadata.requires-extra] 44 | testing = [ 45 | "pytest ~= 7.1", 46 | "pytest-cov ~= 2.12", 47 | # optional requirements for testing sklearndf 48 | "lightgbm ~= 3.0", 49 | "xgboost ~= 1.0", 50 | ] 51 | docs = [ 52 | "sphinx ~= 4.5", 53 | "sphinx-autodoc-typehints ~= 1.19", 54 | "pydata-sphinx-theme ~= 0.8.1", 55 | "jinja2 ~= 2.11", 56 | "nbsphinx ~= 0.8.9", 57 | "jupyter == 1", 58 | "docutils ~= 0.17", 59 | "xlrd ~= 1.2", 60 | "m2r ~= 0.2" 61 | ] 62 | 63 | [tool.flit.metadata.urls] 64 | Documentation = "https://bcg-x-official.github.io/sklearndf/" 65 | Repository = "https://github.com/BCG-X-Official/sklearndf" 66 | 67 | [build] 68 | # comma-separated list of packages to be built from source in pip min builds 69 | no-binary.min = ["matplotlib"] 70 | 71 | [build.matrix.min] 72 | # direct requirements of sklearndf 73 | boruta = "~=0.3.0" 74 | gamma-pytools = "~=2.1.0" 75 | lightgbm = "~=3.0.0" 76 | numpy = "==1.21.6" # cannot use ~= due to conda bug 77 | packaging = "~=20.9" 78 | pandas = "~=1.1.5" 79 | python = ">=3.7.12,<3.8a" # cannot use ~= due to conda bug 80 | scipy = "~=1.6.3" 81 | scikit-learn = "~=1.0.2" 82 | xgboost = "~=1.0.2" 83 | # additional minimum requirements of gamma-pytools 84 | joblib = "~=0.14.1" 85 | matplotlib = "~=3.0.3" 86 | typing_inspect = "~=0.4.0" 87 | 88 | [build.matrix.max] 89 | # direct requirements of sklearndf 90 | arfs = "~=1.1" 91 | gamma-pytools = "~=2.1" 92 | lightgbm = "~=3.3" 93 | numpy = ">=1.24,<2a" # cannot use ~= due to conda bug 94 | packaging = ">=20" 95 | pandas = "~=2.0" 96 | python = ">=3.11,<3.12a" # cannot use ~= due to conda bug 97 | scikit-learn = "~=1.3.2" 98 | scipy = "~=1.11" 99 | xgboost = "~=1.5" 100 | # additional maximum requirements of gamma-pytools 101 | joblib = "~=1.1" 102 | matplotlib = "~=3.5" 103 | typing_inspect = "~=0.7" 104 | 105 | [tool.black] 106 | # quiet = "True" 107 | line-length = 88 108 | target_version = ['py36'] 109 | include = '\.pyi?$' 110 | exclude = ''' 111 | ( 112 | /( 113 | \.eggs # exclude a few 
common directories in the 114 | | \.git # root of the project 115 | | \.hg 116 | | \.mypy_cache 117 | | \.tox 118 | | \.venv 119 | | data 120 | | docs 121 | | notebooks 122 | | sphinx 123 | )/ 124 | ) 125 | ''' 126 | -------------------------------------------------------------------------------- /sphinx/.gitignore: -------------------------------------------------------------------------------- 1 | base 2 | source/_generated 3 | source/apidoc 4 | -------------------------------------------------------------------------------- /sphinx/make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Make sphinx documentation using the pytools make utility 4 | """ 5 | import os 6 | from urllib import request 7 | 8 | BRANCH = "2.1.x" 9 | 10 | 11 | if __name__ == "__main__": 12 | # noinspection PyUnusedLocal 13 | def run_make(branch: str, working_directory: str) -> None: 14 | """Stub, overwritten by bootstrap.py""" 15 | 16 | # run the common make file available in the pytools repo 17 | with request.urlopen( 18 | f"https://raw.githubusercontent.com/BCG-X-Official/pytools/{BRANCH}" 19 | f"/sphinx/base/bootstrap.py" 20 | ) as response: 21 | exec(response.read().decode("utf-8"), globals()) 22 | 23 | run_make(branch=BRANCH, working_directory=os.path.dirname(__file__)) 24 | -------------------------------------------------------------------------------- /sphinx/source/_images/gamma_sklearndf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/gamma_sklearndf_logo.png -------------------------------------------------------------------------------- /sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf-class-hierarchy.graffle/data.plist -------------------------------------------------------------------------------- /sphinx/source/_images/sklearndf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/sphinx/source/_images/sklearndf_logo.png -------------------------------------------------------------------------------- /sphinx/source/api_landing.rst: -------------------------------------------------------------------------------- 1 | Augmented scikit-learn classes are named after their native scikit-learn counterparts, 2 | with `DF` added as a suffix: 3 | :class:`.SimpleImputerDF` takes the place of :class:`~sklearn.impute.SimpleImputer`, 4 | :class:`.RandomForestRegressorDF` takes the place of 5 | :class:`~sklearn.ensemble.RandomForestRegressor`, and so on. 6 | 7 | For all methods expecting an `X` argument for a feature matrix and potentially a 8 | `y` argument for one or more targets, `sklearndf` estimators expect a pandas 9 | :class:`~pandas.DataFrame` for `X` and a pandas :class:`~pandas.Series` for a 10 | 1‑dimensional `y`, or a pandas :class:`~pandas.DataFrame` for `y` when fitting to 11 | multiple targets or outputs. 12 | This includes methods such as :meth:`~EstimatorDF.fit`, 13 | :meth:`~TransformerDF.transform`, and :meth:`~LearnerDF.predict`. 
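A minimal sketch of this convention follows; the data set and column names are made
up purely for illustration:

.. code-block:: python

    import pandas as pd
    from sklearndf.regression import RandomForestRegressorDF

    # X is a pandas data frame; y is a pandas series for a 1-dimensional target
    X = pd.DataFrame({"age": [23, 35, 47, 29], "income": [40, 70, 95, 52]})
    y = pd.Series([1.0, 2.4, 3.1, 1.8], name="spend")

    regressor_df = RandomForestRegressorDF(n_estimators=10, random_state=42)
    regressor_df.fit(X, y)

    # the prediction is returned as a pandas series, sharing the row index of X
    predictions = regressor_df.predict(X)

Constructor parameters are mirrored from the native estimator, so apart from the
``DF`` suffix and the pandas return types this reads exactly like plain
scikit-learn code.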
14 | 15 | All estimators enhanced by `sklearndf` also implement an additional attribute 16 | :attr:`~EstimatorDF.feature_names_in_`, keeping track of the column names of the data 17 | frame used to fit the estimator. 18 | 19 | `sklearndf` transformers also implement attributes 20 | :attr:`~TransformerDF.feature_names_out_` and 21 | :attr:`~TransformerDF.feature_names_original_`, keeping track of the feature names of 22 | the transformed outputs as well as mapping output features back to the input features. 23 | This enables tracing features back to the original inputs even across complex 24 | pipelines (see also :class:`.PipelineDF`). 25 | 26 | `sklearndf` classes implement a class hierarchy that follows the taxonomy of 27 | scikit-learn classes (but is only partially reflected via class inheritance in the 28 | original `scikit-learn` implementation): 29 | 30 | | 31 | 32 | .. image:: /_images/sklearndf-class-hierarchy.svg 33 | :alt: sklearndf class hierarchy 34 | :align: center 35 | 36 | | 37 | 38 | - all `sklearndf` transformers are subclasses of :class:`.TransformerDF`, which in turn 39 | provides the API for all common transformer methods, e.g., 40 | :meth:`~TransformerDF.transform` 41 | 42 | - all `sklearndf` clusterers are subclasses of :class:`.ClusterDF`, which 43 | in turn provides the API for all common clustering methods, e.g., 44 | :meth:`~ClusterDF.fit_predict` 45 | 46 | - all `sklearndf` regressors are subclasses of :class:`.RegressorDF`, which 47 | in turn provides the API for all common regressor methods, e.g., 48 | :meth:`~LearnerDF.predict` 49 | 50 | - all `sklearndf` classifiers are subclasses of :class:`.ClassifierDF`, which 51 | in turn provides the API for all common classifier methods, e.g., 52 | :meth:`~ClassifierDF.predict_proba` 53 | 54 | - all `sklearndf` regressors and classifiers are subclasses of 55 | :class:`.SupervisedLearnerDF` 56 | 57 | - all `sklearndf` regressors, classifiers and clusterers are subclasses of 58 | :class:`.LearnerDF` 59 | 60 | - all `sklearndf` estimators are subclasses of :class:`.EstimatorDF` 61 | 62 | `sklearndf` introduces additional pipeline classes :class:`.RegressorPipelineDF`, 63 | :class:`.ClassifierPipelineDF`, and :class:`.ClusterPipelineDF`, with an abstract base 64 | class :class:`.LearnerPipelineDF`, to allow for easier handling of common types of ML 65 | pipelines. 66 | These classes implement pipelines with two steps -- one preprocessing step, followed by 67 | a learner as the second and final step. 68 | 69 | `sklearndf` also provides data frame support for a selection of custom or 3rd-party 70 | estimators, most notably :class:`.BorutaDF`, :class:`.LGBMRegressorDF`, 71 | :class:`.LGBMClassifierDF`, :class:`.XGBRegressorDF`, and :class:`.XGBClassifierDF`. 72 | 73 | All `sklearndf` estimators are fully type hinted. 74 | 75 | Please see the :ref:`release notes` for recent API updates and bug fixes. 76 | -------------------------------------------------------------------------------- /sphinx/source/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for the Sphinx documentation builder. 
3 | 4 | Receives the majority of the configuration from pytools conf_base.py 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | _dir_base = os.path.join(os.path.dirname(os.path.dirname(__file__)), "base") 11 | sys.path.insert(0, _dir_base) 12 | 13 | from conf_base import set_config 14 | 15 | # ----- set custom configuration ----- 16 | 17 | set_config( 18 | globals(), 19 | project="sklearndf", 20 | html_logo=os.path.join("_images", "gamma_sklearndf_logo.png"), 21 | intersphinx_mapping={ 22 | "lightgbm": ("https://lightgbm.readthedocs.io/en/latest/", None), 23 | "pytools": ("https://bcg-x-official.github.io/pytools/", None), 24 | "sklearn": ("https://scikit-learn.org/stable", None), 25 | "xgboost": ("https://xgboost.readthedocs.io/en/latest/", None), 26 | }, 27 | ) 28 | -------------------------------------------------------------------------------- /sphinx/source/faqs.rst: -------------------------------------------------------------------------------- 1 | .. _faqs: 2 | 3 | FAQ 4 | === 5 | 6 | Below you can find answers to commonly asked questions as well as how to 7 | cite *sklearndf*. 8 | 9 | Commonly asked questions 10 | ------------------------ 11 | 12 | If you don't see your answer there you could also try posting 13 | on `stackoverflow `_. 14 | 15 | 1. **What if I find a bug or have an idea for a new feature?** 16 | 17 | For bug reports or feature requests please use our 18 | `GitHub issue tracker `_. 19 | For any other enquiries please feel free to contact us at FacetTeam@bcg.com. 20 | 21 | 2. **How can I contribute?** 22 | 23 | We welcome contributors! If you have minor changes in mind that would like to 24 | contribute, please feel free to create a pull request and be sure to follow the 25 | developer guidelines. For large or extensive changes please feel free to open an 26 | issue, or reach out to us at FacetTeam@bcg.com to discuss. 27 | 28 | 29 | Citation 30 | -------- 31 | If you use *sklearndf* in your work please cite us as follows: 32 | 33 | Bibtex entry:: 34 | 35 | @manual{ 36 | title={sklearndf}, 37 | author={FACET Team at BCG Gamma}, 38 | year={2021}, 39 | note={Python package version 1.1.1} 40 | } 41 | 42 | -------------------------------------------------------------------------------- /sphinx/source/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: /_images/sklearndf_logo.png 2 | 3 | | 4 | 5 | Table of contents 6 | ----------------- 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | :titlesonly: 11 | 12 | Getting started <_generated/getting_started> 13 | API reference 14 | tutorials 15 | contribution_guide 16 | faqs 17 | _generated/release_notes 18 | -------------------------------------------------------------------------------- /sphinx/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials: 2 | 3 | Tutorials 4 | ========= 5 | 6 | 7 | 8 | 9 | Detailed *sklearndf* tutorial 10 | ------------------------------ 11 | 12 | Start exploring the tutorial right away by clicking on the section links below, and 13 | start running the code for yourself by downloading the notebook 14 | :download:`here `. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | 19 | tutorial/sklearndf_tutorial 20 | 21 | -------------------------------------------------------------------------------- /src/sklearndf/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data frame support and feature traceability for `scikit-learn`. 
3 | 4 | `sklearndf` augments more than 160 `scikit-learn` estimators for 5 | native support of data frames, while leaving the original API intact. 6 | """ 7 | 8 | from ._sklearn_version import * 9 | from ._sklearndf import * 10 | 11 | __version__ = "2.3.1" 12 | -------------------------------------------------------------------------------- /src/sklearndf/_sklearn_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Special constants for version checks for scikit-learn. 3 | """ 4 | 5 | from packaging.version import Version 6 | from sklearn import __version__ as sklearn_version 7 | 8 | __all__ = [ 9 | "__sklearn_version__", 10 | "__sklearn_1_1__", 11 | "__sklearn_1_2__", 12 | "__sklearn_1_3__", 13 | "__sklearn_1_4__", 14 | ] 15 | 16 | __sklearn_version__ = Version(sklearn_version) 17 | __sklearn_1_1__ = Version("1.1") 18 | __sklearn_1_2__ = Version("1.2") 19 | __sklearn_1_3__ = Version("1.3") 20 | __sklearn_1_4__ = Version("1.4") 21 | -------------------------------------------------------------------------------- /src/sklearndf/_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Auxiliary functions for internal use. 3 | """ 4 | 5 | from typing import Any, List, Optional, Union, cast 6 | 7 | import numpy.typing as npt 8 | import pandas as pd 9 | from scipy import sparse 10 | 11 | 12 | def hstack_frames( 13 | frames: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]], 14 | *, 15 | prefixes: Optional[List[str]] = None, 16 | ) -> Optional[pd.DataFrame]: 17 | """ 18 | If only data frames are passed, stack them horizontally. 19 | 20 | :param frames: a list of array-likes 21 | :param prefixes: an optional list of prefixes to use for the columns of each data 22 | frame in arg ``frames``; must have the same length as arg ``frames`` 23 | :return: the stacked data frame if all elements of ``frames`` are data frames; 24 | ``None`` otherwise 25 | """ 26 | if all(isinstance(frame, pd.DataFrame) for frame in frames): 27 | # all frames are data frames 28 | frames = cast(List[pd.DataFrame], frames) 29 | if prefixes is not None: 30 | assert len(prefixes) == len( 31 | frames 32 | ), "number of prefixes must match number of frames" 33 | frames = [ 34 | frame.add_prefix(f"{prefix}__") 35 | for frame, prefix in zip(frames, prefixes) 36 | ] 37 | return pd.concat(frames, axis=1) 38 | else: 39 | return None 40 | 41 | 42 | def is_sparse_frame(frame: pd.DataFrame) -> bool: 43 | """ 44 | Check if a data frame contains sparse columns. 45 | 46 | :param frame: the data frame to check 47 | :return: ``True`` if the data frame contains sparse columns; ``False`` otherwise 48 | """ 49 | 50 | return any(isinstance(dtype, pd.SparseDtype) for dtype in frame.dtypes) 51 | 52 | 53 | def sparse_frame_density(frame: pd.DataFrame) -> float: 54 | """ 55 | Compute the density of a data frame. 56 | 57 | The density of a data frame is the average density of its columns. 58 | The density of a sparse column is the ratio of non-sparse points to total (dense) 59 | data points. 60 | The density of a dense column is 1. 
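    For example (an illustrative doctest sketch): in a 4-row frame with one sparse
    column holding a single non-fill value (density 0.25) and one dense column
    (density 1.0), the frame density is (0.25 + 1.0) / 2 = 0.625::

        >>> sparse = pd.Series([1.0, 0, 0, 0], dtype=pd.SparseDtype("float", 0.0))
        >>> sparse_frame_density(pd.DataFrame({"s": sparse, "d": [1.0, 2.0, 3.0, 4.0]}))
        0.625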
61 | 62 | :param frame: a data frame 63 | :return: the density of the data frame 64 | """ 65 | 66 | def _density(sr: pd.Series) -> float: 67 | if isinstance(sr.dtype, pd.SparseDtype): 68 | return cast(float, sr.sparse.density) 69 | else: 70 | return 1.0 71 | 72 | return sum(_density(sr) for _, sr in frame.items()) / len(frame.columns) 73 | -------------------------------------------------------------------------------- /src/sklearndf/classification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` classifiers with enhanced support for data 3 | frames. 4 | """ 5 | from ._classification import * 6 | from ._classification_v0_22 import * 7 | from ._classification_v0_23 import * 8 | from ._classification_v1_0 import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification` 3 | """ 4 | import logging 5 | 6 | from sklearn.calibration import CalibratedClassifierCV 7 | from sklearn.discriminant_analysis import ( 8 | LinearDiscriminantAnalysis, 9 | QuadraticDiscriminantAnalysis, 10 | ) 11 | from sklearn.dummy import DummyClassifier 12 | from sklearn.ensemble import ( 13 | AdaBoostClassifier, 14 | BaggingClassifier, 15 | ExtraTreesClassifier, 16 | GradientBoostingClassifier, 17 | RandomForestClassifier, 18 | VotingClassifier, 19 | ) 20 | from sklearn.gaussian_process import GaussianProcessClassifier 21 | from sklearn.linear_model import ( 22 | LogisticRegression, 23 | LogisticRegressionCV, 24 | PassiveAggressiveClassifier, 25 | Perceptron, 26 | RidgeClassifier, 27 | RidgeClassifierCV, 28 | SGDClassifier, 29 | ) 30 | from sklearn.multiclass import ( 31 | OneVsOneClassifier, 32 | OneVsRestClassifier, 33 | OutputCodeClassifier, 34 | ) 35 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 36 | from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB 37 | from sklearn.neighbors import ( 38 | KNeighborsClassifier, 39 | NearestCentroid, 40 | RadiusNeighborsClassifier, 41 | ) 42 | from sklearn.neural_network import MLPClassifier 43 | from sklearn.semi_supervised import LabelPropagation, LabelSpreading 44 | from sklearn.svm import SVC, LinearSVC, NuSVC 45 | from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier 46 | 47 | from pytools.api import AllTracker 48 | 49 | from ..wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF 50 | from .wrapper import ( 51 | ClassifierChainWrapperDF, 52 | LinearDiscriminantAnalysisWrapperDF, 53 | MetaClassifierWrapperDF, 54 | MultiOutputClassifierWrapperDF, 55 | PartialFitClassifierWrapperDF, 56 | ) 57 | 58 | log = logging.getLogger(__name__) 59 | 60 | __all__ = [ 61 | "AdaBoostClassifierDF", 62 | "BaggingClassifierDF", 63 | "BernoulliNBDF", 64 | "CalibratedClassifierCVDF", 65 | "ClassifierChainDF", 66 | "ComplementNBDF", 67 | "DecisionTreeClassifierDF", 68 | "DummyClassifierDF", 69 | "ExtraTreeClassifierDF", 70 | "ExtraTreesClassifierDF", 71 | "GaussianNBDF", 72 | "GaussianProcessClassifierDF", 73 | "GradientBoostingClassifierDF", 74 | "KNeighborsClassifierDF", 75 | "LabelPropagationDF", 76 | "LabelSpreadingDF", 77 | "LinearDiscriminantAnalysisDF", 78 | "LinearSVCDF", 79 | "LogisticRegressionCVDF", 80 | "LogisticRegressionDF", 81 | "MLPClassifierDF", 82 | "MultinomialNBDF", 83 | 
"MultiOutputClassifierDF", 84 | "NearestCentroidDF", 85 | "NuSVCDF", 86 | "OneVsOneClassifierDF", 87 | "OneVsRestClassifierDF", 88 | "OutputCodeClassifierDF", 89 | "PassiveAggressiveClassifierDF", 90 | "PerceptronDF", 91 | "QuadraticDiscriminantAnalysisDF", 92 | "RadiusNeighborsClassifierDF", 93 | "RandomForestClassifierDF", 94 | "RidgeClassifierCVDF", 95 | "RidgeClassifierDF", 96 | "SGDClassifierDF", 97 | "SVCDF", 98 | "VotingClassifierDF", 99 | ] 100 | 101 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 102 | 103 | 104 | # 105 | # Ensure all symbols introduced below are included in __all__ 106 | # 107 | 108 | __tracker = AllTracker(globals()) 109 | 110 | 111 | # 112 | # Class definitions 113 | # 114 | 115 | 116 | # 117 | # Dummy 118 | # 119 | 120 | 121 | class DummyClassifierDF(ClassifierWrapperDF[DummyClassifier], native=DummyClassifier): 122 | """Stub for DF wrapper of class ``DummyClassifier``""" 123 | 124 | 125 | # 126 | # neighbors 127 | # 128 | 129 | 130 | class NearestCentroidDF(ClassifierWrapperDF[NearestCentroid], native=NearestCentroid): 131 | """Stub for DF wrapper of class ``NearestCentroid``""" 132 | 133 | 134 | class KNeighborsClassifierDF( 135 | ClassifierWrapperDF[KNeighborsClassifier], native=KNeighborsClassifier 136 | ): 137 | """Stub for DF wrapper of class ``KNeighborsClassifier``""" 138 | 139 | 140 | class RadiusNeighborsClassifierDF( 141 | ClassifierWrapperDF[RadiusNeighborsClassifier], native=RadiusNeighborsClassifier 142 | ): 143 | """Stub for DF wrapper of class ``RadiusNeighborsClassifier``""" 144 | 145 | 146 | # 147 | # voting 148 | # 149 | 150 | 151 | class VotingClassifierDF( 152 | MetaClassifierWrapperDF[VotingClassifier], native=VotingClassifier 153 | ): 154 | """Stub for DF wrapper of class ``VotingClassifier``""" 155 | 156 | 157 | # 158 | # ensemble 159 | # 160 | 161 | 162 | class RandomForestClassifierDF( 163 | ClassifierWrapperDF[RandomForestClassifier], native=RandomForestClassifier 164 | ): 165 | """Stub for DF wrapper of class ``RandomForestClassifier``""" 166 | 167 | 168 | class ExtraTreesClassifierDF( 169 | ClassifierWrapperDF[ExtraTreesClassifier], native=ExtraTreesClassifier 170 | ): 171 | """Stub for DF wrapper of class ``ExtraTreesClassifier``""" 172 | 173 | 174 | # noinspection PyAbstractClass 175 | class GradientBoostingClassifierDF( 176 | ClassifierWrapperDF[GradientBoostingClassifier], native=GradientBoostingClassifier 177 | ): 178 | """Stub for DF wrapper of class ``GradientBoostingClassifier``""" 179 | 180 | 181 | class AdaBoostClassifierDF( 182 | ClassifierWrapperDF[AdaBoostClassifier], native=AdaBoostClassifier 183 | ): 184 | """Stub for DF wrapper of class ``AdaBoostClassifier``""" 185 | 186 | 187 | class BaggingClassifierDF( 188 | ClassifierWrapperDF[BaggingClassifier], native=BaggingClassifier 189 | ): 190 | """Stub for DF wrapper of class ``BaggingClassifier``""" 191 | 192 | 193 | # 194 | # tree 195 | # 196 | 197 | 198 | class DecisionTreeClassifierDF( 199 | ClassifierWrapperDF[DecisionTreeClassifier], native=DecisionTreeClassifier 200 | ): 201 | """Stub for DF wrapper of class ``DecisionTreeClassifier``""" 202 | 203 | 204 | class ExtraTreeClassifierDF( 205 | ClassifierWrapperDF[ExtraTreeClassifier], native=ExtraTreeClassifier 206 | ): 207 | """Stub for DF wrapper of class ``ExtraTreeClassifier``""" 208 | 209 | 210 | # 211 | # discriminant analysis 212 | # 213 | 214 | 215 | class LinearDiscriminantAnalysisDF( 216 | LinearDiscriminantAnalysisWrapperDF, native=LinearDiscriminantAnalysis 217 | ): 
218 | """Stub for DF wrapper of class ``LinearDiscriminantAnalysis``""" 219 | 220 | 221 | class QuadraticDiscriminantAnalysisDF( 222 | ClassifierWrapperDF[QuadraticDiscriminantAnalysis], 223 | native=QuadraticDiscriminantAnalysis, 224 | ): 225 | """Stub for DF wrapper of class ``QuadraticDiscriminantAnalysis``""" 226 | 227 | 228 | # 229 | # naive bayes 230 | # 231 | 232 | 233 | class GaussianNBDF(PartialFitClassifierWrapperDF[GaussianNB], native=GaussianNB): 234 | """Stub for DF wrapper of class ``GaussianNB``""" 235 | 236 | 237 | class MultinomialNBDF( 238 | PartialFitClassifierWrapperDF[MultinomialNB], native=MultinomialNB 239 | ): 240 | """Stub for DF wrapper of class ``MultinomialNB``""" 241 | 242 | 243 | class ComplementNBDF(PartialFitClassifierWrapperDF[ComplementNB], native=ComplementNB): 244 | """Stub for DF wrapper of class ``ComplementNB``""" 245 | 246 | 247 | class BernoulliNBDF(PartialFitClassifierWrapperDF[BernoulliNB], native=BernoulliNB): 248 | """Stub for DF wrapper of class ``BernoulliNB``""" 249 | 250 | 251 | # 252 | # calibration 253 | # 254 | 255 | 256 | class CalibratedClassifierCVDF( 257 | MetaClassifierWrapperDF[CalibratedClassifierCV], native=CalibratedClassifierCV 258 | ): 259 | """Stub for DF wrapper of class ``CalibratedClassifierCV``""" 260 | 261 | 262 | # 263 | # SVM 264 | # 265 | 266 | 267 | class SVCDF(ClassifierWrapperDF[SVC], native=SVC): 268 | """Stub for DF wrapper of class ``SVC``""" 269 | 270 | 271 | class NuSVCDF(ClassifierWrapperDF[NuSVC], native=NuSVC): 272 | """Stub for DF wrapper of class ``NuSVC``""" 273 | 274 | 275 | class LinearSVCDF(ClassifierWrapperDF[LinearSVC], native=LinearSVC): 276 | """Stub for DF wrapper of class ``LinearSVC``""" 277 | 278 | 279 | # 280 | # gaussian process 281 | # 282 | 283 | 284 | class GaussianProcessClassifierDF( 285 | ClassifierWrapperDF[GaussianProcessClassifier], native=GaussianProcessClassifier 286 | ): 287 | """Stub for DF wrapper of class ``GaussianProcessClassifier``""" 288 | 289 | 290 | # 291 | # linear model 292 | # 293 | 294 | 295 | class LogisticRegressionDF( 296 | ClassifierWrapperDF[LogisticRegression], native=LogisticRegression 297 | ): 298 | """Stub for DF wrapper of class ``LogisticRegression``""" 299 | 300 | 301 | class LogisticRegressionCVDF( 302 | ClassifierWrapperDF[LogisticRegressionCV], native=LogisticRegressionCV 303 | ): 304 | """Stub for DF wrapper of class ``LogisticRegressionCV``""" 305 | 306 | 307 | class PassiveAggressiveClassifierDF( 308 | PartialFitClassifierWrapperDF[PassiveAggressiveClassifier], 309 | native=PassiveAggressiveClassifier, 310 | ): 311 | """Stub for DF wrapper of class ``PassiveAggressiveClassifier``""" 312 | 313 | 314 | class PerceptronDF(PartialFitClassifierWrapperDF[Perceptron], native=Perceptron): 315 | """Stub for DF wrapper of class ``Perceptron``""" 316 | 317 | 318 | class SGDClassifierDF( 319 | PartialFitClassifierWrapperDF[SGDClassifier], native=SGDClassifier 320 | ): 321 | """Stub for DF wrapper of class ``SGDClassifier``""" 322 | 323 | 324 | class RidgeClassifierDF(ClassifierWrapperDF[RidgeClassifier], native=RidgeClassifier): 325 | """Stub for DF wrapper of class ``RidgeClassifier``""" 326 | 327 | 328 | class RidgeClassifierCVDF( 329 | ClassifierWrapperDF[RidgeClassifierCV], native=RidgeClassifierCV 330 | ): 331 | """Stub for DF wrapper of class ``RidgeClassifierCV``""" 332 | 333 | 334 | # 335 | # semi-supervised 336 | # 337 | 338 | 339 | class LabelPropagationDF( 340 | ClassifierWrapperDF[LabelPropagation], native=LabelPropagation 341 | ): 342 | """Stub for 
DF wrapper of class ``LabelPropagation``""" 343 | 344 | 345 | class LabelSpreadingDF(ClassifierWrapperDF[LabelSpreading], native=LabelSpreading): 346 | """Stub for DF wrapper of class ``LabelSpreading``""" 347 | 348 | 349 | # 350 | # multi-class 351 | # 352 | 353 | 354 | class OneVsRestClassifierDF( 355 | MetaClassifierWrapperDF[OneVsRestClassifier], native=OneVsRestClassifier 356 | ): 357 | """Stub for DF wrapper of class ``OneVsRestClassifier``""" 358 | 359 | 360 | class OneVsOneClassifierDF( 361 | ClassifierWrapperDF[OneVsOneClassifier], 362 | MetaEstimatorWrapperDF[OneVsOneClassifier], 363 | native=OneVsOneClassifier, 364 | ): 365 | """Stub for DF wrapper of class ``OneVsOneClassifier``""" 366 | 367 | 368 | class OutputCodeClassifierDF( 369 | ClassifierWrapperDF[OutputCodeClassifier], 370 | MetaEstimatorWrapperDF[OutputCodeClassifier], 371 | native=OutputCodeClassifier, 372 | ): 373 | """Stub for DF wrapper of class ``OutputCodeClassifier``""" 374 | 375 | 376 | # 377 | # multi-output 378 | # 379 | 380 | 381 | class MultiOutputClassifierDF( 382 | MultiOutputClassifierWrapperDF, native=MultiOutputClassifier 383 | ): 384 | """Stub for DF wrapper of class ``MultiOutputClassifier``""" 385 | 386 | 387 | # 388 | # chaining 389 | # 390 | 391 | 392 | class ClassifierChainDF(ClassifierChainWrapperDF, native=ClassifierChain): 393 | """Stub for DF wrapper of class ``ClassifierChain``""" 394 | 395 | 396 | # 397 | # neural network 398 | # 399 | 400 | 401 | class MLPClassifierDF( 402 | PartialFitClassifierWrapperDF[MLPClassifier], native=MLPClassifier 403 | ): 404 | """Stub for DF wrapper of class ``MLPClassifier``""" 405 | 406 | 407 | # 408 | # validate __all__ 409 | # 410 | 411 | __tracker.validate() 412 | 413 | 414 | # 415 | # validate that __all__ comprises all symbols ending in "DF", and no others 416 | # 417 | 418 | __estimators = { 419 | sym 420 | for sym in dir() 421 | if sym.endswith("DF") 422 | and sym not in __imported_estimators 423 | and not sym.startswith("_") 424 | } 425 | if __estimators != set(__all__): 426 | raise RuntimeError( 427 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 428 | f"{__estimators}" 429 | ) 430 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import StackingClassifier 8 | from sklearn.naive_bayes import CategoricalNB 9 | 10 | from pytools.api import AllTracker 11 | 12 | from ..wrapper.stacking import StackingClassifierWrapperDF 13 | from .wrapper import PartialFitClassifierWrapperDF 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["CategoricalNBDF", "StackingClassifierDF"] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | 22 | # 23 | # Ensure all symbols introduced below are included in __all__ 24 | # 25 | 26 | __tracker = AllTracker(globals()) 27 | 28 | 29 | # 30 | # Class definitions 31 | # 32 | 33 | 34 | # 35 | # naive bayes 36 | # 37 | 38 | 39 | class CategoricalNBDF( 40 | PartialFitClassifierWrapperDF[CategoricalNB], native=CategoricalNB 41 | ): 42 | """Stub for DF wrapper of class ``CategoricalNB``""" 43 | 44 | 45 | class StackingClassifierDF( 46 | StackingClassifierWrapperDF[StackingClassifier], native=StackingClassifier 47 | ): 
48 | """Stub for DF wrapper of class ``StackingClassifier``""" 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | 57 | 58 | # 59 | # validate that __all__ comprises all symbols ending in "DF", and no others 60 | # 61 | 62 | __estimators = { 63 | sym 64 | for sym in dir() 65 | if sym.endswith("DF") 66 | and sym not in __imported_estimators 67 | and not sym.startswith("_") 68 | } 69 | if __estimators != set(__all__): 70 | raise RuntimeError( 71 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 72 | f"{__estimators}" 73 | ) 74 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v0_23.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 0.23 onwards 4 | """ 5 | 6 | import logging 7 | from typing import List 8 | 9 | from pytools.api import AllTracker 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | __all__: List[str] = [] 14 | 15 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 16 | 17 | 18 | # 19 | # Ensure all symbols introduced below are included in __all__ 20 | # 21 | 22 | __tracker = AllTracker(globals()) 23 | 24 | 25 | # 26 | # Class definitions 27 | # 28 | 29 | 30 | # todo: add classification implementations for sklearn 0.23 31 | 32 | 33 | __tracker.validate() 34 | 35 | # 36 | # validate that __all__ comprises all symbols ending in "DF", and no others 37 | # 38 | 39 | __estimators = { 40 | sym 41 | for sym in dir() 42 | if sym.endswith("DF") 43 | and sym not in __imported_estimators 44 | and not sym.startswith("_") 45 | } 46 | if __estimators != set(__all__): 47 | raise RuntimeError( 48 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 49 | f"{__estimators}" 50 | ) 51 | -------------------------------------------------------------------------------- /src/sklearndf/classification/_classification_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.classification` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import HistGradientBoostingClassifier 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper import ClassifierWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["HistGradientBoostingClassifierDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | 20 | # 21 | # Ensure all symbols introduced below are included in __all__ 22 | # 23 | 24 | __tracker = AllTracker(globals()) 25 | 26 | 27 | # 28 | # ensemble 29 | # 30 | 31 | 32 | class HistGradientBoostingClassifierDF( 33 | ClassifierWrapperDF[HistGradientBoostingClassifier], 34 | native=HistGradientBoostingClassifier, 35 | ): 36 | """Stub for DF wrapper of class ``HistGradientBoostingClassifier``""" 37 | 38 | 39 | # 40 | # validate __all__ 41 | # 42 | 43 | __tracker.validate() 44 | 45 | 46 | # 47 | # validate that __all__ comprises all symbols ending in "DF", and no others 48 | # 49 | 50 | __estimators = { 51 | sym 52 | for sym in dir() 53 | if sym.endswith("DF") 54 | and sym not in __imported_estimators 55 | and not sym.startswith("_") 56 | } 57 | if __estimators != set(__all__): 58 | raise RuntimeError( 59 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 60 
| f"{__estimators}" 61 | ) 62 | -------------------------------------------------------------------------------- /src/sklearndf/classification/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party classifiers that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/classification/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification.extra` 3 | """ 4 | import logging 5 | 6 | from sklearn.base import ClassifierMixin 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ...wrapper import ClassifierWrapperDF, MissingEstimator 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = ["LGBMClassifierDF", "XGBClassifierDF"] 15 | 16 | try: 17 | # import lightgbm classes only if installed 18 | from lightgbm.sklearn import LGBMClassifier 19 | except ImportError: 20 | 21 | class LGBMClassifier( # type: ignore 22 | MissingEstimator, 23 | ClassifierMixin, # type: ignore 24 | ): 25 | """Mock-up for missing estimator.""" 26 | 27 | 28 | try: 29 | # import xgboost classes only if installed 30 | from xgboost import XGBClassifier 31 | except ImportError: 32 | 33 | class XGBClassifier( # type: ignore 34 | MissingEstimator, 35 | ClassifierMixin, # type: ignore 36 | ): 37 | """Mock-up for missing estimator.""" 38 | 39 | 40 | # 41 | # Ensure all symbols introduced below are included in __all__ 42 | # 43 | 44 | __tracker = AllTracker(globals()) 45 | 46 | 47 | # 48 | # Class definitions 49 | # 50 | 51 | 52 | class LGBMClassifierDF(ClassifierWrapperDF[LGBMClassifier], native=LGBMClassifier): 53 | """Stub for DF wrapper of class ``LGBMClassifierDF``""" 54 | 55 | 56 | class XGBClassifierDF(ClassifierWrapperDF[XGBClassifier], native=XGBClassifier): 57 | """Stub for DF wrapper of class ``XGBClassifierDF``""" 58 | 59 | 60 | # 61 | # validate that __all__ 62 | # 63 | 64 | __tracker.validate() 65 | -------------------------------------------------------------------------------- /src/sklearndf/classification/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` classifiers, providing enhanced support for data 3 | frames. 
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/classification/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.classification.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Generic, List, Optional, Sequence, TypeVar, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from sklearn.base import ClassifierMixin 12 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 13 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 14 | 15 | from pytools.api import AllTracker 16 | 17 | from ...transformation.wrapper import NComponentsDimensionalityReductionWrapperDF 18 | from ...wrapper import ClassifierWrapperDF, MetaEstimatorWrapperDF 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | __all__ = [ 23 | "ClassifierChainWrapperDF", 24 | "LinearDiscriminantAnalysisWrapperDF", 25 | "MetaClassifierWrapperDF", 26 | "MultiOutputClassifierWrapperDF", 27 | "PartialFitClassifierWrapperDF", 28 | ] 29 | 30 | # 31 | # Type variables 32 | # 33 | 34 | T_PartialFitClassifierWrapperDF = TypeVar( 35 | "T_PartialFitClassifierWrapperDF", 36 | bound="PartialFitClassifierWrapperDF[ClassifierMixin]", 37 | ) 38 | T_NativeClassifier = TypeVar("T_NativeClassifier", bound=ClassifierMixin) 39 | 40 | 41 | # 42 | # Ensure all symbols introduced below are included in __all__ 43 | # 44 | 45 | __tracker = AllTracker(globals()) 46 | 47 | 48 | # 49 | # Wrapper classes 50 | # 51 | 52 | 53 | class LinearDiscriminantAnalysisWrapperDF( 54 | ClassifierWrapperDF[LinearDiscriminantAnalysis], 55 | NComponentsDimensionalityReductionWrapperDF[LinearDiscriminantAnalysis], 56 | metaclass=ABCMeta, 57 | ): 58 | """ 59 | DF wrapper for 60 | :class:`sklearn.discriminant_analysis.LinearDiscriminantAnalysis`. 61 | """ 62 | 63 | pass 64 | 65 | 66 | class MetaClassifierWrapperDF( 67 | ClassifierWrapperDF[T_NativeClassifier], 68 | MetaEstimatorWrapperDF[T_NativeClassifier], 69 | Generic[T_NativeClassifier], 70 | metaclass=ABCMeta, 71 | ): 72 | """ 73 | Abstract base class of DF wrappers for classifiers implementing 74 | :class:`sklearn.base.MetaEstimatorMixin`. 75 | """ 76 | 77 | pass 78 | 79 | 80 | class PartialFitClassifierWrapperDF( 81 | ClassifierWrapperDF[T_NativeClassifier], 82 | Generic[T_NativeClassifier], 83 | metaclass=ABCMeta, 84 | ): 85 | """ 86 | Abstract base class of DF wrappers for classifiers implementing 87 | method ``partial_fit()``. 88 | """ 89 | 90 | # noinspection PyPep8Naming 91 | def partial_fit( 92 | self: T_PartialFitClassifierWrapperDF, 93 | X: Union[pd.Series, pd.DataFrame], 94 | y: Union[pd.Series, pd.DataFrame], 95 | classes: Optional[Sequence[Any]] = None, 96 | sample_weight: Optional[pd.Series] = None, 97 | ) -> T_PartialFitClassifierWrapperDF: 98 | """ 99 | Perform incremental fit on a batch of samples. 100 | 101 | This method is meant to be called multiple times for subsets of training 102 | data which, e.g., couldn't fit in the required memory in full. It can be 103 | also used for online learning. 
104 | 105 | :param X: data frame with observations as rows and features as columns 106 | :param y: a series or data frame with one or more outputs per observation 107 | :param classes: all classes present across all calls to ``partial_fit``; 108 | only required for the first call of this method 109 | :param sample_weight: optional weights applied to individual samples 110 | :return: ``self`` 111 | """ 112 | X, y = self._validate_parameter_types(X, y) 113 | self._partial_fit(X, y, classes=classes, sample_weight=sample_weight) 114 | 115 | return self 116 | 117 | # noinspection PyPep8Naming 118 | def _partial_fit( 119 | self: T_PartialFitClassifierWrapperDF, 120 | X: pd.DataFrame, 121 | y: Union[pd.Series, pd.DataFrame], 122 | **partial_fit_params: Optional[Any], 123 | ) -> T_PartialFitClassifierWrapperDF: 124 | return cast( 125 | T_PartialFitClassifierWrapperDF, 126 | self._native_estimator.partial_fit( 127 | self._prepare_X_for_delegate(X), 128 | self._prepare_y_for_delegate(y), 129 | **{ 130 | arg: value 131 | for arg, value in partial_fit_params.items() 132 | if value is not None 133 | }, 134 | ), 135 | ) 136 | 137 | 138 | class MultiOutputClassifierWrapperDF( 139 | MetaClassifierWrapperDF[MultiOutputClassifier], 140 | PartialFitClassifierWrapperDF[MultiOutputClassifier], 141 | metaclass=ABCMeta, 142 | ): 143 | """ 144 | DF wrapper for :class:`sklearn.multioutput.MultiOutputClassifier`. 145 | """ 146 | 147 | # noinspection PyPep8Naming 148 | def _prediction_with_class_labels( 149 | self, 150 | X: pd.DataFrame, 151 | prediction: Union[ 152 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any] 153 | ], 154 | classes: Optional[Sequence[Any]] = None, 155 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]: 156 | # if we have a multi-output classifier, prediction of probabilities 157 | # yields a list of NumPy arrays 158 | if not isinstance(prediction, list): 159 | raise ValueError( 160 | "prediction of multi-output classifier expected to be a list of NumPy " 161 | f"arrays, but got type {type(prediction)}" 162 | ) 163 | 164 | delegate_estimator = self.native_estimator 165 | 166 | # store the super() object as this is not available within a generator 167 | _super = cast(ClassifierWrapperDF[MultiOutputClassifier], super()) 168 | 169 | # estimators attribute of abstract class MultiOutputEstimator 170 | # usually the delegate estimator will provide a list of estimators used 171 | # to predict each output. If present, use these estimators to get 172 | # individual class labels for each output; otherwise we cannot assign class 173 | # labels 174 | estimators = getattr(delegate_estimator, "estimators_", None) 175 | if estimators is None: 176 | return [ 177 | _super._prediction_with_class_labels(X=X, prediction=output) 178 | for output in prediction 179 | ] 180 | else: 181 | return [ 182 | _super._prediction_with_class_labels( 183 | X=X, prediction=output, classes=getattr(estimator, "classes_", None) 184 | ) 185 | for estimator, output in zip(estimators, prediction) 186 | ] 187 | 188 | 189 | class ClassifierChainWrapperDF( 190 | MetaEstimatorWrapperDF[ClassifierChain], 191 | ClassifierWrapperDF[ClassifierChain], 192 | metaclass=ABCMeta, 193 | ): 194 | """ 195 | DF wrapper for :class:`sklearn.multioutput.ClassifierChain`. 
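    Note that class labels for the chained outputs are currently not inferred from
    the underlying estimators; predictions are labelled with the integer output
    positions ``0, ..., n_outputs_ - 1`` instead (see
    ``_prediction_with_class_labels`` below).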
196 | """ 197 | 198 | # noinspection PyPep8Naming 199 | def _prediction_with_class_labels( 200 | self, 201 | X: pd.DataFrame, 202 | prediction: Union[ 203 | pd.Series, pd.DataFrame, List[npt.NDArray[Any]], npt.NDArray[Any] 204 | ], 205 | classes: Optional[Sequence[Any]] = None, 206 | ) -> Union[pd.Series, pd.DataFrame, List[pd.DataFrame]]: 207 | # todo: infer actual class names 208 | return super()._prediction_with_class_labels( 209 | X, prediction, classes=range(self.n_outputs_) 210 | ) 211 | 212 | 213 | # 214 | # Validate __all__ 215 | # 216 | 217 | __tracker.validate() 218 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of `scikit-learn` clusterers with enhanced support for data 3 | frames. 4 | """ 5 | 6 | from .. import __sklearn_1_1__, __sklearn_1_3__, __sklearn_version__ 7 | from ._clustering import * 8 | 9 | if __sklearn_version__ >= __sklearn_1_1__: 10 | from ._clustering_v1_1 import * 11 | 12 | if __sklearn_version__ >= __sklearn_1_3__: 13 | from ._clustering_v1_3 import * 14 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.clustering` 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import ( 7 | DBSCAN, 8 | OPTICS, 9 | AffinityPropagation, 10 | AgglomerativeClustering, 11 | Birch, 12 | FeatureAgglomeration, 13 | KMeans, 14 | MeanShift, 15 | MiniBatchKMeans, 16 | SpectralClustering, 17 | ) 18 | 19 | from pytools.api import AllTracker 20 | 21 | from ..wrapper import ClusterWrapperDF 22 | from .wrapper import FeatureAgglomerationWrapperDF, KMeansBaseWrapperDF 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | __all__ = [ 27 | "AffinityPropagationDF", 28 | "AgglomerativeClusteringDF", 29 | "BirchDF", 30 | "DBSCANDF", 31 | "FeatureAgglomerationDF", 32 | "KMeansDF", 33 | "MeanShiftDF", 34 | "MiniBatchKMeansDF", 35 | "OPTICSDF", 36 | "SpectralClusteringDF", 37 | ] 38 | 39 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 40 | 41 | 42 | # 43 | # Ensure all symbols introduced below are included in __all__ 44 | # 45 | 46 | __tracker = AllTracker(globals()) 47 | 48 | 49 | # 50 | # Class definitions 51 | # 52 | 53 | 54 | class AffinityPropagationDF( 55 | ClusterWrapperDF[AffinityPropagation], native=AffinityPropagation 56 | ): 57 | """Stub for DF wrapper of class ``AffinityPropagation``""" 58 | 59 | 60 | class AgglomerativeClusteringDF( 61 | ClusterWrapperDF[AgglomerativeClustering], native=AgglomerativeClustering 62 | ): 63 | """Stub for DF wrapper of class ``AgglomerativeClustering``""" 64 | 65 | 66 | class BirchDF(ClusterWrapperDF[Birch], native=Birch): 67 | """Stub for DF wrapper of class ``Birch``""" 68 | 69 | 70 | class DBSCANDF(ClusterWrapperDF[DBSCAN], native=DBSCAN): 71 | """Stub for DF wrapper of class ``DBSCAN``""" 72 | 73 | 74 | class KMeansDF(KMeansBaseWrapperDF[KMeans], native=KMeans): 75 | """Stub for DF wrapper of class ``KMeans``""" 76 | 77 | 78 | class MiniBatchKMeansDF(KMeansBaseWrapperDF[MiniBatchKMeans], native=MiniBatchKMeans): 79 | """Stub for DF wrapper of class ``MiniBatchKMeans``""" 80 | 81 | 82 | class MeanShiftDF(ClusterWrapperDF[MeanShift], native=MeanShift): 83 | """Stub for DF wrapper of class ``MeanShift``""" 84 | 85 | 86 | class 
OPTICSDF(ClusterWrapperDF[OPTICS], native=OPTICS): 87 | """Stub for DF wrapper of class ``OPTICS``""" 88 | 89 | 90 | class SpectralClusteringDF( 91 | ClusterWrapperDF[SpectralClustering], native=SpectralClustering 92 | ): 93 | """Stub for DF wrapper of class ``SpectralClustering``""" 94 | 95 | 96 | class FeatureAgglomerationDF( 97 | FeatureAgglomerationWrapperDF, native=FeatureAgglomeration 98 | ): 99 | """Stub for DF wrapper of class ``FeatureAgglomeration``""" 100 | 101 | 102 | # 103 | # Validate __all__ 104 | # 105 | 106 | __tracker.validate() 107 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering_v1_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.1 onwards 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import BisectingKMeans 7 | 8 | from pytools.api import AllTracker 9 | 10 | from .wrapper import KMeansBaseWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = [ 15 | "BisectingKMeansDF", 16 | ] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # Class definitions 30 | # 31 | 32 | 33 | class BisectingKMeansDF(KMeansBaseWrapperDF[BisectingKMeans], native=BisectingKMeans): 34 | """Stub for DF wrapper of class ``BisectingKMeans``""" 35 | 36 | 37 | # 38 | # Validate __all__ 39 | # 40 | 41 | __tracker.validate() 42 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/_clustering_v1_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.clustering` for sklearn 1.3 onwards 3 | """ 4 | import logging 5 | 6 | from sklearn.cluster import HDBSCAN 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ..wrapper import ClusterWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = [ 15 | "HDBSCANDF", 16 | ] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # Class definitions 30 | # 31 | 32 | 33 | class HDBSCANDF(ClusterWrapperDF[HDBSCAN], native=HDBSCAN): 34 | """Stub for DF wrapper of class ``HDBSCAN``""" 35 | 36 | 37 | # 38 | # Validate __all__ 39 | # 40 | 41 | __tracker.validate() 42 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` clusterers, providing enhanced support for data 3 | frames.
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/clustering/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.clustering.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Generic, TypeVar 8 | 9 | import pandas as pd 10 | from sklearn.cluster import FeatureAgglomeration, KMeans, MiniBatchKMeans 11 | 12 | from pytools.api import AllTracker 13 | from pytools.fit import fitted_only 14 | 15 | from sklearndf.transformation.wrapper import ColumnPreservingTransformerWrapperDF 16 | from sklearndf.wrapper import ClusterWrapperDF 17 | 18 | log = logging.getLogger(__name__) 19 | 20 | __all__ = [ 21 | "KMeansBaseWrapperDF", 22 | "FeatureAgglomerationWrapperDF", 23 | ] 24 | 25 | # 26 | # Type variables 27 | # 28 | 29 | T_NativeKMeans = TypeVar("T_NativeKMeans", KMeans, MiniBatchKMeans) 30 | 31 | 32 | # 33 | # Ensure all symbols introduced below are included in __all__ 34 | # 35 | 36 | __tracker = AllTracker(globals()) 37 | 38 | 39 | # 40 | # Wrapper classes 41 | # 42 | 43 | 44 | # noinspection PyPep8Naming 45 | class KMeansBaseWrapperDF( 46 | ClusterWrapperDF[T_NativeKMeans], Generic[T_NativeKMeans], metaclass=ABCMeta 47 | ): 48 | """ 49 | DF wrapper for KMeans-like algorithms, e.g., :class:`sklearn.cluster.KMeans`. 50 | """ 51 | 52 | #: the name of the index representing clusters 53 | IDX_CLUSTER = "cluster" 54 | 55 | @property 56 | @fitted_only(not_fitted_error=AttributeError) 57 | def cluster_centers_(self) -> pd.DataFrame: 58 | """ 59 | The cluster centers as a data frame, with clusters as rows and feature values 60 | as columns. 61 | 62 | :raises AttributeError: the clusterer is not fitted 63 | """ 64 | 65 | raw_cluster_centers = self._native_estimator.cluster_centers_ 66 | return pd.DataFrame( 67 | raw_cluster_centers, 68 | columns=self.feature_names_in_, 69 | index=pd.RangeIndex( 70 | len(raw_cluster_centers), name=KMeansBaseWrapperDF.IDX_CLUSTER 71 | ), 72 | ) 73 | 74 | 75 | class FeatureAgglomerationWrapperDF( 76 | ClusterWrapperDF[FeatureAgglomeration], 77 | ColumnPreservingTransformerWrapperDF[FeatureAgglomeration], 78 | metaclass=ABCMeta, 79 | ): 80 | """ 81 | DF wrapper for FeatureAgglomeration that combines clusterer and transformer. 82 | """ 83 | 84 | pass 85 | 86 | 87 | # 88 | # Validate __all__ 89 | # 90 | 91 | __tracker.validate() 92 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` pipelines with enhanced support for data 3 | frames. 
4 | """ 5 | from ._learner_pipeline import * 6 | from ._pipeline import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.pipeline` 3 | """ 4 | 5 | import logging 6 | 7 | from sklearn.pipeline import Pipeline 8 | 9 | from pytools.api import AllTracker 10 | 11 | from .wrapper import FeatureUnionSparseFrames, FeatureUnionWrapperDF, PipelineWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["PipelineDF", "FeatureUnionDF"] 16 | 17 | 18 | # 19 | # Ensure all symbols introduced below are included in __all__ 20 | # 21 | 22 | __tracker = AllTracker(globals()) 23 | 24 | 25 | # 26 | # Class definitions 27 | # 28 | 29 | 30 | class PipelineDF(PipelineWrapperDF, native=Pipeline): 31 | """Stub for DF wrapper of class ``Pipeline``""" 32 | 33 | 34 | class FeatureUnionDF(FeatureUnionWrapperDF, native=FeatureUnionSparseFrames): 35 | """Stub for DF wrapper of class ``FeatureUnion``""" 36 | 37 | 38 | # 39 | # Validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes to enhance the functionality of native pipeline classes conforming with 3 | the `scikit-learn` API. 4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/pipeline/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.pipeline.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Dict, Iterator, List, Sequence, Tuple, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from pandas.core.arrays import ExtensionArray 12 | from scipy import sparse 13 | from sklearn.pipeline import FeatureUnion, Pipeline 14 | from sklearn.preprocessing import FunctionTransformer 15 | 16 | from pytools.api import AllTracker 17 | 18 | from ..._util import hstack_frames 19 | from sklearndf import EstimatorDF, TransformerDF 20 | from sklearndf.wrapper import ( 21 | ClassifierWrapperDF, 22 | RegressorWrapperDF, 23 | TransformerWrapperDF, 24 | ) 25 | 26 | log = logging.getLogger(__name__) 27 | 28 | __all__ = [ 29 | "FeatureUnionSparseFrames", 30 | "FeatureUnionWrapperDF", 31 | "PipelineWrapperDF", 32 | ] 33 | 34 | 35 | # 36 | # Ensure all symbols introduced below are included in __all__ 37 | # 38 | 39 | __tracker = AllTracker(globals()) 40 | 41 | 42 | # 43 | # Class definitions 44 | # 45 | 46 | 47 | class PipelineWrapperDF( 48 | ClassifierWrapperDF[Pipeline], 49 | RegressorWrapperDF[Pipeline], 50 | TransformerWrapperDF[Pipeline], 51 | metaclass=ABCMeta, 52 | ): 53 | """ 54 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.Pipeline`. 55 | """ 56 | 57 | __native_base_class__ = Pipeline 58 | 59 | #: Placeholder that can be used in place of an estimator to designate a pipeline 60 | #: step that preserves the original ingoing data. 
61 | PASSTHROUGH = "passthrough" 62 | 63 | def _validate_delegate_estimator(self) -> None: 64 | # ensure that all steps support data frames, and that all except the last 65 | # step are data frame transformers 66 | 67 | steps = self.steps 68 | 69 | if len(steps) == 0: 70 | return 71 | 72 | for name, transformer in steps[:-1]: 73 | if not ( 74 | self._is_passthrough(transformer) 75 | or isinstance(transformer, TransformerDF) 76 | ): 77 | raise ValueError( 78 | f"expected step {name!r} to be a {TransformerDF.__name__}, " 79 | f"or {PipelineWrapperDF.PASSTHROUGH}, but found an instance of " 80 | f"{type(transformer).__name__}" 81 | ) 82 | 83 | final_step = steps[-1] 84 | final_estimator = final_step[1] 85 | if not ( 86 | self._is_passthrough(final_estimator) 87 | or isinstance(final_estimator, EstimatorDF) 88 | ): 89 | raise ValueError( 90 | f"expected final step {final_step[0]!r} to be an " 91 | f"{EstimatorDF.__name__} or {PipelineWrapperDF.PASSTHROUGH}, " 92 | f"but found an instance of {type(final_estimator).__name__}" 93 | ) 94 | 95 | @property 96 | def steps(self) -> List[Tuple[str, EstimatorDF]]: 97 | """ 98 | The ``steps`` attribute of the underlying :class:`~sklearn.pipeline.Pipeline`. 99 | 100 | List of (name, transformer) tuples (transformers implement fit/transform). 101 | """ 102 | return cast(List[Tuple[str, EstimatorDF]], self.native_estimator.steps) 103 | 104 | def __len__(self) -> int: 105 | """The number of steps of the pipeline.""" 106 | return len(self.native_estimator.steps) 107 | 108 | def __getitem__(self, ind: Union[slice, int, str]) -> EstimatorDF: 109 | """ 110 | Return a sub-pipeline or a single estimator in the pipeline 111 | 112 | Indexing with an integer will return an estimator; using a slice 113 | returns another Pipeline instance which copies a slice of this 114 | Pipeline. This copy is shallow: modifying (or fitting) estimators in 115 | the sub-pipeline will affect the larger pipeline and vice-versa. 116 | However, replacing a value in ``steps`` will not change a copy. 117 | """ 118 | 119 | if isinstance(ind, slice): 120 | base_pipeline = self.native_estimator 121 | if ind.step not in (1, None): 122 | raise ValueError("Pipeline slicing only supports a step of 1") 123 | 124 | return cast( 125 | EstimatorDF, 126 | self.__class__( 127 | steps=base_pipeline.steps[ind], 128 | memory=base_pipeline.memory, 129 | verbose=base_pipeline.verbose, 130 | ), 131 | ) 132 | else: 133 | return cast(EstimatorDF, self.native_estimator[ind]) 134 | 135 | @staticmethod 136 | def _is_passthrough(estimator: Union[EstimatorDF, str, None]) -> bool: 137 | # return True if the estimator is a "passthrough" (i.e. 
identity) transformer 138 | # in the pipeline 139 | return estimator is None or estimator == PipelineWrapperDF.PASSTHROUGH 140 | 141 | def _transformer_steps(self) -> Iterator[Tuple[str, TransformerDF]]: 142 | # make an iterator of all transform steps, i.e., excluding the final step 143 | # in case it is not a transformer 144 | # excludes steps whose transformer is ``None`` or ``"passthrough"`` 145 | 146 | def _iter_not_none( 147 | transformer_steps: Sequence[Tuple[str, EstimatorDF]] 148 | ) -> Iterator[Tuple[str, TransformerDF]]: 149 | return ( 150 | (name, cast(TransformerDF, transformer)) 151 | for name, transformer in transformer_steps 152 | if not self._is_passthrough(transformer) 153 | ) 154 | 155 | steps = self.steps 156 | 157 | if len(steps) == 0: 158 | return iter([]) 159 | 160 | final_estimator = steps[-1][1] 161 | 162 | if isinstance(final_estimator, TransformerDF): 163 | return _iter_not_none(steps) 164 | else: 165 | return _iter_not_none(steps[:-1]) 166 | 167 | def _get_features_original(self) -> pd.Series: 168 | col_mappings = [ 169 | df_transformer.feature_names_original_ 170 | for _, df_transformer in self._transformer_steps() 171 | ] 172 | 173 | _features_out: pd.Index 174 | _features_original: Union[npt.NDArray[Any], ExtensionArray] 175 | 176 | if len(col_mappings) == 0: 177 | _features_out = self.feature_names_in_ 178 | _features_original = _features_out.values 179 | else: 180 | _features_out = col_mappings[-1].index 181 | _features_original = col_mappings[-1].values 182 | 183 | # iterate backwards starting from the penultimate item 184 | for preceding_out_to_original_mapping in col_mappings[-2::-1]: 185 | # join the original columns of my current transformer on the out columns 186 | # in the preceding transformer, then repeat 187 | if not all( 188 | feature in preceding_out_to_original_mapping 189 | for feature in _features_original 190 | ): 191 | unknown_features = set(_features_original) - set( 192 | preceding_out_to_original_mapping 193 | ) 194 | raise KeyError( 195 | f"unknown features encountered while tracing original " 196 | f"features along pipeline: {unknown_features}" 197 | ) 198 | _features_original = preceding_out_to_original_mapping.loc[ 199 | _features_original 200 | ].values 201 | 202 | return pd.Series(index=_features_out, data=_features_original) 203 | 204 | def _get_features_out(self) -> pd.Index: 205 | for _, transformer in reversed(self.steps): 206 | if isinstance(transformer, TransformerDF): 207 | return transformer.feature_names_out_ 208 | 209 | return self.feature_names_in_ 210 | 211 | @property 212 | def _estimator_type(self) -> str: 213 | # noinspection PyProtectedMember 214 | return cast(str, self.native_estimator._estimator_type) 215 | 216 | def _more_tags(self) -> Dict[str, Any]: 217 | return cast( 218 | Dict[str, Any], getattr(self.native_estimator, "_more_tags", lambda: {})() 219 | ) 220 | 221 | 222 | class FeatureUnionSparseFrames( 223 | FeatureUnion, # type:ignore 224 | ): 225 | """ 226 | FeatureUnion transformer that returns sparse data frames instead of arrays if one or 227 | more of its transformers return a sparse data frame. 
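    If all transformer outputs are data frames, they are stacked horizontally with
    :func:`~sklearndf._util.hstack_frames`, prefixing each output column with the
    name of the transformer that produced it (``<transformer name>__<column>``).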
228 | """ 229 | 230 | # noinspection PyPep8Naming 231 | def _hstack( 232 | self, Xs: List[Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]] 233 | ) -> Union[npt.NDArray[Any], sparse.spmatrix, pd.DataFrame]: 234 | stacked_frames = hstack_frames( 235 | Xs, prefixes=[name for name, _ in self.transformer_list] 236 | ) 237 | if stacked_frames is None: 238 | return super()._hstack(Xs) 239 | else: 240 | return stacked_frames 241 | 242 | 243 | class FeatureUnionWrapperDF( 244 | TransformerWrapperDF[FeatureUnionSparseFrames], metaclass=ABCMeta 245 | ): 246 | """ 247 | DF wrapper for `scikit-learn` class :class:`~sklearn.pipeline.FeatureUnion`. 248 | """ 249 | 250 | DROP = "drop" 251 | PASSTHROUGH = "passthrough" 252 | 253 | __native_base_class__ = FeatureUnionSparseFrames 254 | 255 | @staticmethod 256 | def _prepend_features_out(features_out: pd.Index, name_prefix: str) -> pd.Index: 257 | return pd.Index(data=f"{name_prefix}__" + features_out.astype(str)) 258 | 259 | def _get_features_original(self) -> pd.Series: 260 | # concatenate output-to-input mappings from all included transformers other than 261 | # ones stated as ``None`` or ``"drop"`` or any other string 262 | 263 | # prepend the name of the transformer so the resulting feature name is 264 | # `__ 265 | 266 | def _prepend_features_original( 267 | features_original: pd.Series, name_prefix: str 268 | ) -> pd.Series: 269 | return pd.Series( 270 | data=features_original.values, 271 | index=self._prepend_features_out( 272 | features_out=features_original.index, name_prefix=name_prefix 273 | ), 274 | ) 275 | 276 | # noinspection PyProtectedMember 277 | return pd.concat( 278 | objs=( 279 | _prepend_features_original( 280 | features_original=transformer.feature_names_original_, 281 | name_prefix=name, 282 | ) 283 | for name, transformer, _ in self.native_estimator._iter() 284 | ) 285 | ) 286 | 287 | def _get_features_out(self) -> pd.Index: 288 | # concatenate output columns from all included transformers other than 289 | # ones stated as ``None`` or ``"drop"`` or any other string 290 | 291 | # prepend the name of the transformer so the resulting feature name is 292 | # `__ 293 | 294 | name: str 295 | transformer: Union[TransformerDF, str, FunctionTransformer] 296 | 297 | indices = [ 298 | self._prepend_features_out( 299 | features_out=( 300 | self._get_features_in() 301 | if ( 302 | isinstance(transformer, FunctionTransformer) 303 | and transformer.func is None 304 | ) 305 | else cast(TransformerDF, transformer).feature_names_out_ 306 | ), 307 | name_prefix=name, 308 | ) 309 | for name, transformer in self.native_estimator.transformer_list 310 | if transformer != FeatureUnionWrapperDF.DROP 311 | ] 312 | 313 | if len(indices) == 0: 314 | return pd.Index() 315 | else: 316 | return indices[0].append(indices[1:]) 317 | 318 | 319 | # 320 | # Validate __all__ 321 | # 322 | 323 | __tracker.validate() 324 | -------------------------------------------------------------------------------- /src/sklearndf/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/src/sklearndf/py.typed -------------------------------------------------------------------------------- /src/sklearndf/regression/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` regressors with enhanced support for data 3 | frames. 
4 | """ 5 | from ._regression import * 6 | from ._regression_v0_22 import * 7 | from ._regression_v0_23 import * 8 | from ._regression_v1_0 import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import StackingRegressor 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper.stacking import StackingRegressorWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["StackingRegressorDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | 20 | # 21 | # Ensure all symbols introduced below are included in __all__ 22 | # 23 | 24 | __tracker = AllTracker(globals()) 25 | 26 | 27 | # 28 | # Class definitions 29 | # 30 | 31 | 32 | class StackingRegressorDF( 33 | StackingRegressorWrapperDF[StackingRegressor], native=StackingRegressor 34 | ): 35 | """Stub for DF wrapper of class ``StackingRegressor``""" 36 | 37 | 38 | # 39 | # validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | 44 | 45 | # 46 | # validate that __all__ comprises all symbols ending in "DF", and no others 47 | # 48 | 49 | 50 | __estimators = { 51 | sym 52 | for sym in dir() 53 | if sym.endswith("DF") 54 | and sym not in __imported_estimators 55 | and not sym.startswith("_") 56 | } 57 | if __estimators != set(__all__): 58 | raise RuntimeError( 59 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 60 | f"{__estimators}" 61 | ) 62 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v0_23.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 0.23 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.linear_model import GammaRegressor, PoissonRegressor, TweedieRegressor 8 | 9 | from pytools.api import AllTracker 10 | 11 | from ..wrapper import RegressorWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = [ 16 | "GammaRegressorDF", 17 | "PoissonRegressorDF", 18 | "TweedieRegressorDF", 19 | ] 20 | 21 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 22 | 23 | # 24 | # type variables 25 | # 26 | 27 | 28 | # 29 | # Ensure all symbols introduced below are included in __all__ 30 | # 31 | 32 | __tracker = AllTracker(globals()) 33 | 34 | 35 | # 36 | # Class definitions 37 | # 38 | 39 | 40 | class PoissonRegressorDF(RegressorWrapperDF[PoissonRegressor], native=PoissonRegressor): 41 | """Stub for DF wrapper of class ``PoissonRegressor``""" 42 | 43 | 44 | class GammaRegressorDF(RegressorWrapperDF[GammaRegressor], native=GammaRegressor): 45 | """Stub for DF wrapper of class ``GammaRegressor``""" 46 | 47 | 48 | class TweedieRegressorDF(RegressorWrapperDF[TweedieRegressor], native=TweedieRegressor): 49 | """Stub for DF wrapper of class ``TweedieRegressor``""" 50 | 51 | 52 | # 53 | # validate __all__ 54 | # 55 | 56 | __tracker.validate() 57 | 58 | 59 | # 60 | # validate that __all__ comprises all symbols ending in "DF", and no others 61 | # 62 | 63 | __estimators = { 64 | sym 65 | for sym in dir() 66 | if sym.endswith("DF") 67 | and sym not in __imported_estimators 68 | and not 
sym.startswith("_") 69 | } 70 | if __estimators != set(__all__): 71 | raise RuntimeError( 72 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 73 | f"{__estimators}" 74 | ) 75 | -------------------------------------------------------------------------------- /src/sklearndf/regression/_regression_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional implementation of :mod:`sklearndf.regression` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | import logging 6 | 7 | from sklearn.ensemble import HistGradientBoostingRegressor 8 | from sklearn.linear_model import QuantileRegressor 9 | 10 | from pytools.api import AllTracker 11 | 12 | from ..wrapper import RegressorWrapperDF 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | __all__ = ["HistGradientBoostingRegressorDF", "QuantileRegressorDF"] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | 21 | # 22 | # Ensure all symbols introduced below are included in __all__ 23 | # 24 | 25 | __tracker = AllTracker(globals()) 26 | 27 | 28 | # 29 | # ensemble 30 | # 31 | 32 | 33 | class HistGradientBoostingRegressorDF( 34 | RegressorWrapperDF[HistGradientBoostingRegressor], 35 | native=HistGradientBoostingRegressor, 36 | ): 37 | """Stub for DF wrapper of class ``HistGradientBoostingRegressor``""" 38 | 39 | 40 | # 41 | # linear model 42 | # 43 | 44 | 45 | class QuantileRegressorDF( 46 | RegressorWrapperDF[QuantileRegressor], native=QuantileRegressor 47 | ): 48 | """Stub for DF wrapper of class ``QuantileRegressor``""" 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | 57 | 58 | # 59 | # validate that __all__ comprises all symbols ending in "DF", and no others 60 | # 61 | 62 | __estimators = { 63 | sym 64 | for sym in dir() 65 | if sym.endswith("DF") 66 | and sym not in __imported_estimators 67 | and not sym.startswith("_") 68 | } 69 | if __estimators != set(__all__): 70 | raise RuntimeError( 71 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 72 | f"{__estimators}" 73 | ) 74 | -------------------------------------------------------------------------------- /src/sklearndf/regression/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party regressors that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 
7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/regression/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression.extra` 3 | """ 4 | import logging 5 | 6 | from sklearn.base import RegressorMixin 7 | 8 | from pytools.api import AllTracker 9 | 10 | from ...wrapper import MissingEstimator, RegressorWrapperDF 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | __all__ = ["LGBMRegressorDF", "XGBRegressorDF"] 15 | 16 | try: 17 | # import lightgbm classes only if installed 18 | from lightgbm.sklearn import LGBMRegressor 19 | 20 | except ImportError: 21 | 22 | class LGBMRegressor( # type: ignore 23 | MissingEstimator, 24 | RegressorMixin, # type: ignore 25 | ): 26 | """Mock-up for missing estimator.""" 27 | 28 | 29 | try: 30 | # import xgboost classes only if installed 31 | from xgboost import XGBRegressor 32 | 33 | except ImportError: 34 | 35 | class XGBRegressor( # type: ignore 36 | MissingEstimator, 37 | RegressorMixin, # type: ignore 38 | ): 39 | """Mock-up for missing estimator.""" 40 | 41 | 42 | # 43 | # Ensure all symbols introduced below are included in __all__ 44 | # 45 | 46 | __tracker = AllTracker(globals()) 47 | 48 | 49 | # 50 | # Class definitions 51 | # 52 | 53 | 54 | class LGBMRegressorDF(RegressorWrapperDF[LGBMRegressor], native=LGBMRegressor): 55 | """Stub for DF wrapper of class ``LGBMRegressorDF``""" 56 | 57 | 58 | class XGBRegressorDF(RegressorWrapperDF[XGBRegressor], native=XGBRegressor): 59 | """Stub for DF wrapper of class ``XGBRegressorDF``""" 60 | 61 | 62 | # 63 | # validate __all__ 64 | # 65 | 66 | __tracker.validate() 67 | -------------------------------------------------------------------------------- /src/sklearndf/regression/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` regressors, providing enhanced support for data 3 | frames. 
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/regression/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.regression.wrapper` 3 | """ 4 | 5 | import logging 6 | from abc import ABCMeta 7 | from typing import Any, Generic, Optional, TypeVar, Union, cast 8 | 9 | import numpy.typing as npt 10 | import pandas as pd 11 | from sklearn.base import RegressorMixin 12 | from sklearn.isotonic import IsotonicRegression 13 | from sklearn.multioutput import MultiOutputRegressor 14 | 15 | from pytools.api import AllTracker 16 | 17 | from ...transformation.wrapper import ( 18 | ColumnPreservingTransformerWrapperDF, 19 | NumpyTransformerWrapperDF, 20 | SingleColumnTransformerWrapperDF, 21 | ) 22 | from ...wrapper import MetaEstimatorWrapperDF, RegressorWrapperDF 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | __all__ = [ 27 | "IsotonicRegressionWrapperDF", 28 | "MetaRegressorWrapperDF", 29 | "RegressorTransformerWrapperDF", 30 | "PartialFitRegressorWrapperDF", 31 | "MultiOutputRegressorWrapperDF", 32 | ] 33 | 34 | 35 | # 36 | # type variables 37 | # 38 | 39 | T_PartialFitRegressorWrapperDF = TypeVar( 40 | "T_PartialFitRegressorWrapperDF", 41 | bound="PartialFitRegressorWrapperDF[RegressorMixin]", 42 | ) 43 | T_NativeRegressor = TypeVar("T_NativeRegressor", bound=RegressorMixin) 44 | 45 | 46 | # 47 | # Ensure all symbols introduced below are included in __all__ 48 | # 49 | 50 | __tracker = AllTracker(globals()) 51 | 52 | 53 | # 54 | # Class definitions 55 | # 56 | 57 | 58 | class MetaRegressorWrapperDF( 59 | MetaEstimatorWrapperDF[T_NativeRegressor], 60 | RegressorWrapperDF[T_NativeRegressor], 61 | Generic[T_NativeRegressor], 62 | metaclass=ABCMeta, 63 | ): 64 | """ 65 | Abstract base class of DF wrappers for regressors implementing 66 | :class:`sklearn.base.MetaEstimatorMixin`. 67 | """ 68 | 69 | pass 70 | 71 | 72 | class PartialFitRegressorWrapperDF( 73 | RegressorWrapperDF[T_NativeRegressor], 74 | Generic[T_NativeRegressor], 75 | metaclass=ABCMeta, 76 | ): 77 | """ 78 | Abstract base class of DF wrappers for regressors implementing 79 | method ``partial_fit()``. 80 | """ 81 | 82 | # noinspection PyPep8Naming 83 | def partial_fit( 84 | self: T_PartialFitRegressorWrapperDF, 85 | X: Union[pd.Series, pd.DataFrame], 86 | y: Union[pd.Series, pd.DataFrame], 87 | sample_weight: Optional[pd.Series] = None, 88 | ) -> T_PartialFitRegressorWrapperDF: 89 | """ 90 | Perform incremental fit on a batch of samples. 91 | 92 | This method is meant to be called multiple times for subsets of training 93 | data which, e.g., couldn't fit in the required memory in full. It can be 94 | also used for online learning. 
95 | 96 | :param X: data frame with observations as rows and features as columns 97 | :param y: a series or data frame with one or more outputs per observation 98 | :param sample_weight: optional weights applied to individual samples 99 | :return: ``self`` 100 | """ 101 | X, y = self._validate_parameter_types(X, y) 102 | self._partial_fit(X, y, sample_weight=sample_weight) 103 | 104 | return self 105 | 106 | # noinspection PyPep8Naming 107 | def _partial_fit( 108 | self: T_PartialFitRegressorWrapperDF, 109 | X: pd.DataFrame, 110 | y: Union[pd.Series, pd.DataFrame], 111 | **partial_fit_params: Optional[Any], 112 | ) -> T_PartialFitRegressorWrapperDF: 113 | return cast( 114 | T_PartialFitRegressorWrapperDF, 115 | self._native_estimator.partial_fit( 116 | self._prepare_X_for_delegate(X), 117 | self._prepare_y_for_delegate(y), 118 | **{ 119 | arg: value 120 | for arg, value in partial_fit_params.items() 121 | if value is not None 122 | }, 123 | ), 124 | ) 125 | 126 | 127 | class MultiOutputRegressorWrapperDF( 128 | MetaRegressorWrapperDF[MultiOutputRegressor], 129 | PartialFitRegressorWrapperDF[MultiOutputRegressor], 130 | ): 131 | """ 132 | Abstract base class of DF wrappers for multi-output regressors. 133 | """ 134 | 135 | pass 136 | 137 | 138 | class RegressorTransformerWrapperDF( 139 | RegressorWrapperDF[T_NativeRegressor], 140 | ColumnPreservingTransformerWrapperDF[T_NativeRegressor], 141 | Generic[T_NativeRegressor], 142 | metaclass=ABCMeta, 143 | ): 144 | """ 145 | DF wrapper for combined regressors and column preserving transformers. 146 | """ 147 | 148 | pass 149 | 150 | 151 | class IsotonicRegressionWrapperDF( 152 | RegressorTransformerWrapperDF[IsotonicRegression], 153 | SingleColumnTransformerWrapperDF[IsotonicRegression], 154 | NumpyTransformerWrapperDF[IsotonicRegression], 155 | metaclass=ABCMeta, 156 | ): 157 | """ 158 | DF wrapper for :class:`sklearn.isotonic.IsotonicRegression`. 159 | """ 160 | 161 | # noinspection PyPep8Naming 162 | def _adjust_X_type_for_delegate(self, X: pd.DataFrame) -> npt.NDArray[Any]: 163 | arr = super()._adjust_X_type_for_delegate(X) 164 | return arr.ravel() 165 | 166 | 167 | # 168 | # Validate __all__ 169 | # 170 | 171 | __tracker.validate() 172 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extended versions of all `scikit-learn` transformers with enhanced support for data 3 | frames. 4 | """ 5 | 6 | from .. import __sklearn_1_1__, __sklearn_version__ 7 | from ._transformation import * 8 | from ._transformation_v0_22 import * 9 | from ._transformation_v0_24 import * 10 | from ._transformation_v1_0 import * 11 | 12 | if __sklearn_version__ >= __sklearn_1_1__: 13 | from ._transformation_v1_1 import * 14 | 15 | from .. 
import __sklearn_1_3__ 16 | 17 | if __sklearn_version__ >= __sklearn_1_3__: 18 | from ._transformation_v1_3 import * 19 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v0_22.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 0.22 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.impute import KNNImputer 10 | 11 | from pytools.api import AllTracker 12 | 13 | from .wrapper import ImputerWrapperDF 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["KNNImputerDF"] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | 22 | # 23 | # Ensure all symbols introduced below are included in __all__ 24 | # 25 | 26 | __tracker = AllTracker(globals()) 27 | 28 | 29 | # 30 | # impute 31 | # 32 | 33 | 34 | class KNNImputerDF(ImputerWrapperDF[KNNImputer], native=KNNImputer): 35 | """Stub for DF wrapper of class ``KNNImputer``""" 36 | 37 | 38 | # 39 | # validate __all__ 40 | # 41 | 42 | __tracker.validate() 43 | 44 | 45 | # 46 | # validate that __all__ comprises all symbols ending in "DF", and no others 47 | # 48 | 49 | __estimators = [ 50 | sym 51 | for sym in dir() 52 | if sym.endswith("DF") 53 | and sym not in __imported_estimators 54 | and not sym.startswith("_") 55 | ] 56 | if set(__estimators) != set(__all__): 57 | raise RuntimeError( 58 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 59 | f"{__estimators}" 60 | ) 61 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v0_24.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 0.24 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.feature_selection import SequentialFeatureSelector 10 | from sklearn.kernel_approximation import PolynomialCountSketch 11 | 12 | from sklearndf.transformation.wrapper import ( 13 | FeatureSelectionWrapperDF, 14 | NComponentsDimensionalityReductionWrapperDF, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | __all__ = ["PolynomialCountSketchDF", "SequentialFeatureSelectorDF"] 20 | 21 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 22 | 23 | # 24 | # Transformers which have an n_components attribute 25 | # Implemented through NComponentsDimensionalityReductionWrapperDF 26 | # 27 | 28 | 29 | class PolynomialCountSketchDF( 30 | NComponentsDimensionalityReductionWrapperDF[PolynomialCountSketch], 31 | native=PolynomialCountSketch, 32 | ): 33 | """Stub for DF wrapper of class ``PolynomialCountSketch``""" 34 | 35 | 36 | # 37 | # feature_selection 38 | # 39 | # Transformers with a get_support method, implemented via FeatureSelectionWrapperDF 40 | # 41 | 42 | 43 | class SequentialFeatureSelectorDF( 44 | FeatureSelectionWrapperDF[SequentialFeatureSelector], 45 | native=SequentialFeatureSelector, 46 | ): 47 | """Stub for DF wrapper of class ``SequentialFeatureSelector``""" 48 | 49 | 50 | # 51 | # validate that __all__ comprises all symbols ending in "DF", and no others 52 | # 53 | 54 | __estimators = [ 55 | sym 56 | for sym in dir() 57 | if sym.endswith("DF") 58 | and sym not in __imported_estimators 59 | and not sym.startswith("_") 60 | ] 61 | if set(__estimators) != 
set(__all__): 62 | raise RuntimeError( 63 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 64 | f"{__estimators}" 65 | ) 66 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_0.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.0 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.preprocessing import SplineTransformer 10 | 11 | from .wrapper import PolynomialTransformerWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = ["SplineTransformerDF"] 16 | 17 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 18 | 19 | # 20 | # preprocessing 21 | # 22 | 23 | 24 | class SplineTransformerDF( 25 | PolynomialTransformerWrapperDF[SplineTransformer], native=SplineTransformer 26 | ): 27 | """Stub for DF wrapper of class ``SplineTransformer``""" 28 | 29 | 30 | # 31 | # validate that __all__ comprises all symbols ending in "DF", and no others 32 | # 33 | 34 | __estimators = [ 35 | sym 36 | for sym in dir() 37 | if sym.endswith("DF") 38 | and sym not in __imported_estimators 39 | and not sym.startswith("_") 40 | ] 41 | if set(__estimators) != set(__all__): 42 | raise RuntimeError( 43 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 44 | f"{__estimators}" 45 | ) 46 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.1 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.decomposition import MiniBatchNMF 10 | from sklearn.ensemble import RandomTreesEmbedding 11 | 12 | from .wrapper import ComponentsDimensionalityReductionWrapperDF, EmbeddingWrapperDF 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | __all__ = ["MiniBatchNMFDF", "RandomTreesEmbeddingDF"] 17 | 18 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 19 | 20 | # 21 | # decomposition and ensemble 22 | # 23 | 24 | 25 | class MiniBatchNMFDF( 26 | ComponentsDimensionalityReductionWrapperDF[MiniBatchNMF], 27 | native=MiniBatchNMF, 28 | ): 29 | """Stub for DF wrapper of class ``MiniBatchNMF``""" 30 | 31 | 32 | class RandomTreesEmbeddingDF( 33 | EmbeddingWrapperDF[RandomTreesEmbedding], 34 | native=RandomTreesEmbedding, 35 | ): 36 | """Stub for DF wrapper of class ``RandomTreesEmbedding``""" 37 | 38 | 39 | # 40 | # validate that __all__ comprises all symbols ending in "DF", and no others 41 | # 42 | 43 | __estimators = [ 44 | sym 45 | for sym in dir() 46 | if sym.endswith("DF") 47 | and sym not in __imported_estimators 48 | and not sym.startswith("_") 49 | ] 50 | if set(__estimators) != set(__all__): 51 | raise RuntimeError( 52 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 53 | f"{__estimators}" 54 | ) 55 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/_transformation_v1_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation` loaded 3 | from sklearn 1.3 onwards 4 | """ 5 | 6 | 7 | import logging 8 | 9 | from sklearn.preprocessing import 
TargetEncoder 10 | 11 | from .wrapper import ColumnPreservingTransformerWrapperDF 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | __all__ = [ 16 | "TargetEncoderDF", 17 | ] 18 | 19 | __imported_estimators = {name for name in globals().keys() if name.endswith("DF")} 20 | 21 | # 22 | # preprocessing 23 | # 24 | 25 | 26 | class TargetEncoderDF( 27 | ColumnPreservingTransformerWrapperDF[TargetEncoder], native=TargetEncoder 28 | ): 29 | """Stub for DF wrapper of class ``TargetEncoder``""" 30 | 31 | 32 | # 33 | # validate that __all__ comprises all symbols ending in "DF", and no others 34 | # 35 | 36 | __estimators = [ 37 | sym 38 | for sym in dir() 39 | if sym.endswith("DF") 40 | and sym not in __imported_estimators 41 | and not sym.startswith("_") 42 | ] 43 | if set(__estimators) != set(__all__): 44 | raise RuntimeError( 45 | "__all__ does not contain exactly all DF estimators; expected value is:\n" 46 | f"{__estimators}" 47 | ) 48 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional 3rd party transformers that implement the `scikit-learn` interface. 3 | 4 | Note that 3rd party packages implementing the associated native estimators must be 5 | installed explicitly: they are not included in `sklearndf`'s package requirements to 6 | achieve a lean package footprint for default installs of `sklearndf`. 7 | """ 8 | from ._extra import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation.extra` 3 | """ 4 | from __future__ import annotations 5 | 6 | import logging 7 | 8 | import numpy as np 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | 11 | from pytools.api import AllTracker 12 | 13 | from ...wrapper import MissingEstimator 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | __all__ = ["BoostAGrootaDF", "BorutaDF", "GrootCVDF", "LeshyDF"] 18 | 19 | try: 20 | # import boruta classes only if installed 21 | from boruta import BorutaPy 22 | 23 | # Apply a hack to address boruta's incompatibility with numpy >= 1.24: boruta 24 | # uses the aliases np.bool, np.int, and np.float, which were deprecated in 25 | # numpy 1.20 and removed in numpy 1.24. 26 | # 27 | # We check whether these aliases are still defined in numpy, and if not, we restore them as aliases of the corresponding types with a trailing underscore. 
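# (Illustrative aside, not part of the original module: under numpy >= 1.24 the
# loop below has the same effect as writing
#
#     np.bool = np.bool_
#     np.int = np.int_
#     np.float = np.float_
#
# so that boruta's references to the removed aliases resolve again.)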
28 | 29 | for __attr in ["bool", "int", "float"]: 30 | if not hasattr(np, __attr): 31 | setattr(np, __attr, getattr(np, f"{__attr}_")) 32 | del __attr 33 | 34 | except ImportError: 35 | 36 | class BorutaPy( # type: ignore 37 | MissingEstimator, 38 | TransformerMixin, # type: ignore 39 | ): 40 | """Mock-up for missing estimator.""" 41 | 42 | 43 | try: 44 | # import arfs classes only if installed 45 | from arfs.feature_selection.allrelevant import BoostAGroota, GrootCV, Leshy 46 | 47 | except ImportError: 48 | 49 | class BoostAGroota( # type: ignore 50 | MissingEstimator, 51 | TransformerMixin, # type: ignore 52 | ): 53 | """Mock-up for missing estimator.""" 54 | 55 | class GrootCV( # type: ignore 56 | MissingEstimator, 57 | TransformerMixin, # type: ignore 58 | ): 59 | """Mock-up for missing estimator.""" 60 | 61 | class Leshy( # type: ignore 62 | MissingEstimator, 63 | TransformerMixin, # type: ignore 64 | ): 65 | """Mock-up for missing estimator.""" 66 | 67 | 68 | # 69 | # Ensure all symbols introduced below are included in __all__ 70 | # 71 | 72 | __tracker = AllTracker(globals()) 73 | 74 | 75 | # 76 | # Class definitions 77 | # 78 | 79 | 80 | from .wrapper import ARFSWrapperDF as _ARFSWrapperDF 81 | from .wrapper import BorutaPyWrapperDF as _BorutaPyWrapperDF 82 | 83 | 84 | class BorutaDF(_BorutaPyWrapperDF, native=BorutaPy): 85 | """ 86 | DF version of :class:`~boruta.BorutaPy`. 87 | """ 88 | 89 | 90 | class LeshyDF(_ARFSWrapperDF[Leshy], native=Leshy): 91 | """ 92 | DF version of :class:`~arfs.feature_selection.allrelevant.Leshy`. 93 | """ 94 | 95 | 96 | class BoostAGrootaDF(_ARFSWrapperDF[BoostAGroota], native=BoostAGroota): 97 | """ 98 | DF version of :class:`~arfs.feature_selection.allrelevant.BoostAGroota`. 99 | """ 100 | 101 | @property 102 | def estimator(self) -> BaseEstimator: 103 | """ 104 | Alias for the native estimator's :attr:`.est` attribute, to conform with 105 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 106 | 107 | :return: the value of the native estimator's :attr:`.est` attribute 108 | """ 109 | return self.native_estimator.est 110 | 111 | @estimator.setter 112 | def estimator(self, est: BaseEstimator) -> None: 113 | """ 114 | Alias for the native estimator's :attr:`.est` attribute, to conform with 115 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 116 | 117 | :param est: the new value for the native estimator's :attr:`.est` attribute 118 | """ 119 | self.native_estimator.est = est 120 | 121 | @estimator.deleter 122 | def estimator(self) -> None: 123 | """ 124 | Alias for the native estimator's :attr:`.est` attribute, to conform with 125 | the :class:`~sklearn.base.MetaEstimatorMixin` interface. 126 | """ 127 | del self.native_estimator.est 128 | 129 | 130 | class GrootCVDF(_ARFSWrapperDF[GrootCV], native=GrootCV): 131 | """ 132 | DF version of :class:`~arfs.feature_selection.allrelevant.GrootCV`. 133 | """ 134 | 135 | 136 | # 137 | # validate __all__ 138 | # 139 | 140 | __tracker.validate() 141 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DF wrapper classes for additional 3rd party transformers that implement the 3 | `scikit-learn` interface. 
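These wrappers serve as base classes for the DF stubs in :mod:`sklearndf.transformation.extra`; for example, mirroring the declaration in that module:

.. code-block:: python

    from boruta import BorutaPy
    from sklearndf.transformation.extra.wrapper import BorutaPyWrapperDF

    class BorutaDF(BorutaPyWrapperDF, native=BorutaPy):
        \"""DF version of :class:`~boruta.BorutaPy`.\"""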
4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/extra/wrapper/_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Core implementation of :mod:`sklearndf.transformation.extra.wrapper` 3 | """ 4 | from __future__ import annotations 5 | 6 | import logging 7 | from typing import Generic, TypeVar 8 | 9 | import pandas as pd 10 | from sklearn.feature_selection import SelectorMixin 11 | 12 | from pytools.api import AllTracker 13 | 14 | from ....wrapper import MetaEstimatorWrapperDF 15 | from ...wrapper import ColumnSubsetTransformerWrapperDF, NumpyTransformerWrapperDF 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | __all__ = ["BorutaPyWrapperDF", "ARFSWrapperDF"] 20 | 21 | try: 22 | # import boruta classes only if installed 23 | from boruta import BorutaPy 24 | except ImportError: 25 | BorutaPy = None 26 | 27 | 28 | # 29 | # Type variables 30 | # 31 | 32 | T_FeatureSelector = TypeVar("T_FeatureSelector", bound=SelectorMixin) 33 | 34 | 35 | # 36 | # Ensure all symbols introduced below are included in __all__ 37 | # 38 | 39 | __tracker = AllTracker(globals()) 40 | 41 | 42 | # 43 | # Class definitions 44 | # 45 | 46 | 47 | class BorutaPyWrapperDF( 48 | MetaEstimatorWrapperDF[BorutaPy], 49 | NumpyTransformerWrapperDF[BorutaPy], 50 | ColumnSubsetTransformerWrapperDF[BorutaPy], 51 | ): 52 | """ 53 | DF wrapper for :class:`~boruta.BorutaPy`. 54 | """ 55 | 56 | def _get_features_out(self) -> pd.Index: 57 | return self.feature_names_in_[self.native_estimator.support_] 58 | 59 | def _get_sparse_threshold(self) -> float: 60 | # don't allow sparse input 61 | return 0.0 62 | 63 | 64 | class ARFSWrapperDF( 65 | MetaEstimatorWrapperDF[T_FeatureSelector], 66 | ColumnSubsetTransformerWrapperDF[T_FeatureSelector], 67 | Generic[T_FeatureSelector], 68 | ): 69 | """ 70 | DF wrapper for :class:`~boruta.BorutaPy`. 71 | """ 72 | 73 | def _get_features_out(self) -> pd.Index: 74 | return self.feature_names_in_[self.native_estimator.support_] 75 | 76 | def _get_sparse_threshold(self) -> float: 77 | # don't allow sparse input 78 | return 0.0 79 | 80 | 81 | __tracker.validate() 82 | -------------------------------------------------------------------------------- /src/sklearndf/transformation/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper classes for `scikit-learn` transformers, providing enhanced support for data 3 | frames. 4 | """ 5 | 6 | from ._wrapper import * 7 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper class API for enhancing the functionality of native estimators conforming with 3 | the `scikit-learn` API. 4 | 5 | In most cases, creating a DF classifier or regressor class is as simple as 6 | 7 | .. 
code-block:: python 8 | 9 | class RandomForestClassifierDF( 10 | ClassifierWrapperDF[RandomForestClassifier], native=RandomForestClassifier 11 | ): 12 | \"""Stub for DF wrapper of class ``RandomForestClassifier``\""" 13 | 14 | class RandomForestRegressorDF( 15 | RegressorWrapperDF[RandomForestRegressor], native=RandomForestRegressor 16 | ): 17 | \"""Stub for DF wrapper of class ``RandomForestRegressor``\""" 18 | 19 | 20 | Any class implementing the `scikit-learn` estimator protocol (and subclassing 21 | :class:`.BaseEstimator`) can be used to create a DF wrapper by declaring a wrapper 22 | class as follows: 23 | 24 | .. code-block:: 25 | 26 | class <name>DF(<wrapper class>, native=<native class>): 27 | \"""Stub for DF wrapper of class ``<native class>``\""" 28 | 29 | The resulting wrapper class implements a *delegation* pattern, forwarding method calls 30 | and attribute access to a native estimator instance while 31 | 32 | - implementing enhanced functionality introduced by the :class:`.EstimatorDF` class 33 | hierarchy, managing feature names and translating between data frames and *numpy* 34 | arrays behind the scenes 35 | - adopting all additional methods and attributes from the wrapped native estimator 36 | - delegating relevant method calls and attribute access to the native estimator, 37 | thus replicating the original estimator's behaviour except for the enhanced 38 | functionality introduced by the :class:`.EstimatorDF` class hierarchy. 39 | 40 | Most regressors, classifiers, and clusterers can be augmented using the 41 | :class:`.RegressorWrapperDF`, :class:`.ClassifierWrapperDF`, and 42 | :class:`.ClusterWrapperDF` wrappers, respectively. 43 | 44 | More care must be taken to wrap transformer classes and some clusterer classes, which 45 | may require a more dedicated wrapper class to support the specific behaviour of the 46 | native transformer or clusterer. 47 | See packages :mod:`sklearndf.transformation.wrapper` and 48 | :mod:`sklearndf.clustering.wrapper` for more details on these. 49 | 50 | For more advanced examples, including the use of custom wrapper classes, see the many 51 | examples in modules 52 | :mod:`sklearndf.transformation`, :mod:`sklearndf.classification`, 53 | :mod:`sklearndf.regression`, and :mod:`sklearndf.clustering`. 54 | """ 55 | 56 | from ._missing import * 57 | from ._wrapper import * 58 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/_missing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handling of mocked-up native estimators. 3 | """ 4 | import logging 5 | from typing import Any 6 | 7 | from sklearn.base import BaseEstimator 8 | 9 | from pytools.api import AllTracker 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | __all__ = [ 14 | "MissingEstimator", 15 | ] 16 | 17 | # 18 | # Ensure all symbols introduced below are included in __all__ 19 | # 20 | 21 | __tracker = AllTracker(globals()) 22 | 23 | 24 | # 25 | # Class declarations 26 | # 27 | 28 | 29 | class MissingEstimator( 30 | BaseEstimator, # type: ignore 31 | ): 32 | """ 33 | Base class of mocked-up native estimators, for use in case an optional 3rd party 34 | estimator is not installed but is required to create the associated DF estimator. 35 | 36 | Raises a :class:`.RuntimeError` upon instantiation. 
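For example (an illustrative sketch, assuming the optional `xgboost` package is *not* installed):

.. code-block:: python

    from sklearndf.regression.extra import XGBRegressorDF

    # the native XGBRegressor has been replaced by a MissingEstimator
    # mock-up, so instantiating the DF wrapper raises
    # RuntimeError: "Estimator XGBRegressor is not available. ..."
    XGBRegressorDF()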
37 | """ 38 | 39 | def __init__(self, *args: Any, **kwargs: Any) -> None: 40 | """ 41 | :param args: arbitrary positional arguments 42 | :param kwargs: arbitrary keyword arguments 43 | :raises RuntimeError: always raised upon instantiation 44 | """ 45 | raise RuntimeError( 46 | f"Estimator {type(self).__name__} is not available. " 47 | f"Please install the package that implements it." 48 | ) 49 | 50 | 51 | # 52 | # validate __all__ 53 | # 54 | 55 | __tracker.validate() 56 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapter classes that wrap DF estimators and accept numpy arrays for all DF estimator 3 | methods that would usually only accept pandas data frames or series. 4 | 5 | For use in meta-estimators that internally pass on numpy arrays to sub-estimators. 6 | """ 7 | 8 | from ._numpy import * 9 | -------------------------------------------------------------------------------- /src/sklearndf/wrapper/stacking/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | DF wrapper classes for stacking estimators. 3 | """ 4 | 5 | from ._stacking import * 6 | -------------------------------------------------------------------------------- /test/test/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Root of sklearndf unit tests. 3 | """ 4 | -------------------------------------------------------------------------------- /test/test/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | # noinspection PyPackageRequirements 7 | import pytest 8 | import sklearn 9 | from sklearn import datasets 10 | from sklearn.utils import Bunch 11 | 12 | from sklearndf import __sklearn_1_1__, __sklearn_version__ 13 | from sklearndf.transformation import OneHotEncoderDF 14 | 15 | logging.basicConfig(level=logging.DEBUG) 16 | log = logging.getLogger(__name__) 17 | 18 | UNSUPPORTED_SKLEARN_PACKAGES = [ 19 | sklearn.manifold, 20 | sklearn.neighbors, 21 | sklearn.feature_extraction.image, 22 | ] 23 | 24 | 25 | @pytest.fixture # type: ignore 26 | def diabetes_target() -> str: 27 | return "disease_progression_1yr" 28 | 29 | 30 | @pytest.fixture # type: ignore 31 | def iris_target_name() -> str: 32 | return "species" 33 | 34 | 35 | @pytest.fixture # type: ignore 36 | def n_jobs() -> int: 37 | return -3 38 | 39 | 40 | @pytest.fixture # type: ignore 41 | def diabetes_df(diabetes_target: str) -> pd.DataFrame: 42 | # load sklearn test-data and convert to pd 43 | diabetes: Bunch 44 | if __sklearn_version__ >= __sklearn_1_1__: 45 | diabetes = datasets.load_diabetes(scaled=False) 46 | else: 47 | # arg scaled does not exist in scikit-learn < 1.1 48 | diabetes = datasets.load_diabetes() 49 | 50 | return pd.DataFrame( 51 | data=np.c_[diabetes.data, diabetes.target], 52 | columns=[*map(str, diabetes.feature_names), diabetes_target], 53 | ).astype(dtype={"sex": "category"}) 54 | 55 | 56 | @pytest.fixture # type: ignore 57 | def diabetes_features(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.DataFrame: 58 | return diabetes_df.drop(labels=[diabetes_target], axis=1) 59 | 60 | 61 | @pytest.fixture # type: ignore 62 | def diabetes_target_sr(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.Series: 63 | return diabetes_df.loc[:, 
diabetes_target] 64 | 65 | 66 | @pytest.fixture # type: ignore 67 | def diabetes_target_df(diabetes_df: pd.DataFrame, diabetes_target: str) -> pd.DataFrame: 68 | target = diabetes_df.loc[:, [diabetes_target]] 69 | target.loc[:, f"{diabetes_target}_2"] = target.loc[:, diabetes_target] * 2 70 | return target 71 | 72 | 73 | @pytest.fixture # type: ignore 74 | def iris_dataset() -> Bunch: 75 | return datasets.load_iris() 76 | 77 | 78 | @pytest.fixture # type: ignore 79 | def iris_df(iris_dataset: Bunch, iris_target_name: str) -> pd.DataFrame: 80 | # convert sklearn iris data set to data frame 81 | return pd.DataFrame( 82 | data=np.c_[iris_dataset.data, iris_dataset.target], 83 | columns=[*map(str, iris_dataset.feature_names), iris_target_name], 84 | ) 85 | 86 | 87 | @pytest.fixture # type: ignore 88 | def iris_features(iris_df: pd.DataFrame, iris_target_name: str) -> pd.DataFrame: 89 | return iris_df.drop(labels=[iris_target_name], axis=1) 90 | 91 | 92 | @pytest.fixture # type: ignore 93 | def iris_target_sr( 94 | iris_dataset: Bunch, iris_df: pd.DataFrame, iris_target_name: str 95 | ) -> pd.Series: 96 | # replace numerical targets with actual class labels 97 | return iris_df.loc[:, iris_target_name].apply( 98 | lambda x: iris_dataset.target_names[int(x)] 99 | ) 100 | 101 | 102 | @pytest.fixture # type: ignore 103 | def iris_targets_df(iris_df: pd.DataFrame, iris_target_name: str) -> pd.DataFrame: 104 | return iris_df.loc[:, [iris_target_name, iris_target_name]] 105 | 106 | 107 | @pytest.fixture # type: ignore 108 | def iris_targets_binary_df(iris_target_sr: pd.Series) -> pd.DataFrame: 109 | return OneHotEncoderDF(sparse=False).fit_transform(X=iris_target_sr.to_frame()) 110 | 111 | 112 | @pytest.fixture # type:ignore 113 | def test_data_categorical() -> pd.DataFrame: 114 | return pd.DataFrame( 115 | data=[ 116 | ["yes", "red", "child"], 117 | ["yes", "blue", "father"], 118 | ["no", "green", "mother"], 119 | ], 120 | columns=["a", "b", "c"], 121 | ) 122 | -------------------------------------------------------------------------------- /test/test/paths.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | log = logging.getLogger(__name__) 5 | 6 | # directory paths 7 | DIR_DATA = "data" 8 | DIR_CONFIG = "config" 9 | 10 | # file paths 11 | TEST_CONFIG_YML = os.path.join(DIR_CONFIG, "test_config.yml") 12 | -------------------------------------------------------------------------------- /test/test/sklearndf/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from types import ModuleType 4 | from typing import Dict, Iterable, List, Optional, Set, Type, Union 5 | 6 | import pandas as pd 7 | import sklearn 8 | from sklearn.base import BaseEstimator 9 | from sklearn.compose import ColumnTransformer 10 | from sklearn.pipeline import FeatureUnion 11 | 12 | from sklearndf import EstimatorDF, LearnerDF, TransformerDF, __sklearn_version__ 13 | from sklearndf.pipeline.wrapper import FeatureUnionSparseFrames 14 | from sklearndf.transformation.wrapper import ColumnTransformerSparseFrames 15 | from sklearndf.wrapper import EstimatorWrapperDF 16 | 17 | OVERRIDDEN_SKLEARN_CLASSES = { 18 | ColumnTransformerSparseFrames: ColumnTransformer, 19 | FeatureUnionSparseFrames: FeatureUnion, 20 | } 21 | 22 | 23 | def find_all_classes( 24 | *modules: ModuleType, 25 | ) -> Set[Type[EstimatorWrapperDF[BaseEstimator]]]: 26 | """Finds all Class members in given module/modules.""" 27 | 
types: Set[Type[EstimatorWrapperDF[BaseEstimator]]] = set() 28 | 29 | def _add_classes_from_module(_m: ModuleType) -> None: 30 | member: Type[EstimatorWrapperDF[BaseEstimator]] 31 | for member in vars(_m).values(): 32 | if isinstance(member, type): 33 | types.add(member) 34 | 35 | for module in modules: 36 | _add_classes_from_module(module) 37 | 38 | return types 39 | 40 | 41 | def find_all_submodules(parent_module: ModuleType) -> Set[ModuleType]: 42 | """Finds all submodules for a parent module.""" 43 | parent_name = f"{parent_module.__name__}." 44 | return { 45 | module 46 | for module_name, module in sys.modules.items() 47 | if module_name.startswith(parent_name) 48 | } 49 | 50 | 51 | def sklearn_delegate_classes( 52 | module: ModuleType, 53 | ) -> Dict[Type[BaseEstimator], Type[EstimatorWrapperDF[BaseEstimator]]]: 54 | """ 55 | Create a dictionary mapping sklearn classes to their corresponding sklearndf 56 | classes. 57 | """ 58 | return { 59 | OVERRIDDEN_SKLEARN_CLASSES.get( 60 | df_class.__wrapped__, df_class.__wrapped__ 61 | ): df_class 62 | for df_class in find_all_classes(module) 63 | # we only consider non-abstract wrapper classes wrapping a specific native class 64 | if issubclass(df_class, EstimatorWrapperDF) and hasattr(df_class, "__wrapped__") 65 | } 66 | 67 | 68 | def iterate_classes( 69 | from_modules: Union[ModuleType, Iterable[ModuleType]], 70 | matching: str, 71 | excluding: Optional[Union[str, Iterable[str]]] = None, 72 | ) -> List[Type[EstimatorWrapperDF[BaseEstimator]]]: 73 | """Helper to return all classes with matching name from Python module(s)""" 74 | 75 | if not isinstance(from_modules, Iterable): 76 | from_modules = (from_modules,) 77 | 78 | if excluding is not None and not isinstance(excluding, str): 79 | excluding = "|".join(f"({exclude_pattern})" for exclude_pattern in excluding) 80 | 81 | return [ 82 | m 83 | for m in find_all_classes(*from_modules) 84 | if re.match(matching, m.__name__) 85 | and ((excluding is None) or not re.match(excluding, m.__name__)) 86 | ] 87 | 88 | 89 | def get_sklearndf_wrapper_class( 90 | to_wrap: Type[BaseEstimator], from_module: ModuleType 91 | ) -> Type[EstimatorWrapperDF[BaseEstimator]]: 92 | """Helper to return the wrapped counterpart for a sklearn class""" 93 | try: 94 | return sklearn_delegate_classes(from_module)[to_wrap] 95 | 96 | except KeyError as cause: 97 | raise ValueError( 98 | f"There is no class that wraps '{to_wrap}' in {from_module}" 99 | ) from cause 100 | 101 | 102 | def check_expected_not_fitted_error(estimator: EstimatorDF) -> None: 103 | """Check if transformers & learners raise NotFittedError""" 104 | 105 | test_x = pd.DataFrame(data=list(range(10))) 106 | 107 | def check_sklearndf_call( 108 | func_to_call: str, _estimator: Union[LearnerDF, TransformerDF] 109 | ) -> None: 110 | try: 111 | getattr(_estimator, func_to_call)(X=test_x) 112 | except sklearn.exceptions.NotFittedError: 113 | # This is the expected error that sklearn[df] should raise 114 | return 115 | except Exception as sklearndf_exception: 116 | # Re-run the predict/transform ahead of fitting, and compare errors 117 | # across sklearn and sklearndf: 118 | try: 119 | if func_to_call == "transform": 120 | x = test_x.values 121 | else: 122 | x = test_x.values.reshape(-1) 123 | 124 | getattr(_estimator.native_estimator, func_to_call)(x) 125 | except sklearn.exceptions.NotFittedError: 126 | raise AssertionError( 127 | "sklearndf did not raise the expected NotFittedError" 128 | f" for {_estimator.__class__.__name__}" 129 | ) 130 | except Exception as 
sklearn_exception: 131 | assert repr(sklearndf_exception) == repr(sklearn_exception), ( 132 | "sklearndf raised a different error than sklearn" 133 | f" for {_estimator.__class__.__name__}:\n" 134 | f"sklearndf: {repr(sklearndf_exception)} \n" 135 | f"sklearn: {repr(sklearn_exception)}" 136 | ) 137 | 138 | if isinstance(estimator, LearnerDF): 139 | check_sklearndf_call("predict", estimator) 140 | elif isinstance(estimator, TransformerDF): 141 | check_sklearndf_call("transform", estimator) 142 | else: 143 | raise TypeError(f"Estimator of unknown type: {estimator.__name__}") 144 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | from sklearndf import TransformerDF 4 | from sklearndf.transformation import ( 5 | ColumnTransformerDF, 6 | OneHotEncoderDF, 7 | SimpleImputerDF, 8 | ) 9 | 10 | STEP_IMPUTE = "impute" 11 | STEP_ONE_HOT_ENCODE = "one-hot-encode" 12 | 13 | 14 | def make_simple_transformer( 15 | impute_median_columns: Optional[Sequence[str]] = None, 16 | one_hot_encode_columns: Optional[Sequence[str]] = None, 17 | ) -> TransformerDF: 18 | column_transforms = [] 19 | 20 | if impute_median_columns is not None and len(impute_median_columns) > 0: 21 | column_transforms.append( 22 | (STEP_IMPUTE, SimpleImputerDF(strategy="median"), impute_median_columns) 23 | ) 24 | 25 | if one_hot_encode_columns is not None and len(one_hot_encode_columns) > 0: 26 | column_transforms.append( 27 | ( 28 | STEP_ONE_HOT_ENCODE, 29 | OneHotEncoderDF(sparse=False, handle_unknown="ignore"), 30 | one_hot_encode_columns, 31 | ) 32 | ) 33 | 34 | return ColumnTransformerDF(transformers=column_transforms) 35 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_classification_pipeline_df.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from sklearn.base import is_classifier 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | from sklearndf import ClassifierDF 11 | from sklearndf.classification import RandomForestClassifierDF 12 | from sklearndf.classification.extra import LGBMClassifierDF 13 | from sklearndf.pipeline import ClassifierPipelineDF 14 | from test.sklearndf.pipeline import make_simple_transformer 15 | 16 | 17 | @pytest.mark.parametrize( # type: ignore 18 | argnames="classifier_df_cls", 19 | argvalues=[RandomForestClassifierDF, LGBMClassifierDF], 20 | ) 21 | def test_classification_pipeline_df( 22 | iris_features: pd.DataFrame, 23 | iris_target_sr: pd.DataFrame, 24 | classifier_df_cls: Type[ClassifierDF], 25 | ) -> None: 26 | cls_p_df = ClassifierPipelineDF( 27 | classifier=classifier_df_cls(), 28 | preprocessing=make_simple_transformer( 29 | impute_median_columns=iris_features.select_dtypes( 30 | include=np.number 31 | ).columns, 32 | one_hot_encode_columns=iris_features.select_dtypes(include=object).columns, 33 | ), 34 | ) 35 | 36 | assert is_classifier(cls_p_df) 37 | 38 | cls_p_df.fit(X=iris_features, y=iris_target_sr) 39 | cls_p_df.predict(X=iris_features) 40 | 41 | # test type check within constructor: 42 | with pytest.raises(TypeError): 43 | # noinspection PyTypeChecker 44 | ClassifierPipelineDF( 45 | classifier=RandomForestClassifier(), 
preprocessing=OneHotEncoder() 46 | ) 47 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_clustering_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.cluster import KMeans 5 | from sklearn.preprocessing import OneHotEncoder 6 | 7 | from sklearndf.clustering import KMeansDF 8 | from sklearndf.pipeline import ClusterPipelineDF 9 | from test.sklearndf.pipeline import make_simple_transformer 10 | 11 | 12 | def test_clustering_pipeline_df( 13 | iris_features: pd.DataFrame, iris_target_sr: pd.DataFrame 14 | ) -> None: 15 | cls_p_df = ClusterPipelineDF( 16 | clusterer=KMeansDF(n_clusters=4), 17 | preprocessing=make_simple_transformer( 18 | impute_median_columns=iris_features.select_dtypes( 19 | include=np.number 20 | ).columns, 21 | one_hot_encode_columns=iris_features.select_dtypes(include=object).columns, 22 | ), 23 | ) 24 | 25 | cls_p_df.fit(X=iris_features, y=iris_target_sr) 26 | cls_p_df.predict(X=iris_features) 27 | 28 | # test-type check within constructor: 29 | with pytest.raises(TypeError): 30 | # noinspection PyTypeChecker 31 | ClusterPipelineDF(clusterer=KMeans(n_clusters=4), preprocessing=OneHotEncoder()) 32 | -------------------------------------------------------------------------------- /test/test/sklearndf/pipeline/test_regression_pipeline_df.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from lightgbm import LGBMRegressor 7 | from sklearn.base import is_regressor 8 | from sklearn.preprocessing import OneHotEncoder 9 | 10 | from sklearndf import RegressorDF 11 | from sklearndf.pipeline import RegressorPipelineDF 12 | from sklearndf.regression import RandomForestRegressorDF 13 | from sklearndf.regression.extra import LGBMRegressorDF 14 | from test.sklearndf.pipeline import make_simple_transformer 15 | 16 | 17 | @pytest.mark.parametrize( # type: ignore 18 | argnames="regressor_df_cls", argvalues=[RandomForestRegressorDF, LGBMRegressorDF] 19 | ) 20 | def test_regression_pipeline_df( 21 | diabetes_features: pd.DataFrame, 22 | diabetes_target_sr: pd.Series, 23 | regressor_df_cls: Type[RegressorDF], 24 | ) -> None: 25 | rpdf = RegressorPipelineDF( 26 | regressor=regressor_df_cls(), 27 | preprocessing=make_simple_transformer( 28 | impute_median_columns=diabetes_features.select_dtypes( 29 | include=np.number 30 | ).columns, 31 | one_hot_encode_columns=diabetes_features.select_dtypes( 32 | include=object 33 | ).columns, 34 | ), 35 | ) 36 | 37 | assert is_regressor(rpdf) 38 | 39 | rpdf.fit(X=diabetes_features, y=diabetes_target_sr) 40 | rpdf.predict(X=diabetes_features) 41 | 42 | # test type check within constructor 43 | with pytest.raises(TypeError): 44 | # noinspection PyTypeChecker 45 | RegressorPipelineDF(regressor=LGBMRegressor(), preprocessing=OneHotEncoder()) 46 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_base.py: -------------------------------------------------------------------------------- 1 | # inspired by: 2 | # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tests/test_base.py 3 | 4 | import re 5 | from typing import Any 6 | 7 | import numpy as np 8 | import pytest 9 | import scipy.sparse as sp 10 | import sklearn 11 | from numpy.testing import assert_array_equal 12 | 
from sklearn.base import BaseEstimator, is_classifier 13 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.utils import estimator_html_repr 17 | 18 | from pytools.expression import freeze, make_expression 19 | from pytools.expression.atomic import Id 20 | 21 | from sklearndf.classification import SVCDF, DecisionTreeClassifierDF 22 | from sklearndf.clustering.wrapper import KMeansBaseWrapperDF 23 | from sklearndf.pipeline import PipelineDF 24 | from sklearndf.regression import RandomForestRegressorDF 25 | from sklearndf.transformation import OneHotEncoderDF, SimpleImputerDF 26 | from sklearndf.transformation.wrapper import ImputerWrapperDF 27 | from sklearndf.wrapper import ( 28 | ClassifierWrapperDF, 29 | EstimatorWrapperDF, 30 | RegressorWrapperDF, 31 | ) 32 | 33 | 34 | class DummyEstimator( 35 | BaseEstimator, # type: ignore 36 | ): 37 | def __init__(self, l1: int = 0, empty: Any = None) -> None: 38 | self.l1 = l1 39 | self.empty = empty 40 | 41 | 42 | class DummyEstimator2( 43 | BaseEstimator, # type: ignore 44 | ): 45 | def __init__(self, a: Any = None, b: Any = None) -> None: 46 | self.a = a 47 | self.b = b 48 | 49 | 50 | class DummyEstimator3( 51 | BaseEstimator, # type: ignore 52 | ): 53 | def __init__(self, c: int = 0, d: Any = None) -> None: 54 | self.c = c 55 | self.d = d 56 | 57 | 58 | class DummyEstimatorDF(EstimatorWrapperDF[DummyEstimator], native=DummyEstimator): 59 | """A trivial estimator.""" 60 | 61 | 62 | class DummyEstimator2DF(EstimatorWrapperDF[DummyEstimator2], native=DummyEstimator2): 63 | """A trivial estimator.""" 64 | 65 | 66 | class DummyEstimator3DF(EstimatorWrapperDF[DummyEstimator3], native=DummyEstimator3): 67 | """A trivial estimator.""" 68 | 69 | 70 | def test_clone() -> None: 71 | # Tests that clone creates a correct deep copy. 72 | # We create an estimator, make a copy of its original state 73 | # (which, in this case, is the current state of the estimator), 74 | # and check that the obtained copy is a correct deep copy. 75 | 76 | encoder = OneHotEncoderDF(drop="first", sparse=False) 77 | new_encoder = encoder.clone() 78 | assert encoder is not new_encoder 79 | assert encoder.get_params() == new_encoder.get_params() 80 | 81 | encoder = OneHotEncoderDF(handle_unknown="ignore", sparse=False) 82 | new_encoder = sklearn.clone(encoder) 83 | 84 | assert encoder is not new_encoder 85 | 86 | 87 | def test_clone_2() -> None: 88 | # Tests that clone doesn't copy everything. 89 | # We first create an estimator, give it an own attribute, and 90 | # make a copy of its original state. Then we check that the copy doesn't 91 | # have the specific attribute we manually added to the initial estimator. 
92 | 93 | encoder = OneHotEncoderDF(drop="first", sparse=False) 94 | 95 | encoder.own_attribute = "test" 96 | new_encoder = encoder.clone() 97 | 98 | assert not hasattr(new_encoder, "own_attribute") 99 | 100 | 101 | def test_clone_empty_array() -> None: 102 | # Regression test for cloning estimators with empty arrays 103 | clf = DummyEstimatorDF(empty=np.array([])) 104 | clf2 = clf.clone() 105 | assert_array_equal(clf.empty, clf2.empty) 106 | 107 | clf = DummyEstimatorDF(empty=sp.csr_matrix(np.array([[0]]))) 108 | clf2 = clf.clone() 109 | assert_array_equal(clf.empty.data, clf2.empty.data) 110 | 111 | 112 | def test_clone_nan() -> None: 113 | # Regression test for cloning estimators with default parameter as np.nan 114 | clf = DummyEstimatorDF(empty=np.nan) 115 | clf2 = clf.clone() 116 | 117 | assert clf.empty is clf2.empty 118 | 119 | 120 | def test_clone_sparse_matrices() -> None: 121 | sparse_matrix_classes = [ 122 | getattr(sp, name) 123 | for name in dir(sp) 124 | if name.endswith("_matrix") and name != "_matrix" 125 | ] 126 | 127 | for cls in sparse_matrix_classes: 128 | sparse_matrix = cls(np.eye(5)) 129 | clf = DummyEstimatorDF(empty=sparse_matrix) 130 | clf_cloned = clf.clone() 131 | assert clf.empty.__class__ is clf_cloned.empty.__class__ 132 | assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray()) 133 | 134 | 135 | def test_clone_estimator_types() -> None: 136 | # Check that clone works for parameters that are types rather than 137 | # instances 138 | clf = DummyEstimatorDF(empty=DummyEstimator) 139 | clf2 = clf.clone() 140 | 141 | assert clf.empty is clf2.empty 142 | 143 | 144 | def test_repr() -> None: 145 | # Smoke test the repr of the base estimator. 146 | repr(DummyEstimatorDF()) 147 | 148 | estimator = DummyEstimator2DF( 149 | a=DummyEstimator3DF(c=None), b=DummyEstimator3DF(c=1, d=2) 150 | ) 151 | assert freeze(make_expression(estimator)) == freeze( 152 | Id.DummyEstimator2DF( 153 | a=Id.DummyEstimator3DF(c=None), b=Id.DummyEstimator3DF(c=1, d=2) 154 | ) 155 | ) 156 | assert repr(estimator) == ( 157 | "DummyEstimator2DF(a=DummyEstimator3DF(c=None), " 158 | "b=DummyEstimator3DF(c=1, d=2))" 159 | ) 160 | 161 | assert len(repr(DummyEstimator2DF(a=["long_params"] * 1000))) == 15021 162 | 163 | 164 | def test_str() -> None: 165 | # Smoke test the str of the base estimator 166 | my_estimator = DummyEstimatorDF() 167 | str(my_estimator) 168 | 169 | 170 | def test_html_repr() -> None: 171 | # store the original display config 172 | display_original = sklearn.get_config()["display"] 173 | 174 | # set the display config to use diagrams 175 | sklearn.set_config(display="diagram") 176 | 177 | try: 178 | pipeline_df = PipelineDF( 179 | [ 180 | ( 181 | "preprocess", 182 | PipelineDF( 183 | [ 184 | ("impute", SimpleImputerDF()), 185 | ] 186 | ), 187 | ), 188 | ("rf", RandomForestRegressorDF(n_estimators=120)), 189 | ] 190 | ) 191 | 192 | def _replace_ids(_html: str) -> str: 193 | # scikit-learn generates new ids on subsequent calls to estimator_html_repr, 194 | # so we replace them with a placeholder 195 | return re.sub( 196 | r'(?<=id-)\d+|(?:(?<=sk-)|(?<=id=")|(?<=for="))\w+(?:-\w+)*', "#", _html 197 | ) 198 | 199 | assert _replace_ids(pipeline_df._repr_html_()) == _replace_ids( 200 | estimator_html_repr(pipeline_df) 201 | ) 202 | 203 | finally: 204 | # reset the display config to its original value 205 | sklearn.set_config(display=display_original) 206 | pass 207 | 208 | 209 | def test_get_params() -> None: 210 | test = DummyEstimator2DF(DummyEstimator3DF(), 
DummyEstimator3DF()) 211 | 212 | assert "a__d" in test.get_params(deep=True) 213 | assert "a__d" not in test.get_params(deep=False) 214 | 215 | # noinspection PyTypeChecker 216 | test.set_params(a__d=2) 217 | assert test.a.d == 2 218 | with pytest.raises(ValueError): 219 | test.set_params(a__a=2) 220 | 221 | 222 | def test_is_classifier() -> None: 223 | svc = SVCDF() 224 | assert is_classifier(svc) 225 | assert is_classifier(GridSearchCV(svc, {"C": [0.1, 1]})) 226 | assert is_classifier(PipelineDF([("svc", svc)])) 227 | assert is_classifier(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) 228 | 229 | 230 | def test_set_params() -> None: 231 | # test nested estimator parameter setting 232 | clf = Pipeline([("svc", SVCDF())]) 233 | # non-existing parameter in svc 234 | with pytest.raises(ValueError): 235 | # noinspection PyTypeChecker 236 | clf.set_params(svc__stupid_param=True) 237 | # non-existing parameter of pipeline 238 | with pytest.raises(ValueError): 239 | # noinspection PyTypeChecker 240 | clf.set_params(svm__stupid_param=True) 241 | 242 | 243 | def test_set_params_updates_valid_params() -> None: 244 | # Check that set_params tries to set SVC().C, not 245 | # DecisionTreeClassifier().C 246 | gs = GridSearchCV(DecisionTreeClassifierDF(), {}) 247 | # noinspection PyTypeChecker 248 | gs.set_params(estimator=SVCDF(), estimator__C=42.0) 249 | assert gs.estimator.C == 42.0 250 | 251 | 252 | # noinspection PyUnusedLocal 253 | def test_native_class_validation() -> None: 254 | with pytest.raises( 255 | TypeError, 256 | match=( 257 | "native class RandomForestClassifier cannot be used with wrapper class " 258 | "MismatchedNativeClass1 because it does not implement RegressorMixin" 259 | ), 260 | ): 261 | 262 | class MismatchedNativeClass1( 263 | RegressorWrapperDF[RandomForestClassifier], native=RandomForestClassifier 264 | ): 265 | pass 266 | 267 | with pytest.raises( 268 | TypeError, 269 | match=( 270 | "native class RandomForestRegressor cannot be used with wrapper class " 271 | "MismatchedNativeClass2 because it does not implement ClassifierMixin" 272 | ), 273 | ): 274 | 275 | class MismatchedNativeClass2( 276 | ClassifierWrapperDF[RandomForestRegressor], native=RandomForestRegressor 277 | ): 278 | pass 279 | 280 | with pytest.raises( 281 | TypeError, 282 | match=( 283 | "native class RandomForestRegressor cannot be used with wrapper class " 284 | "MismatchedNativeClass3 because it does not implement ClusterMixin" 285 | ), 286 | ): 287 | 288 | class MismatchedNativeClass3( 289 | KMeansBaseWrapperDF[RandomForestRegressor], native=RandomForestRegressor 290 | ): 291 | pass 292 | 293 | with pytest.raises( 294 | TypeError, 295 | match=( 296 | "native class RandomForestRegressor cannot be used with wrapper class " 297 | "MismatchedNativeClass4 because it does not implement TransformerMixin" 298 | ), 299 | ): 300 | 301 | class MismatchedNativeClass4( 302 | ImputerWrapperDF[Any], native=RandomForestRegressor 303 | ): 304 | pass 305 | 306 | with pytest.raises( 307 | TypeError, 308 | match=( 309 | "native class RandomForestRegressor cannot be used with wrapper class " 310 | "MismatchedNativeClass5 because it does not implement Pipeline" 311 | ), 312 | ): 313 | 314 | class MismatchedNativeClass5(PipelineDF, native=RandomForestRegressor): 315 | pass 316 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_classification.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 
2 | from typing import Any, Dict, Type 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.base import is_classifier 8 | from sklearn.multioutput import ClassifierChain, MultiOutputClassifier 9 | 10 | import sklearndf.classification as classification 11 | from sklearndf import ClassifierDF, __sklearn_1_2__, __sklearn_version__ 12 | from test.sklearndf import check_expected_not_fitted_error, iterate_classes 13 | 14 | CLASSIFIERS_TO_TEST = iterate_classes( 15 | from_modules=classification, 16 | matching=r".*DF", 17 | excluding=[ClassifierDF.__name__, r".*WrapperDF", r"^_"], 18 | ) 19 | 20 | 21 | def test_classifier_count() -> None: 22 | n = len(CLASSIFIERS_TO_TEST) 23 | 24 | print(f"Testing {n} classifiers.") 25 | assert n == 41 26 | 27 | 28 | if __sklearn_version__ < __sklearn_1_2__: 29 | BASE_ESTIMATOR = "base_estimator" 30 | else: 31 | BASE_ESTIMATOR = "estimator" 32 | 33 | 34 | CLASSIFIER_INIT_PARAMETERS: Dict[str, Dict[str, Any]] = { 35 | "CalibratedClassifierCVDF": { 36 | BASE_ESTIMATOR: classification.RandomForestClassifierDF() 37 | }, 38 | "ClassifierChainDF": {"base_estimator": classification.RandomForestClassifierDF()}, 39 | "MultiOutputClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 40 | "MultiOutputClassifierDF_partial_fit": {"estimator": classification.PerceptronDF()}, 41 | "OneVsOneClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 42 | "OneVsRestClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 43 | "OutputCodeClassifierDF": {"estimator": classification.RandomForestClassifierDF()}, 44 | "VotingClassifierDF": { 45 | "estimators": [ 46 | ("rfc", classification.RandomForestClassifierDF()), 47 | ("svmc", classification.SVCDF(probability=True)), 48 | ], 49 | "voting": "soft", 50 | }, 51 | "StackingClassifierDF": { 52 | "estimators": ( 53 | ("Forest", classification.RandomForestClassifierDF(max_depth=5)), 54 | ("Logit", classification.LogisticRegressionCVDF()), 55 | ("AdaBoost", classification.AdaBoostClassifierDF()), 56 | ) 57 | }, 58 | } 59 | 60 | 61 | CLASSIFIERS_PARTIAL_FIT = [ 62 | classification.BernoulliNBDF, 63 | classification.MultinomialNBDF, 64 | classification.PerceptronDF, 65 | classification.SGDClassifierDF, 66 | classification.PassiveAggressiveClassifierDF, 67 | classification.GaussianNBDF, 68 | classification.ComplementNBDF, 69 | classification.MultiOutputClassifierDF, 70 | classification.CategoricalNBDF, 71 | ] 72 | 73 | 74 | @pytest.mark.parametrize( # type: ignore 75 | argnames="sklearndf_cls", argvalues=CLASSIFIERS_TO_TEST 76 | ) 77 | def test_wrapped_fit_predict( 78 | sklearndf_cls: Type[ClassifierDF], 79 | iris_features: pd.DataFrame, 80 | iris_target_sr: pd.Series, 81 | iris_targets_df: pd.DataFrame, 82 | iris_targets_binary_df: pd.DataFrame, 83 | ) -> None: 84 | """Test fit & predict & predict[_log]_proba of wrapped sklearn classifiers""" 85 | # noinspection PyArgumentList 86 | parameters: Dict[str, Any] = CLASSIFIER_INIT_PARAMETERS.get( 87 | sklearndf_cls.__name__, {} 88 | ) 89 | # noinspection PyArgumentList 90 | classifier: ClassifierDF = sklearndf_cls(**parameters) 91 | 92 | assert is_classifier(classifier) 93 | 94 | is_chain = isinstance(classifier.native_estimator, ClassifierChain) 95 | 96 | is_multi_output = isinstance(classifier.native_estimator, MultiOutputClassifier) 97 | check_expected_not_fitted_error(estimator=classifier) 98 | 99 | if is_chain: 100 | # for chain classifiers, classes must be numerical so the preceding 101 | # classification can 
act as input to the next classification 102 | classes = set(range(iris_targets_binary_df.shape[1])) 103 | classifier.fit(X=iris_features, y=iris_targets_binary_df) 104 | elif is_multi_output: 105 | classes = set( 106 | chain( 107 | *( 108 | list(iris_targets_df.iloc[:, col].unique()) 109 | for col in range(iris_targets_df.shape[1]) 110 | ) 111 | ) 112 | ) 113 | classifier.fit(X=iris_features, y=iris_targets_df) 114 | else: 115 | classes = set(iris_target_sr.unique()) 116 | classifier.fit(X=iris_features, y=iris_target_sr) 117 | 118 | predictions = classifier.predict(X=iris_features) 119 | 120 | # test predictions data-type, length and values 121 | assert isinstance( 122 | predictions, pd.DataFrame if is_multi_output or is_chain else pd.Series 123 | ) 124 | assert len(predictions) == len(iris_target_sr) 125 | assert np.all(predictions.isin(classes)) 126 | 127 | # test predict_proba & predict_log_proba: 128 | for method_name in ["predict_proba", "predict_log_proba"]: 129 | method = getattr(classifier, method_name) 130 | 131 | if hasattr(classifier.native_estimator, method_name): 132 | predictions = method(X=iris_features) 133 | 134 | if is_multi_output: 135 | assert isinstance(predictions, list) 136 | assert classifier.output_names_ == iris_targets_df.columns.tolist() 137 | assert classifier.n_outputs_ == len(predictions) 138 | else: 139 | if is_chain: 140 | assert ( 141 | classifier.output_names_ 142 | == iris_targets_binary_df.columns.tolist() 143 | ) 144 | assert classifier.n_outputs_ == predictions.shape[1] 145 | else: 146 | assert classifier.output_names_ == [iris_target_sr.name] 147 | assert classifier.n_outputs_ == 1 148 | 149 | predictions = [predictions] 150 | 151 | for prediction in predictions: 152 | # test type and shape of predictions 153 | assert isinstance(prediction, pd.DataFrame) 154 | assert len(prediction) == len(iris_target_sr) 155 | assert prediction.shape == (len(iris_target_sr), len(classes)) 156 | # check correct labels are set as columns 157 | assert set(prediction.columns) == classes 158 | else: 159 | with pytest.raises(NotImplementedError): 160 | method(X=iris_features) 161 | 162 | 163 | @pytest.mark.parametrize( # type: ignore 164 | argnames="sklearndf_cls", argvalues=CLASSIFIERS_PARTIAL_FIT 165 | ) 166 | def test_wrapped_partial_fit( 167 | sklearndf_cls: Type[ClassifierDF], 168 | iris_features: pd.DataFrame, 169 | iris_target_sr: pd.Series, 170 | iris_targets_df: pd.DataFrame, 171 | ) -> None: 172 | # noinspection PyArgumentList 173 | classifier: ClassifierDF = sklearndf_cls( 174 | **CLASSIFIER_INIT_PARAMETERS.get(f"{sklearndf_cls.__name__}_partial_fit", {}) 175 | ) 176 | 177 | is_multi_output = isinstance(classifier.native_estimator, MultiOutputClassifier) 178 | if is_multi_output: 179 | classes = iris_targets_df.apply(lambda col: col.unique()).transpose().values 180 | iris_target = iris_targets_df 181 | else: 182 | classes = iris_target_sr.unique() 183 | iris_target = iris_target_sr 184 | 185 | with pytest.raises( 186 | ValueError, 187 | match="classes must be passed on the first call to partial_fit.", 188 | ): 189 | classifier.partial_fit(iris_features, iris_target) 190 | 191 | classifier.partial_fit(iris_features, iris_target, classes) 192 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_clustering.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | import sklearndf.clustering 7 | from 
sklearndf import ClusterDF, __sklearn_1_1__, __sklearn_1_3__, __sklearn_version__ 8 | from sklearndf.clustering import FeatureAgglomerationDF 9 | from test.sklearndf import iterate_classes 10 | 11 | # noinspection PyTypeChecker 12 | CLUSTERERS_TO_TEST = iterate_classes( 13 | from_modules=sklearndf.clustering, 14 | matching=r".*DF", 15 | excluding=[ClusterDF.__name__, r".*WrapperDF", FeatureAgglomerationDF.__name__], 16 | ) 17 | # FeatureAgglomeration doesn't support `fit_predict` method 18 | CLUSTERERS_WITH_AGGLOMERATION = CLUSTERERS_TO_TEST + [FeatureAgglomerationDF] 19 | 20 | 21 | def test_clusterer_count() -> None: 22 | n = len(CLUSTERERS_TO_TEST) 23 | 24 | print(f"Testing {n} clusterers.") 25 | 26 | if __sklearn_version__ < __sklearn_1_1__: 27 | assert n == 9 28 | elif __sklearn_version__ < __sklearn_1_3__: 29 | assert n == 10 30 | else: 31 | assert n == 11 32 | 33 | 34 | @pytest.mark.parametrize( # type: ignore 35 | argnames="sklearn_clusterer_cls", argvalues=CLUSTERERS_TO_TEST 36 | ) 37 | def test_clusterer_fit_predict_call( 38 | iris_features: pd.DataFrame, sklearn_clusterer_cls: Type[ClusterDF] 39 | ) -> None: 40 | """Check if each sklearndf clusterer supports fit_predict method""" 41 | 42 | clusterer_instance = sklearn_clusterer_cls() 43 | 44 | assert not clusterer_instance.is_fitted 45 | result_prediction = clusterer_instance.fit_predict(iris_features) 46 | assert type(result_prediction) == pd.Series 47 | assert clusterer_instance.is_fitted 48 | 49 | 50 | @pytest.mark.parametrize( # type: ignore 51 | argnames="sklearn_clusterer_cls", argvalues=CLUSTERERS_WITH_AGGLOMERATION 52 | ) 53 | def test_clusterer_fit_call( 54 | iris_features: pd.DataFrame, sklearn_clusterer_cls: Type[ClusterDF] 55 | ) -> None: 56 | """Check if each sklearndf clusterer supports fit method""" 57 | 58 | clusterer_instance = sklearn_clusterer_cls() 59 | 60 | assert not clusterer_instance.is_fitted 61 | clusterer_instance.fit(iris_features) 62 | assert clusterer_instance.is_fitted 63 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_meta_estimators.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | import pytest 5 | from sklearn.base import is_classifier, is_regressor 6 | from sklearn.impute import SimpleImputer 7 | 8 | from sklearndf.classification import ( 9 | ClassifierChainDF, 10 | LogisticRegressionCVDF, 11 | LogisticRegressionDF, 12 | RandomForestClassifierDF, 13 | VotingClassifierDF, 14 | ) 15 | from sklearndf.pipeline import ClassifierPipelineDF, PipelineDF, RegressorPipelineDF 16 | from sklearndf.regression import ( 17 | ElasticNetDF, 18 | LinearRegressionDF, 19 | MultiOutputRegressorDF, 20 | RandomForestRegressorDF, 21 | RidgeCVDF, 22 | ) 23 | from sklearndf.transformation import ColumnTransformerDF, StandardScalerDF 24 | 25 | log = logging.getLogger(__name__) 26 | 27 | 28 | def test_meta_estimators() -> None: 29 | with pytest.warns( 30 | expected_warning=UserWarning, 31 | match=( 32 | "^the following attributes of VotingClassifierDF have been replaced with " 33 | "their native scikit-learn counterparts: estimators$" 34 | ), 35 | ): 36 | VotingClassifierDF(estimators=[("rf", RandomForestClassifierDF())]) 37 | 38 | with pytest.raises( 39 | TypeError, 40 | match=( 41 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 42 | "but got: ClassifierPipelineDF" 43 | ), 44 | ): 45 | VotingClassifierDF( 46 | estimators=[ 47 | ("rf", 
ClassifierPipelineDF(classifier=RandomForestClassifierDF())) 48 | ] 49 | ) 50 | 51 | with pytest.warns( 52 | expected_warning=UserWarning, 53 | match=( 54 | "^the following attributes of MultiOutputRegressorDF have been replaced " 55 | "with their native scikit-learn counterparts: estimator$" 56 | ), 57 | ): 58 | regressor = MultiOutputRegressorDF(estimator=RandomForestRegressorDF()) 59 | assert is_regressor(regressor) 60 | 61 | with pytest.raises( 62 | TypeError, 63 | match=( 64 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 65 | "but got: RegressorPipelineDF" 66 | ), 67 | ): 68 | MultiOutputRegressorDF( 69 | estimator=RegressorPipelineDF(regressor=RandomForestRegressorDF()) 70 | ) 71 | 72 | with pytest.warns( 73 | expected_warning=UserWarning, 74 | match=( 75 | "^the following attributes of ClassifierChainDF have been replaced " 76 | "with their native scikit-learn counterparts: base_estimator$" 77 | ), 78 | ): 79 | classifier = ClassifierChainDF(base_estimator=RandomForestClassifierDF()) 80 | assert is_classifier(classifier) 81 | 82 | with pytest.raises( 83 | TypeError, 84 | match=( 85 | "sklearndf meta-estimators only accept simple regressors and classifiers, " 86 | "but got: SimpleImputer" 87 | ), 88 | ): 89 | ClassifierChainDF(base_estimator=SimpleImputer()) 90 | 91 | 92 | def test_stacking_regressor( 93 | diabetes_features: pd.DataFrame, diabetes_target_sr: pd.Series 94 | ) -> None: 95 | from sklearndf.regression import StackingRegressorDF 96 | 97 | # basic building blocks 98 | model1 = LinearRegressionDF() 99 | model2 = ElasticNetDF() 100 | feature_names = list(diabetes_features.columns) 101 | preprocessing = ColumnTransformerDF( 102 | [ 103 | ("scaled", StandardScalerDF(), feature_names[1:]), 104 | ("keep", "passthrough", feature_names[:1]), 105 | ] 106 | ) 107 | print(preprocessing) 108 | 109 | # Pipeline with stack works 110 | pipeline = PipelineDF( 111 | [ 112 | ("preprocessing", preprocessing), 113 | ( 114 | "stack", 115 | StackingRegressorDF( 116 | [ 117 | ("model1", model1), 118 | ("model2", model2), 119 | ] 120 | ), 121 | ), 122 | ] 123 | ) 124 | 125 | assert is_regressor(pipeline) 126 | 127 | pipeline.fit(diabetes_features, diabetes_target_sr) 128 | print(pipeline.predict(diabetes_features)) 129 | 130 | # Stack of Pipelines doesn't 131 | stack_of_pipelines = StackingRegressorDF( 132 | estimators=[ 133 | ( 134 | "pipeline1", 135 | PipelineDF([("preprocessing", preprocessing), ("model1", model1)]), 136 | ), 137 | ( 138 | "pipeline2", 139 | PipelineDF([("preprocessing", preprocessing), ("model2", model2)]), 140 | ), 141 | ("ignore", "drop"), 142 | ], 143 | final_estimator=RidgeCVDF(), 144 | ) 145 | 146 | assert is_regressor(stack_of_pipelines) 147 | 148 | stack_of_pipelines.fit(diabetes_features, diabetes_target_sr) 149 | 150 | pred = stack_of_pipelines.predict(diabetes_features) 151 | assert isinstance(pred, pd.Series) 152 | 153 | assert not stack_of_pipelines.final_estimator.is_fitted 154 | final_estimator_fitted = stack_of_pipelines.final_estimator_ 155 | assert final_estimator_fitted.feature_names_in_.to_list() == [ 156 | "pipeline1", 157 | "pipeline2", 158 | ] 159 | 160 | 161 | def test_stacking_classifier( 162 | iris_features: pd.DataFrame, iris_target_sr: pd.Series 163 | ) -> None: 164 | from sklearndf.classification import StackingClassifierDF 165 | 166 | # basic building blocks 167 | model1 = LogisticRegressionCVDF() 168 | model2 = RandomForestClassifierDF(max_depth=5) 169 | feature_names = iris_features.columns.to_list() 170 | 
preprocessing = ColumnTransformerDF( 171 | [ 172 | ("scaled", StandardScalerDF(), feature_names[1:]), 173 | ("keep", "passthrough", feature_names[:1]), 174 | ] 175 | ) 176 | 177 | # Pipeline with stack works 178 | pipeline = PipelineDF( 179 | [ 180 | ("preprocessing", preprocessing), 181 | ( 182 | "stack", 183 | StackingClassifierDF( 184 | [ 185 | ("model1", model1), 186 | ("model2", model2), 187 | ] 188 | ), 189 | ), 190 | ] 191 | ) 192 | 193 | assert is_classifier(pipeline) 194 | 195 | pipeline.fit(iris_features, iris_target_sr) 196 | 197 | # Stack of Pipelines doesn't 198 | stack_of_pipelines = StackingClassifierDF( 199 | estimators=[ 200 | ( 201 | "pipeline1", 202 | PipelineDF([("preprocessing", preprocessing), ("model1", model1)]), 203 | ), 204 | ( 205 | "pipeline2", 206 | PipelineDF([("preprocessing", preprocessing), ("model2", model2)]), 207 | ), 208 | ("ignore", "drop"), 209 | ], 210 | final_estimator=LogisticRegressionDF(), 211 | passthrough=True, 212 | ) 213 | 214 | assert is_classifier(pipeline) 215 | 216 | stack_of_pipelines.fit(iris_features, iris_target_sr) 217 | 218 | pred = stack_of_pipelines.predict_proba(iris_features) 219 | assert pred.columns.to_list() == ["setosa", "versicolor", "virginica"] 220 | 221 | assert not stack_of_pipelines.final_estimator.is_fitted 222 | final_estimator_fitted = stack_of_pipelines.final_estimator_ 223 | assert final_estimator_fitted.feature_names_in_.to_list() == [ 224 | "pipeline1_setosa", 225 | "pipeline1_versicolor", 226 | "pipeline1_virginica", 227 | "pipeline2_setosa", 228 | "pipeline2_versicolor", 229 | "pipeline2_virginica", 230 | "sepal length (cm)", 231 | "sepal width (cm)", 232 | "petal length (cm)", 233 | "petal width (cm)", 234 | ] 235 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_missing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | from sklearndf.wrapper import MissingEstimator 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | 10 | def test_missing() -> None: 11 | class MyMissingEstimator(MissingEstimator): 12 | pass 13 | 14 | with pytest.raises( 15 | RuntimeError, 16 | match=( 17 | "Estimator MyMissingEstimator is not available. " 18 | "Please install the package that implements it." 
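# note (added for clarity, not in the source file): MissingEstimator is the placeholder that sklearndf.wrapper substitutes for estimators whose optional backing package (for example arfs or boruta) is not installed; every subclass raises this RuntimeError on instantiation, which is also why test_extra.py filters its selectors with issubclass(cls.__wrapped__, MissingEstimator)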
19 | ), 20 | ): 21 | MyMissingEstimator(1, "2", a=2) 22 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_regression.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Type 2 | 3 | import pandas as pd 4 | import pytest 5 | from sklearn.base import BaseEstimator, is_regressor 6 | from sklearn.multioutput import MultiOutputRegressor, RegressorChain 7 | 8 | import sklearndf.regression 9 | from sklearndf import RegressorDF, TransformerDF 10 | from sklearndf.regression import ( 11 | SVRDF, 12 | IsotonicRegressionDF, 13 | LinearRegressionDF, 14 | MLPRegressorDF, 15 | MultiOutputRegressorDF, 16 | PassiveAggressiveRegressorDF, 17 | RandomForestRegressorDF, 18 | SGDRegressorDF, 19 | ) 20 | from sklearndf.wrapper import EstimatorWrapperDF 21 | from test.sklearndf import check_expected_not_fitted_error, iterate_classes 22 | 23 | # noinspection PyTypeChecker 24 | # ignore false alert about module type 25 | REGRESSORS_TO_TEST: List[Type[EstimatorWrapperDF[BaseEstimator]]] = iterate_classes( 26 | from_modules=sklearndf.regression, 27 | matching=r".*DF", 28 | excluding=[RegressorDF.__name__, TransformerDF.__name__, r".*WrapperDF"], 29 | ) 30 | 31 | 32 | def test_regressor_count() -> None: 33 | n = len(REGRESSORS_TO_TEST) 34 | 35 | print(f"Testing {n} regressors.") 36 | assert n == 55 37 | 38 | 39 | DEFAULT_REGRESSOR_PARAMETERS: Dict[str, Dict[str, Any]] = { 40 | "MultiOutputRegressorDF": dict(estimator=RandomForestRegressorDF()), 41 | "MultiOutputRegressorDF_partial_fit": dict(estimator=SGDRegressorDF()), 42 | "RegressorChainDF": dict(base_estimator=RandomForestRegressorDF()), 43 | "VotingRegressorDF": dict( 44 | estimators=[("rfr", RandomForestRegressorDF()), ("svr", SVRDF())] 45 | ), 46 | "StackingRegressorDF": dict( 47 | estimators=( 48 | ("Forest", RandomForestRegressorDF()), 49 | ("SVR", SVRDF()), 50 | ("Linear", LinearRegressionDF()), 51 | ) 52 | ), 53 | # the rank of Y is 1, so n_components needs to be 1 54 | "CCADF": dict(n_components=1), 55 | # the rank of Y is 1, so n_components needs to be 1 56 | "PLSCanonicalDF": dict(n_components=1), 57 | # use a solver that is still supported with scipy 1.11 58 | "QuantileRegressorDF": dict(solver="highs"), 59 | } 60 | 61 | REGRESSORS_PARTIAL_FIT = [ 62 | SGDRegressorDF, 63 | PassiveAggressiveRegressorDF, 64 | MultiOutputRegressorDF, 65 | MLPRegressorDF, 66 | ] 67 | 68 | 69 | @pytest.mark.parametrize( # type: ignore 70 | argnames="sklearndf_cls", argvalues=REGRESSORS_TO_TEST 71 | ) 72 | def test_wrapped_fit_predict( 73 | sklearndf_cls: Type[RegressorDF], 74 | diabetes_features: pd.DataFrame, 75 | diabetes_target_sr: pd.Series, 76 | diabetes_target_df: pd.DataFrame, 77 | ) -> None: 78 | """Test fit & predict of wrapped sklearn regressors""" 79 | parameters: Dict[str, Any] = DEFAULT_REGRESSOR_PARAMETERS.get( 80 | sklearndf_cls.__name__, {} 81 | ) 82 | 83 | # noinspection PyArgumentList 84 | regressor: RegressorDF = sklearndf_cls(**parameters) 85 | 86 | assert is_regressor(regressor) 87 | 88 | check_expected_not_fitted_error(estimator=regressor) 89 | 90 | if ( 91 | type(regressor).__name__.startswith("Multi") 92 | or isinstance(regressor.native_estimator, MultiOutputRegressor) 93 | or isinstance(regressor.native_estimator, RegressorChain) 94 | ): 95 | regressor.fit(X=diabetes_features, y=diabetes_target_df) 96 | 97 | else: 98 | if isinstance(regressor, IsotonicRegressionDF): 99 | # fit will fail when we have more than one feature 
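# note (added for clarity, not in the source file): IsotonicRegression fits a one-dimensional monotonic function, so scikit-learn raises a ValueError whenever X has more than one feature column; the test below therefore keeps only the "bmi" column and fits again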
100 | with pytest.raises(ValueError): 101 | regressor.fit(X=diabetes_features, y=diabetes_target_sr) 102 | # eliminate all features except one then continue testing 103 | diabetes_features = diabetes_features.loc[:, "bmi"] 104 | 105 | regressor.fit(X=diabetes_features, y=diabetes_target_sr) 106 | 107 | predictions = regressor.predict(X=diabetes_features) 108 | 109 | # test predictions data-type, length and values 110 | assert isinstance(predictions, (pd.Series, pd.DataFrame)) 111 | assert len(predictions) == len(diabetes_target_sr) 112 | 113 | 114 | @pytest.mark.parametrize( # type: ignore 115 | argnames="sklearndf_cls", argvalues=REGRESSORS_PARTIAL_FIT 116 | ) 117 | def test_wrapped_partial_fit( 118 | sklearndf_cls: Type[RegressorDF], 119 | diabetes_features: pd.DataFrame, 120 | diabetes_target_sr: pd.Series, 121 | diabetes_target_df: pd.DataFrame, 122 | ) -> None: 123 | # noinspection PyArgumentList 124 | regressor = sklearndf_cls( 125 | **DEFAULT_REGRESSOR_PARAMETERS.get(f"{sklearndf_cls.__name__}_partial_fit", {}) 126 | ) 127 | 128 | is_multi_output = isinstance(regressor.native_estimator, MultiOutputRegressor) 129 | diabetes_target = diabetes_target_df if is_multi_output else diabetes_target_sr 130 | 131 | # noinspection PyUnresolvedReferences 132 | regressor.partial_fit(diabetes_features, diabetes_target) 133 | -------------------------------------------------------------------------------- /test/test/sklearndf/test_sklearn_coverage.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from types import ModuleType 3 | from typing import Dict, Iterable, List, Optional, Type, TypeVar, Union 4 | 5 | import pytest 6 | import sklearn 7 | from sklearn.base import ( 8 | BaseEstimator, 9 | ClassifierMixin, 10 | ClusterMixin, 11 | RegressorMixin, 12 | TransformerMixin, 13 | ) 14 | from sklearn.pipeline import Pipeline 15 | from sklearn.utils.metaestimators import _BaseComposition 16 | 17 | import sklearndf.classification 18 | import sklearndf.clustering 19 | import sklearndf.pipeline 20 | import sklearndf.regression 21 | import sklearndf.transformation 22 | from ..conftest import UNSUPPORTED_SKLEARN_PACKAGES 23 | from . 
import find_all_submodules, iterate_classes, sklearn_delegate_classes 24 | from sklearndf import EstimatorDF 25 | 26 | T = TypeVar("T") 27 | 28 | 29 | GENERAL_COVERAGE_EXCLUSIONS = { 30 | # exclude all private classes: 31 | r"^_", 32 | # exclude all Base classes: 33 | r"^Base[A-Z]", 34 | # exclude all Mixin classes: 35 | r".*Mixin$", 36 | } 37 | 38 | CLASSIFIER_COVERAGE_EXCLUSIONS = { 39 | *GENERAL_COVERAGE_EXCLUSIONS, 40 | # Base classes and Mixins not following the convention 41 | "ForestClassifier", 42 | "_IdentityClassifier", 43 | } 44 | 45 | 46 | REGRESSOR_COVERAGE_EXCLUSIONS = { 47 | *GENERAL_COVERAGE_EXCLUSIONS, 48 | # Base classes and mix-ins 49 | "ForestRegressor", 50 | "GeneralizedLinearRegressor", 51 | # Private classes 52 | "_SigmoidCalibration", 53 | "_PLS", 54 | } 55 | 56 | 57 | TRANSFORMER_COVERAGE_EXCLUSIONS = { 58 | *GENERAL_COVERAGE_EXCLUSIONS, 59 | # class "Imputer" was deprecated in 0.20 and removed in 0.22 60 | "Imputer", 61 | # class "AgglomerationTransform" is just a mix-in class and 62 | # isn't meant to be used directly 63 | "AgglomerationTransform", 64 | } 65 | 66 | 67 | PIPELINE_COVERAGE_EXCLUSIONS = GENERAL_COVERAGE_EXCLUSIONS 68 | 69 | 70 | CLUSTERER_COVERAGE_EXCLUSIONS = { 71 | *GENERAL_COVERAGE_EXCLUSIONS, 72 | } 73 | 74 | 75 | UNSUPPORTED_SKLEARN_CLASSES = { 76 | sklearn_class.__name__ 77 | for sklearn_class in iterate_classes( 78 | from_modules=itertools.chain.from_iterable( 79 | (p, *find_all_submodules(p)) for p in UNSUPPORTED_SKLEARN_PACKAGES 80 | ), 81 | matching=".*", 82 | ) 83 | } 84 | 85 | 86 | def _find_sklearn_classes_to_cover( 87 | from_modules: Union[ModuleType, Iterable[ModuleType]], 88 | subclass_of: Type[T], 89 | excluding: Optional[Union[str, Iterable[str]]] = None, 90 | ) -> List[Type[T]]: 91 | return [ 92 | cls 93 | for cls in iterate_classes( 94 | from_modules=from_modules, matching=".*", excluding=excluding 95 | ) 96 | if issubclass(cls, subclass_of) 97 | ] 98 | 99 | 100 | def sklearn_classifier_classes() -> List[type]: 101 | return _find_sklearn_classes_to_cover( 102 | from_modules=find_all_submodules(sklearn), 103 | subclass_of=ClassifierMixin, 104 | excluding=CLASSIFIER_COVERAGE_EXCLUSIONS, 105 | ) 106 | 107 | 108 | def sklearn_regressor_classes() -> List[type]: 109 | return _find_sklearn_classes_to_cover( 110 | from_modules=find_all_submodules(sklearn), 111 | subclass_of=RegressorMixin, 112 | excluding=REGRESSOR_COVERAGE_EXCLUSIONS, 113 | ) 114 | 115 | 116 | def sklearn_pipeline_classes() -> List[type]: 117 | pipeline_modules = find_all_submodules(sklearn.pipeline) 118 | pipeline_modules.add(sklearn.pipeline) 119 | 120 | return _find_sklearn_classes_to_cover( 121 | from_modules=pipeline_modules, 122 | subclass_of=_BaseComposition, 123 | excluding=PIPELINE_COVERAGE_EXCLUSIONS, 124 | ) 125 | 126 | 127 | def sklearn_transformer_classes() -> List[type]: 128 | """Return all classes that are 'just' transformers, not learners or pipelines.""" 129 | transformer_mixin_classes = [ 130 | cls 131 | for cls in iterate_classes( 132 | from_modules=find_all_submodules(sklearn), 133 | matching=".*", 134 | excluding=TRANSFORMER_COVERAGE_EXCLUSIONS, 135 | ) 136 | if issubclass(cls, TransformerMixin) 137 | ] 138 | 139 | transformer_classes = list( 140 | set(transformer_mixin_classes) 141 | .difference(sklearn_classifier_classes()) 142 | .difference(sklearn_regressor_classes()) 143 | .difference(sklearn_pipeline_classes()) 144 | .difference(sklearn_clusterer_classes()) 145 | ) 146 | 147 | return transformer_classes 148 | 149 | 150 | def 
sklearn_clusterer_classes() -> List[type]: 151 | return _find_sklearn_classes_to_cover( 152 | from_modules=find_all_submodules(sklearn), 153 | subclass_of=ClusterMixin, 154 | excluding=CLUSTERER_COVERAGE_EXCLUSIONS, 155 | ) 156 | 157 | 158 | def _check_unexpected_sklearn_class(cls: type) -> None: 159 | f_cls_name = f"{cls.__module__}.{cls.__name__}" 160 | if cls.__name__ in UNSUPPORTED_SKLEARN_CLASSES: 161 | pytest.skip(f"Class {f_cls_name} is not wrapped but marked as unsupported") 162 | else: 163 | raise ValueError(f"Class {f_cls_name} is not wrapped") 164 | 165 | 166 | @pytest.mark.parametrize( # type: ignore 167 | argnames="sklearn_classifier_cls", argvalues=sklearn_classifier_classes() 168 | ) 169 | def test_classifier_coverage(sklearn_classifier_cls: Type[ClassifierMixin]) -> None: 170 | """Check if each sklearn classifier has a wrapped sklearndf counterpart.""" 171 | sklearn_classes: Dict[ 172 | Type[BaseEstimator], Type[EstimatorDF] 173 | ] = sklearn_delegate_classes(sklearndf.classification) 174 | 175 | if sklearn_classifier_cls not in sklearn_classes: 176 | _check_unexpected_sklearn_class(sklearn_classifier_cls) 177 | 178 | 179 | @pytest.mark.parametrize( # type: ignore 180 | argnames="sklearn_regressor_cls", argvalues=sklearn_regressor_classes() 181 | ) 182 | def test_regressor_coverage(sklearn_regressor_cls: Type[RegressorMixin]) -> None: 183 | """Check if each sklearn regressor has a wrapped sklearndf counterpart.""" 184 | sklearn_classes: Dict[ 185 | Type[BaseEstimator], Type[EstimatorDF] 186 | ] = sklearn_delegate_classes(sklearndf.regression) 187 | 188 | if sklearn_regressor_cls not in sklearn_classes: 189 | _check_unexpected_sklearn_class(sklearn_regressor_cls) 190 | 191 | 192 | @pytest.mark.parametrize( # type: ignore 193 | argnames="sklearn_transformer_cls", argvalues=sklearn_transformer_classes() 194 | ) 195 | def test_transformer_coverage(sklearn_transformer_cls: Type[TransformerMixin]) -> None: 196 | """Check if each sklearn transformer has a wrapped sklearndf counterpart.""" 197 | 198 | sklearn_classes: Dict[ 199 | Type[BaseEstimator], Type[EstimatorDF] 200 | ] = sklearn_delegate_classes(sklearndf.transformation) 201 | 202 | if sklearn_transformer_cls not in sklearn_classes: 203 | _check_unexpected_sklearn_class(sklearn_transformer_cls) 204 | 205 | 206 | @pytest.mark.parametrize( # type: ignore 207 | argnames="sklearn_pipeline_cls", argvalues=sklearn_pipeline_classes() 208 | ) 209 | def test_pipeline_coverage(sklearn_pipeline_cls: Type[Pipeline]) -> None: 210 | """Check if each sklearn pipeline estimator has 211 | a wrapped sklearndf counterpart.""" 212 | 213 | # noinspection PyTypeChecker 214 | sklearn_classes = sklearn_delegate_classes(sklearndf.pipeline) 215 | 216 | if sklearn_pipeline_cls not in sklearn_classes: 217 | _check_unexpected_sklearn_class(sklearn_pipeline_cls) 218 | 219 | 220 | @pytest.mark.parametrize( # type: ignore 221 | argnames="sklearn_clusterer_cls", argvalues=sklearn_clusterer_classes() 222 | ) 223 | def test_clusterer_coverage(sklearn_clusterer_cls: Type[ClusterMixin]) -> None: 224 | """Check if each sklearn clusterer has a wrapped sklearndf counterpart.""" 225 | sklearn_classes: Dict[ 226 | Type[BaseEstimator], Type[EstimatorDF] 227 | ] = sklearn_delegate_classes(sklearndf.clustering) 228 | 229 | if sklearn_clusterer_cls not in sklearn_classes: 230 | _check_unexpected_sklearn_class(sklearn_clusterer_cls) 231 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BCG-X-Official/sklearndf/c3fe1925e762fd19edf93182d98aa794e9d549e8/test/test/sklearndf/transformation/__init__.py -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_extra.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Optional, Type 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from lightgbm import LGBMRegressor 7 | from packaging.version import Version 8 | 9 | from sklearndf import TransformerDF 10 | from sklearndf.pipeline import PipelineDF 11 | from sklearndf.regression import RandomForestRegressorDF 12 | from sklearndf.regression.extra import LGBMRegressorDF 13 | from sklearndf.transformation import SimpleImputerDF 14 | from sklearndf.transformation.extra import BoostAGrootaDF, BorutaDF, GrootCVDF, LeshyDF 15 | from sklearndf.wrapper import MissingEstimator 16 | 17 | # get the version of the arfs package 18 | __arfs_version__: Optional[Version] 19 | try: 20 | import arfs 21 | 22 | # the package is installed: record its version 23 | __arfs_version__ = Version(arfs.__version__) 24 | except ImportError: 25 | __arfs_version__ = None 26 | 27 | __arfs_1_1__ = Version("1.1") 28 | 29 | # set up regressors for use in the feature selection tests 30 | 31 | regressor_params = dict(max_depth=5, n_jobs=-3, random_state=42, n_estimators=100) 32 | lgbm_regressor = LGBMRegressor(**regressor_params) 33 | lgbm_regressor_df = LGBMRegressorDF(**regressor_params) 34 | 35 | parametrize_feature_selector_cls: Callable[ 36 | [Callable[..., None]], Callable[..., None] 37 | ] = pytest.mark.parametrize( 38 | # the class/parameter combinations to test for feature selection 39 | argnames=["feature_selector_cls", "feature_selector_params"], 40 | argvalues=[ 41 | (cls, params) 42 | for cls, params in [ 43 | # Boruta selector 44 | ( 45 | BorutaDF, 46 | dict( 47 | estimator=RandomForestRegressorDF( 48 | max_depth=5, n_jobs=-3, random_state=42, n_estimators=100 49 | ) 50 | ), 51 | ), 52 | # Various ARFS selectors 53 | (LeshyDF, dict(estimator=lgbm_regressor, random_state=42, perc=90)), 54 | (LeshyDF, dict(estimator=lgbm_regressor_df, random_state=42, perc=90)), 55 | ( 56 | BoostAGrootaDF, 57 | dict(est=lgbm_regressor, cutoff=1.1) 58 | if __arfs_version__ is None or __arfs_version__ < __arfs_1_1__ 59 | else dict(estimator=lgbm_regressor, cutoff=1.1), 60 | ), 61 | (GrootCVDF, dict()), 62 | ] 63 | if not issubclass(cls.__wrapped__, MissingEstimator) 64 | ], 65 | ) 66 | 67 | 68 | # 69 | # Test the feature selection classes 70 | # 71 | 72 | 73 | @parametrize_feature_selector_cls 74 | def test_feature_selection_df( 75 | feature_selector_cls: Type[TransformerDF], feature_selector_params: Dict[str, Any] 76 | ) -> None: 77 | """ 78 | Test feature selection using the Boruta or ARFS package on a simple synthetic 79 | dataset. 80 | 81 | :param feature_selector_cls: The feature selector class to test. 82 | :param feature_selector_params: The parameters to use for the feature selector.
83 | """ 84 | 85 | df = pd.DataFrame(data=np.random.randn(100, 5), columns=list("abcde")) 86 | x = df.iloc[:, :-1] 87 | y = df.iloc[:, -1] 88 | 89 | feature_selector = feature_selector_cls(**feature_selector_params) 90 | feature_selector.fit(x, y) 91 | assert set(feature_selector.feature_names_out_) <= {"a", "b", "c", "d", "e"} 92 | 93 | 94 | @parametrize_feature_selector_cls 95 | def test_feature_selection_pipeline_df( 96 | feature_selector_cls: Type[TransformerDF], 97 | feature_selector_params: Dict[str, Any], 98 | diabetes_df: pd.DataFrame, 99 | diabetes_target: str, 100 | ) -> None: 101 | """ 102 | Test feature selection using the Boruta or ARFS package on the diabetes 103 | dataset. 104 | 105 | :param feature_selector_cls: The feature selector class to test. 106 | :param feature_selector_params: The parameters to use for the feature selector. 107 | :param diabetes_df: The diabetes dataset. 108 | :param diabetes_target: The diabetes target column. 109 | """ 110 | 111 | feature_selector = feature_selector_cls(**feature_selector_params) 112 | 113 | diabetes_df = diabetes_df.sample(frac=0.5, random_state=42) 114 | 115 | feature_selection_pipeline = PipelineDF( 116 | steps=[ 117 | ( 118 | "preprocess", 119 | PipelineDF( 120 | steps=[ 121 | ("imputer", SimpleImputerDF()), 122 | ] 123 | ), 124 | ), 125 | ("selector", feature_selector), 126 | ] 127 | ) 128 | 129 | x = diabetes_df.drop(columns=diabetes_target) 130 | y = diabetes_df.loc[:, diabetes_target] 131 | 132 | feature_selection_pipeline.fit(x, y) 133 | 134 | selected_features = set(feature_selection_pipeline.feature_names_out_) 135 | try: 136 | assert selected_features == set(feature_selector.selected_features_) 137 | except AttributeError: 138 | pass 139 | 140 | assert {"bmi", "bp", "s5"}.issubset( 141 | selected_features 142 | ), "key features have been selected" 143 | 144 | assert len(selected_features) <= 5, "no more than 5 features were selected" 145 | 146 | assert (selected_features - {"bmi", "bp", "s5"}).issubset( 147 | {"sex", "s1", "s2", "s3", "s6"} 148 | ), "additional selected features were not completely irrelevant" 149 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_imputers.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from typing import Type 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | import sklearndf.transformation 10 | from sklearndf import TransformerDF 11 | from test.sklearndf import iterate_classes 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.DEBUG) 15 | 16 | IMPUTERS_TO_TEST = iterate_classes( 17 | from_modules=sklearndf.transformation, matching=r".*Imputer.*DF", excluding=[] 18 | ) 19 | 20 | 21 | @pytest.mark.parametrize( # type: ignore 22 | argnames=["imputer_cls", "add_indicator"], 23 | argvalues=itertools.product(IMPUTERS_TO_TEST, [True, False]), 24 | ) 25 | def test_imputer( 26 | imputer_cls: Type[TransformerDF], 27 | add_indicator: bool, 28 | ) -> None: 29 | """ 30 | Test imputer classes using the combinations of arguments from 31 | ``@pytest.mark.parametrize`` 32 | 33 | :param imputer_cls: the imputer class to test 34 | :param add_indicator: whether to add an indicator column 35 | :return: 36 | """ 37 | imputer_df = imputer_cls(add_indicator=add_indicator) 38 | imputer_cls_orig = type(imputer_df.native_estimator) 39 | 40 | test_data_x = pd.DataFrame( 41 | data=[[7, 2, 3], [4, np.nan, 6],
[10, 5, 9]], columns=["a", "b", "c"] 42 | ) 43 | test_data_x_with_all_nan = pd.DataFrame( 44 | data=[[7, np.nan, 3], [4, np.nan, 6], [np.nan, np.nan, np.nan]], 45 | columns=["a", "b", "c"], 46 | ) 47 | test_data_y = pd.DataFrame( 48 | data=[[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]], columns=["a", "b", "c"] 49 | ) 50 | 51 | # noinspection PyArgumentList 52 | imputer_orig = imputer_cls_orig(add_indicator=add_indicator) 53 | # noinspection PyUnresolvedReferences 54 | imputer_orig.fit(test_data_x.values) 55 | # noinspection PyUnresolvedReferences 56 | y_transformed = imputer_orig.transform(test_data_y) 57 | 58 | imputer_df.fit(test_data_x) 59 | y_transformed_df = imputer_df.transform(test_data_y) 60 | 61 | assert np.array_equal( 62 | np.round(y_transformed, 4), np.round(y_transformed_df.values, 4) 63 | ), ( 64 | f"Different imputation results! " 65 | f"sklearn:{y_transformed} " 66 | f"sklearndf: {y_transformed_df.values}" 67 | ) 68 | 69 | # test correct imputation (and returned column labels) 70 | # for the case when a full input series is NaN 71 | # noinspection PyUnresolvedReferences 72 | imputer_orig.fit(test_data_x_with_all_nan.values) 73 | # noinspection PyUnresolvedReferences 74 | y_transformed = imputer_orig.transform(test_data_y) 75 | 76 | imputer_df.fit(test_data_x_with_all_nan) 77 | y_transformed_df = imputer_df.transform(test_data_y) 78 | 79 | assert np.array_equal( 80 | np.round(y_transformed, 4), np.round(y_transformed_df.values, 4) 81 | ), ( 82 | f"Different imputation results! " 83 | f"sklearn:{y_transformed} " 84 | f"sklearndf: {y_transformed_df.values}" 85 | ) 86 | -------------------------------------------------------------------------------- /test/test/sklearndf/transformation/test_sparse.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas.testing import assert_index_equal, assert_series_equal 4 | 5 | from sklearndf._util import sparse_frame_density 6 | from sklearndf.pipeline import FeatureUnionDF, PipelineDF 7 | from sklearndf.transformation import CountVectorizerDF, TfidfTransformerDF 8 | 9 | 10 | def test_tfidf() -> None: 11 | # expected results 12 | 13 | word_feature_names = ( 14 | ["and", "document", "first", "here", "is", "it"] 15 | + ["last", "one", "or", "second", "the", "third", "this"] 16 | # single-word features 17 | ) 18 | bigram_feature_names = ( 19 | ["and the", "first document", "here is", "is it", "is the", "is this", "it the"] 20 | + ["last document", "or is", "second document", "the first", "the last"] 21 | + ["the second", "the third", "third one", "this the"] 22 | ) 23 | 24 | # create a simple toy corpus, inspired by scikit-learn's documentation 25 | 26 | corpus = pd.Series( 27 | [ 28 | "Here is the first document.", 29 | "Here is the second document.", 30 | "And the third one.", 31 | "Is this the first document?", 32 | "The last document?", 33 | "Or is it the second document?", 34 | ] 35 | ) 36 | corpus_named = corpus.rename("document") 37 | 38 | # count the words for every document in the corpus 39 | 40 | word_counter = CountVectorizerDF() 41 | 42 | with pytest.raises( 43 | ValueError, match="the name of the series passed as arg X must not be None$" 44 | ): 45 | word_counter.fit_transform(corpus) 46 | 47 | word_counts_sparse_df = word_counter.fit_transform(corpus_named) 48 | 49 | assert word_counter.feature_names_out_.to_list() == word_feature_names 50 | assert all( 51 | isinstance(dtype, pd.SparseDtype) for dtype in word_counts_sparse_df.dtypes 52 | ) 53 
| 54 | # compute the tf-idf values for every word in every document 55 | 56 | tfidf = TfidfTransformerDF() 57 | x_tfidf = tfidf.fit_transform(word_counts_sparse_df) 58 | 59 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in x_tfidf.dtypes) 60 | assert_index_equal(tfidf.feature_names_out_, word_counts_sparse_df.columns) 61 | assert_index_equal(tfidf.feature_names_out_, x_tfidf.columns) 62 | assert sparse_frame_density(x_tfidf) == pytest.approx(0.3589744) 63 | 64 | # count the bigrams for every document in the corpus 65 | 66 | bigram_counter = CountVectorizerDF(analyzer="word", ngram_range=(2, 2)) 67 | x2 = bigram_counter.fit_transform(corpus_named) 68 | assert bigram_counter.feature_names_out_.to_list() == bigram_feature_names 69 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in x2.dtypes) 70 | 71 | # create a pipeline that combines the word and bigram counter 72 | # and computes the tf-idf values for every word and bigram 73 | 74 | vectorize = FeatureUnionDF( 75 | [ 76 | ("words", word_counter), 77 | ("bigrams", bigram_counter), 78 | ] 79 | ) 80 | pipeline = PipelineDF( 81 | [ 82 | ("vectorize", vectorize), 83 | ("tfidf", tfidf), 84 | ] 85 | ) 86 | 87 | tfidf = pipeline.fit_transform(corpus_named) 88 | assert all(isinstance(dtype, pd.SparseDtype) for dtype in tfidf.dtypes) 89 | assert_series_equal( 90 | pipeline.feature_names_original_, 91 | pd.Series( 92 | index=pd.Index( 93 | [f"words__{name}" for name in word_feature_names] 94 | + [f"bigrams__{name}" for name in bigram_feature_names], 95 | name="feature", 96 | ), 97 | data="document", # all features share the same input column, "document" 98 | name="feature_original", 99 | ), 100 | ) 101 | -------------------------------------------------------------------------------- /test/test/test_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test docstrings. 3 | """ 4 | 5 | from pytools.api import DocValidator 6 | 7 | 8 | def test_doc() -> None: 9 | assert DocValidator( 10 | root_dir="src", 11 | exclude_from_parameter_validation=( 12 | r"sklearndf\.(?:" 13 | + "|".join( 14 | f"(?:{pattern})" 15 | for pattern in ( 16 | # generated classes, except in the '.extra' subpackages 17 | r"(?:classification|clustering|regression|transformation)" 18 | r"\.(?!extra\.).*", 19 | # LGBM estimators in the '.extra' packages 20 | r"(?:classification|regression)\.extra\.LGBM.*", 21 | # XGBoost estimators in the '.extra' packages 22 | r"(?:classification|regression)\.extra\.XGB.*", 23 | # BorutaPy package 24 | r"transformation\.extra\.BorutaDF", 25 | # ARFS package 26 | r"transformation\.extra\.BoostAGrootaDF", 27 | r"transformation\.extra\.GrootCVDF", 28 | r"transformation\.extra\.LeshyDF", 29 | # scikit-learn pipeline classes 30 | r"pipeline\.(PipelineDF|FeatureUnionDF).*", 31 | # sparse frames version of FeatureUnion 32 | r"pipeline\.wrapper\.FeatureUnion\.", 33 | ) 34 | ) 35 | + ")" 36 | ), 37 | ).validate_doc(), "docstrings and type hints are valid" 38 | -------------------------------------------------------------------------------- /tmp/README.md: -------------------------------------------------------------------------------- 1 | This folder is for temporary files. It is not managed by git. 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py3, 3 | py37, 4 | py38, 5 | py39 6 | skip_missing_interpreters = true 7 | isolated_build = true 8 | minversion = 3.7 9 | distshare= {toxinidir}/dist/tox 10 | 11 | [testenv] 12 | changedir = . 13 | passenv = * 14 | setenv = 15 | PYTHONPATH = {toxinidir}{:}{toxinidir}/test 16 | PIP_DISABLE_PIP_VERSION_CHECK = 1 17 | PIP_EXTRA_INDEX_URL={env:FACET_PATH_URI}/pytools/dist/tox/simple 18 | 19 | # We change the install command to build packages from source that depend on numpy's 20 | # binary API. 21 | # This is necessary to prevent the notorious "RuntimeError: module compiled against API 22 | # version 0x… but this version of numpy is 0x…" error. 23 | install_command = 24 | python -m pip install {opts} {packages} --no-binary '{env:FACET_NO_BINARY}' 25 | 26 | extras = 27 | testing 28 | 29 | commands = 30 | # print all installed packages to stdout 31 | python -m pip freeze 32 | # run the tests 33 | pytest test/ -s 34 | 35 | [testenv:{py3,py37,py38,py39}-custom-deps] 36 | deps = 37 | # install custom dependencies 38 | gamma-pytools{env:FACET_V_GAMMA_PYTOOLS} 39 | joblib{env:FACET_V_JOBLIB} 40 | matplotlib{env:FACET_V_MATPLOTLIB} 41 | numpy{env:FACET_V_NUMPY} 42 | pandas{env:FACET_V_PANDAS} 43 | scikit-learn{env:FACET_V_SCIKIT_LEARN} 44 | scipy{env:FACET_V_SCIPY} 45 | typing_inspect{env:FACET_V_TYPING_INSPECT} 46 | # optional dependencies, for testing only 47 | arfs{env:FACET_V_ARFS} 48 | boruta{env:FACET_V_BORUTA} 49 | lightgbm{env:FACET_V_LIGHTGBM} 50 | xgboost{env:FACET_V_XGBOOST} 51 | 52 | [flake8] 53 | 54 | max-line-length = 88 55 | 56 | show-source = true 57 | 58 | ignore = 59 | W504, # line break after binary operator 60 | E402, # module level import not at top of file 61 | E731, # do not assign a lambda expression, use a def 62 | E741, # ignore not easy to read variables like i l I etc 63 | C408, # Unnecessary (dict/list/tuple) call - rewrite as a literal 64 | S001, # found modulo formatter (incorrect picks up mod operations) 65 | 66 | # Ignores below are added to prevent conflicts with Black formatter 67 | E231, # Missing whitespace after ',', ';', or ':' 68 | E203, # space before : 69 | W503, # line break before binary operator 70 | 71 | per-file-ignores = 72 | __init__.py: F401, F403, F405 73 | 74 | exclude = 75 | .eggs/*.py, 76 | venv/*, 77 | .venv/*, 78 | .git/* 79 | 80 | [coverage:report] 81 | ignore_errors = False 82 | show_missing = True 83 | 84 | [isort] 85 | profile=black 86 | src_paths=src,test 87 | known_local_folder=sklearndf,test 88 | known_first_party=pytools 89 | known_third_party=numpy,pandas,joblib,sklearn,matplotlib 90 | 91 | [pytest] 92 | addopts = 93 | --cov-report=html:coverage_html 94 | --cov-report=xml:coverage.xml 95 | --cov-config=setup.cfg 96 | --cov-report=term-missing:skip-covered 97 | --no-cov-on-fail 98 | testpaths= test/test/ 99 | log_cli_level=ERROR 100 | cache_dir=.pytest_cache --------------------------------------------------------------------------------
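# A minimal usage sketch, not part of the repository: every test above relies
# on the same sklearndf contract, namely that wrapped estimators accept and
# return pandas objects rather than numpy arrays. Loading the iris data through
# scikit-learn's load_iris(as_frame=True) is an assumption made for
# illustration only.
import pandas as pd
from sklearn.datasets import load_iris

from sklearndf.classification import RandomForestClassifierDF

iris = load_iris(as_frame=True)
X: pd.DataFrame = iris.data
y: pd.Series = iris.target.rename("species")

clf = RandomForestClassifierDF(n_estimators=10, random_state=42)
clf.fit(X, y)

pred = clf.predict(X)  # a pd.Series sharing X's index, not an ndarray
proba = clf.predict_proba(X)  # a pd.DataFrame with one column per class label

assert isinstance(pred, pd.Series)
assert set(proba.columns) == set(y.unique())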