├── .github └── workflows │ ├── codeql-analysis.yml │ ├── linux.yml │ ├── macos.yml │ ├── pypi-test.yml │ ├── release.yml │ └── windows.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── CONTRIBUTORS ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── complete tutorial.ipynb │ ├── conf.py │ ├── images │ ├── scorecard.png │ └── stepwise.png │ ├── index.md │ ├── modules.md │ ├── reference.md │ ├── toad.detector.md │ ├── toad.md │ ├── toad.merge.md │ ├── toad.metrics.md │ ├── toad.nn.functional.md │ ├── toad.nn.md │ ├── toad.nn.module.md │ ├── toad.nn.trainer.md │ ├── toad.plot.md │ ├── toad.preprocessing.md │ ├── toad.preprocessing.partition.md │ ├── toad.preprocessing.process.md │ ├── toad.scorecard.md │ ├── toad.selection.md │ ├── toad.stats.md │ ├── toad.transform.md │ ├── toad.utils.decorator.md │ ├── toad.utils.func.md │ ├── toad.utils.md │ ├── toad.utils.mixin.md │ ├── tutorial.ipynb │ └── tutorial_chinese.ipynb ├── images └── toad_logo.png ├── pyproject.toml ├── requirements-dist.txt ├── requirements-nn.txt ├── requirements-test.txt ├── requirements-tools.txt ├── requirements.txt ├── scripts └── build_wheels.sh ├── setup.cfg ├── setup.py ├── tests └── test_data.csv └── toad ├── __init__.py ├── c_utils.pxd ├── c_utils.pyx ├── cli.py ├── cli_test.py ├── commands ├── __init__.py ├── detect │ └── __init__.py ├── evaluate │ ├── __init__.py │ └── evaluate.py └── tree │ ├── __init__.py │ └── tree.py ├── detector.py ├── impute.py ├── impute_test.py ├── merge.pyx ├── merge_test.py ├── metrics.py ├── metrics_test.py ├── nn ├── __init__.py ├── functional.py ├── functional_test.py ├── loss.py ├── loss_test.py ├── module.py ├── module_test.py ├── trainer │ ├── __init__.py │ ├── callback.py │ ├── callback_test.py │ ├── earlystop.py │ ├── earlystop_test.py │ ├── event.py │ ├── event_test.py │ ├── history.py │ ├── history_test.py │ ├── metrics.py │ ├── trainer.py │ └── trainer_test.py └── zoo │ ├── __init__.py │ ├── autoencoder.py │ └── autoencoder_test.py ├── plot.py ├── plot_test.py ├── preprocessing ├── __init__.py ├── partition.py ├── partition_test.py ├── process.py └── process_test.py ├── scorecard.py ├── scorecard_test.py ├── selection.py ├── selection_test.py ├── stats.py ├── stats_test.py ├── tadpole ├── __init__.py ├── base.py ├── fonts │ └── NotoSansCJKsc-Regular.otf ├── func.py └── utils.py ├── transform.py ├── transform_test.py ├── utils ├── __init__.py ├── decorator.py ├── decorator_test.py ├── func.py ├── func_test.py ├── mixin.py ├── mixin_test.py ├── pickletracer.py ├── pickletracer_test.py └── progress │ ├── __init__.py │ ├── pandas.py │ ├── pandas_test.py │ ├── progress.py │ └── progress_test.py └── version.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, dev] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 3 * * 4' 11 | 12 | jobs: 13 | analyse: 14 | name: Analyse 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 
23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v1 33 | # Override language selection by uncommenting this and choosing your languages 34 | # with: 35 | # languages: go, javascript, csharp, python, cpp, java 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v1 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v1 55 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Test on Linux 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12'] 10 | experimental: [false] 11 | include: 12 | - python-version: '3.13' 13 | experimental: true 14 | fail-fast: false 15 | runs-on: ubuntu-latest 16 | continue-on-error: ${{ matrix.experimental }} 17 | name: Test py ${{ matrix.python-version }} 18 | steps: 19 | - uses: actions/checkout@master 20 | - name: Setup Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - run: make build_deps 25 | - run: pip install -r requirements-nn.txt 26 | - run: pip install .[all] 27 | - run: make test 28 | release: 29 | needs: [test] 30 | # release when using `tags` or `release` branch 31 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@master 35 | - name: Setup Python 36 | uses: actions/setup-python@v5 37 | with: 38 | python-version: '3.10' 39 | architecture: x64 40 | - run: make dist 41 | - uses: RalfG/python-wheels-manylinux-build@v0.7.1 42 | with: 43 | build-requirements: 'cython numpy' 44 | - run: rm dist/*-linux_x86_64.whl 45 | - uses: pypa/gh-action-pypi-publish@release/v1 46 | name: publish pypi 47 | with: 48 | user: __token__ 49 | password: ${{ secrets.PYPI }} 50 | skip-existing: true 51 | verbose: true 52 | -------------------------------------------------------------------------------- /.github/workflows/macos.yml: -------------------------------------------------------------------------------- 1 | name: Test on MacOS 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 10 | macos-version: ['macos-13', 'macos-latest'] 11 | include: 12 | - experimental: false 13 | - macos-version: 'macos-latest' 14 | experimental: true 15 | - python-version: '3.9' 16 | experimental: true 17 | - python-version: '3.13' 18 | experimental: true 19 | fail-fast: false 20 | runs-on: ${{ matrix.macos-version }} 21 | continue-on-error: 
${{ matrix.experimental }} 22 | name: Test py ${{ matrix.python-version }} ${{ matrix.macos-version }} 23 | steps: 24 | - uses: actions/checkout@master 25 | - name: Setup Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - run: make build_deps 30 | - run: pip install -r requirements-nn.txt 31 | - run: pip install .[all] 32 | - run: make test 33 | - run: make dist_wheel 34 | - uses: actions/upload-artifact@v4 35 | with: 36 | name: wheel-${{ matrix.python-version }}-${{ matrix.macos-version }} 37 | path: dist/*.whl 38 | release: 39 | needs: [test] 40 | # release when using `tags` or `release` branch 41 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 42 | runs-on: ubuntu-latest 43 | steps: 44 | - uses: actions/download-artifact@v4 45 | with: 46 | pattern: wheel-* 47 | path: dist/ 48 | merge-multiple: true 49 | - uses: pypa/gh-action-pypi-publish@release/v1 50 | name: publish pypi 51 | with: 52 | user: __token__ 53 | password: ${{ secrets.PYPI }} 54 | -------------------------------------------------------------------------------- /.github/workflows/pypi-test.yml: -------------------------------------------------------------------------------- 1 | name: Pypi test 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'pypi/**' 7 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@master 14 | 15 | - name: Release 16 | uses: docker://antonyurchenko/git-release:latest 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Test on Windows 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12'] 10 | experimental: [false] 11 | include: 12 | - python-version: '3.13' 13 | experimental: true 14 | fail-fast: false 15 | runs-on: windows-latest 16 | continue-on-error: ${{ matrix.experimental }} 17 | name: Test py ${{ matrix.python-version }} 18 | steps: 19 | - uses: actions/checkout@master 20 | - name: Setup Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - run: make build_deps 25 | - run: pip install -r requirements-nn.txt 26 | - run: pip install .[all] 27 | - run: make test 28 | - run: make dist_wheel 29 | - uses: actions/upload-artifact@v4 30 | with: 31 | name: wheel-${{ matrix.python-version }} 32 | path: dist/*.whl 33 | release: 34 | needs: [test] 35 | # release when using `tags` or `release` branch 36 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/download-artifact@v4 40 | with: 41 | pattern: wheel-* 42 | path: dist/ 43 | merge-multiple: true 44 | - uses: pypa/gh-action-pypi-publish@release/v1 45 | name: publish pypi 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI }} 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
__pycache__/ 2 | build/ 3 | *.egg-info/ 4 | dist/ 5 | .tox/ 6 | .vscode/ 7 | .DS_Store 8 | .python-version 9 | *.csv 10 | *.xlsx 11 | *.c 12 | *.so 13 | *.pyc 14 | .idea/ 15 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | formats: all 12 | 13 | python: 14 | install: 15 | - requirements: requirements.txt 16 | - requirements: requirements-nn.txt 17 | - requirements: docs/requirements.txt 18 | - method: setuptools 19 | path: . 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | matrix: 2 | include: 3 | - name: "Python 3.6 on Linux" 4 | os: linux 5 | language: python 6 | python: "3.6" 7 | sudo: required 8 | services: 9 | - docker 10 | env: 11 | - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 12 | - PLAT=manylinux1_x86_64 13 | before_install: 14 | - sudo apt-get install -y graphviz 15 | dist: trusty 16 | before_deploy: 17 | - make dist_manylinux 18 | 19 | - name: "Python 3.7 on macOS" 20 | os: osx 21 | osx_image: xcode11.3 22 | language: shell 23 | env: 24 | - SUDO=sudo 25 | - HOMEBREW_NO_INSTALL_CLEANUP=TRUE 26 | before_install: 27 | - brew update 28 | # - brew install graphviz 29 | before_deploy: 30 | - make dist_wheel 31 | 32 | - name: "Python 3.7 on Windows" 33 | os: windows 34 | language: shell 35 | python: "3.7" 36 | env: 37 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH 38 | before_install: 39 | - choco install python --version=3.7.2 40 | - choco install graphviz 41 | - choco install make 42 | before_deploy: 43 | - make dist_wheel 44 | 45 | - name: "Python 3.6 on Windows" 46 | os: windows 47 | language: shell 48 | python: "3.6" 49 | env: 50 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH 51 | before_install: 52 | - choco install python --version=3.6.8 53 | - choco install graphviz 54 | - choco install make 55 | - pip install -U patsy 56 | before_deploy: 57 | - make dist_wheel 58 | 59 | 60 | install: 61 | - make install 62 | script: 63 | - make test 64 | 65 | deploy: 66 | - skip_cleanup: true 67 | provider: script 68 | script: make upload 69 | on: 70 | tags: true 71 | 72 | - skip_cleanup: true 73 | provider: script 74 | script: make upload 75 | on: 76 | branch: release 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.5] - 2025-02-18 11 | 12 | ### Added 13 | - Added `ax` support for `plot` 14 | - Added `Apple M1` support 15 | 16 | ## [0.1.4] - 2024-11-03 17 | 18 | ### Added 19 | - Added wheel package support for `py3.12` 20 | - Added `figsize` param in `toad.plot.bin_plot` function 21 | 22 | ### Changed 23 | - Updated `pandas` version to `>=1.5` 24 | - Python `3.7` and `3.8` are no longer supported 25 | 26 | ## [0.1.3] - 2023-12-10 27 | 28 | ### Added 29 | - Added `performance` in `toad.utils` for testing code performance 30 | - Added `pickletracer` in `toad.utils` for inferring requirements from pickle objects 31 | 32 | ### Fixed 33 | - Fixed `ValueError` in `select` and `drop_corr` methods when using `pandas >= 2.0.x` 34 | 35 | ## [0.1.2] - 2023-04-09 36 | 37 | ### Added 38 | - Added `ks_plot` for KS plot, [#102](https://github.com/amphibian-dev/toad/issues/102) thanks @kevin-meng 39 | - Added `xgb_loss` decorator for converting a normal loss function into an xgb-supported loss function 40 | - Added `binary_focal_loss` function in `nn.functional` 41 | - Added `event` module in `nn.trainer`, and changed `trainer` mode to event-based 42 | - Added wheel package support for `py3.9`, `py3.10` and `py3.11` 43 | 44 | ### Changed 45 | - Now you can pass arguments to `DecisionTreeClassifier` in `merge` or `Combiner` when using `method = dt` 46 | 47 | ### Fixed 48 | - Fixed `groupby` being rewritten in `preprocessing` 49 | - Fixed the expired deprecations of numpy types in `1.24.0` 50 | 51 | ## [0.1.1] - 2022-08-14 52 | 53 | ### Added 54 | - Added `Progress` for `pandas.apply` by using `pandas_enable` and `pandas_disable` 55 | - Added `feature_bin_stats` for feature bins, [#91](https://github.com/amphibian-dev/toad/issues/91) thanks @kevin-meng 56 | 57 | ### Changed 58 | - `countBlank` can use customized missing values, [#101](https://github.com/amphibian-dev/toad/issues/101) thanks @kevin-meng 59 | - Removed ref of `merge` in `__init__` file 60 | 61 | 62 | 63 | ## [0.1.0] - 2021-10-08 64 | 65 | ### Added 66 | 67 | - Added `backward_rounds` for `nn.Trainer.train` 68 | - Added `evalute` func in `nn.Module` 69 | - Added `get_reason` func in `ScoreCard`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo 70 | - Added dict type input support for `ScoreCard.predict` and `Combiner.transform`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo 71 | - Added iterator support for `Progress` 72 | 73 | ### Changed 74 | 75 | - Changed `callback` and `earlystopping` to Python decorators 76 | 77 | 78 | ## [0.0.65] - 2021-06-30 79 | 80 | ### Breaking Changes 81 | 82 | - Added a new `lift` value and renamed the old `lift` value to `cum_lift` in `KS_bucket` 83 | - Moved `nn.autoencoder` to `nn.zoo.autoencoder` 84 | 85 | ### Added 86 | 87 | - Added `label_smoothing` and `focal_loss` functions in the `nn` module 88 | - Added some features in `nn.trainer` 89 | - Added default `early_stopping` for `nn.Trainer` 90 | 91 | ### Changed 92 | 93 | - Updated `numpy` version to `>=1.20` 94 | - Python `3.6` is no longer supported 95 | 96 | ### Fixed 97 | 98 | - Fixed combiner error after `ScoreCard` reload.
[#67](https://github.com/amphibian-dev/toad/issues/67) 99 | 100 | 101 | ## [0.0.64] - 2021-03-22 102 | 103 | ### Added 104 | 105 | - Added `callback` param in `fit` method for `nn` 106 | - Added `Trainer` and `EarlyStopping` in `nn.trainer` module 107 | 108 | ### Changed 109 | 110 | - Use mean of loss in `nn.Module.fit` instead of the latest loss value 111 | - Set default rotation for x tick labels 112 | 113 | ### Fixed 114 | 115 | - Fixed dependency version of `numpy` 116 | - Fixed `DistModule` module 117 | - Fixed `ScoreCard` representation error 118 | 119 | ## [0.0.62] - 2021-02-19 120 | 121 | ### Added 122 | 123 | - Added `save` and `load` methods for the nn module 124 | - Added `lift` value in `KS_bucket` function 125 | - Added duplicate key checking in `Transformer` 126 | 127 | ### Changed 128 | 129 | - `quality` method supports `indicators` 130 | 131 | ### Fixed 132 | 133 | - Fixed tadpole legend warning. [#52](https://github.com/amphibian-dev/toad/issues/52) 134 | - Fixed tadpole `title` and `x/y label` display for `UTF8` 135 | - Fixed default rule in RuleMixin. 136 | - Fixed loss function of VAE model. 137 | - Fixed `decimal` argument in `ScoreCard.export` function 138 | 139 | ### Enhancements 140 | 141 | - Reduced memory usage when using the `select` function 142 | 143 | ## [0.0.61] - 2020-06-24 144 | 145 | ### Added 146 | 147 | - Support for calculating IV for each group in a feature. [#25](https://github.com/amphibian-dev/toad/issues/25) 148 | - Added `cpu_cores` for `quality` function 149 | - Added `predict_proba` for `ScoreCard` 150 | - Impute module 151 | - NN module 152 | 153 | ### Changed 154 | 155 | - The y axis of `badrate_plot` now starts at `0`. [#23](https://github.com/amphibian-dev/toad/issues/23) 156 | - `KS` is implemented using `ks2samp` instead 157 | 158 | ### Fixed 159 | 160 | - Fixed `Preprocess` bugs 161 | 162 | ### Docs 163 | 164 | - Added references for `Chi-Merge`, `Stepwise Regression`, `Scorecard Transformation` 165 | 166 | ## [0.0.60] - 2020-04-20 167 | 168 | ### Added 169 | 170 | - Preprocess module. 171 | - Annotation format for bin plot. 172 | - `KS_bucket` supports split points as buckets. [#22](https://github.com/amphibian-dev/toad/issues/22) 173 | 174 | ### Changed 175 | 176 | - `format_bins` supports ellipsis. 177 | - Reversed cumulative columns in `KS_bucket` 178 | - Use correct score order for AUC and ROC plots. [#21](https://github.com/amphibian-dev/toad/issues/21) 179 | 180 | ### Fixed 181 | 182 | - Fixed number type of x axis of badrate plot. [#20](https://github.com/amphibian-dev/toad/issues/20) 183 | - Fixed negative ks value in `KS_bucket`. 184 | 185 | ## [0.0.59] - 2020-02-07 186 | 187 | ### Added 188 | 189 | - Combiner supports separating empty values. 190 | - Confusion matrix function in metrics. 191 | - Support Python 3.8. 192 | 193 | ### Changed 194 | 195 | - Transform supports y as string type. 196 | - VIF is computed independently of statsmodels.
197 | 198 | [Unreleased]: https://github.com/amphibian-dev/toad/compare/0.1.5...HEAD 199 | [0.1.5]: https://github.com/amphibian-dev/toad/compare/0.1.4...0.1.5 200 | [0.1.4]: https://github.com/amphibian-dev/toad/compare/0.1.3...0.1.4 201 | [0.1.3]: https://github.com/amphibian-dev/toad/compare/0.1.2...0.1.3 202 | [0.1.2]: https://github.com/amphibian-dev/toad/compare/0.1.1...0.1.2 203 | [0.1.1]: https://github.com/amphibian-dev/toad/compare/0.1.0...0.1.1 204 | [0.1.0]: https://github.com/amphibian-dev/toad/compare/0.0.65...0.1.0 205 | [0.0.65]: https://github.com/amphibian-dev/toad/compare/0.0.64...0.0.65 206 | [0.0.64]: https://github.com/amphibian-dev/toad/compare/0.0.62...0.0.64 207 | [0.0.62]: https://github.com/amphibian-dev/toad/compare/0.0.61...0.0.62 208 | [0.0.61]: https://github.com/amphibian-dev/toad/compare/0.0.60...0.0.61 209 | [0.0.60]: https://github.com/amphibian-dev/toad/compare/0.0.59...0.0.60 210 | [0.0.59]: https://github.com/amphibian-dev/toad/compare/0.0.58...0.0.59 211 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to Toad contributing guide 2 | 3 | We're so glad you're thinking about contributing to the toad project. If you're unsure about anything, just ask @Secbone, or submit an issue or pull request anyway. The worst that can happen is you'll be politely asked to change something. We love all friendly contributions. 4 | 5 | 我们非常开心你乐意为 toad 项目贡献代码。如果你有任何疑问,可以联系 @Secbone 或者提交 issue 和 pull request 都可以。最糟不过是被礼貌地要求你修改一些东西。我们非常愿意看到所有善意的问题。 6 | 7 | ## Getting Started · 开始吧 8 | 9 | ### Setup Environment · 设置环境 10 | 11 | Setting up the environment is very simple; you just need to run the following command 12 | 13 | 设置环境非常简单,你只需要执行以下代码 14 | 15 | ```bash 16 | make install 17 | ``` 18 | 19 | All done! Now you can enjoy your coding~ 20 | 21 | 完成!开始享受你的编码吧~ 22 | 23 | ### About Cython · 关于 Cython 24 | 25 | The `toad.merge` module is compiled with `cython`, so if you want to change something in `toad.merge`, you need to run 26 | 27 | `toad.merge` 模块是使用 `cython` 编译的,所以如果你想要对 `toad.merge` 模块进行改动时,你需要运行 28 | 29 | ```bash 30 | make build 31 | ``` 32 | after you update the code. 33 | 34 | 之后来使你的代码生效。 35 | 36 | ### Testing · 测试 37 | 38 | You can run 39 | 40 | 你可以执行 41 | 42 | ```bash 43 | make test 44 | ``` 45 | 46 | to test the whole package. We recommend that you do this before every commit to avoid new code impacting old functionality. 47 | 48 | 来测试整个包的代码。我们建议你在每次提交前这么做,以防止新代码对老的功能产生影响。 49 | 50 | You can also run 51 | 52 | 你也可以运行 53 | 54 | ```bash 55 | make test toad/xxxx_test.py 56 | ``` 57 | 58 | to test only a single module. 59 | 60 | 来只测试某一个模块。 61 | 62 | ### Pull Request 63 | 64 | When you're finished with the changes, create a pull request and wait for it to be merged.
65 | 66 | 当你完成所有的改动后,就可以创建一个 pull request 并且等它被合并啦~ 67 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Lei Cui 2 | Secbone 3 | Shaoqian Dong 4 | Xiyu Zhou 5 | Yanping He 6 | Yutong Jiang 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ESC Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include requirements-*.txt 4 | include setup.py 5 | include toad/*.pyd 6 | include toad/*.pyx 7 | include toad/tadpole/fonts/* 8 | 9 | include CONTRIBUTORS 10 | include LICENSE 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build test 2 | 3 | SHELL = /bin/bash 4 | 5 | PYTHON = python3 6 | PIP = pip3 7 | SUDO ?= 8 | 9 | SPHINXOPTS = 10 | SPHINXBUILD = sphinx-build 11 | SPHINXPROJ = toad 12 | DOCSDIR = docs 13 | SOURCEDIR := $(DOCSDIR)/source 14 | BUILDDIR := $(DOCSDIR)/build 15 | 16 | 17 | ifeq ('$(shell type -P python3)','') 18 | PYTHON = python 19 | endif 20 | 21 | ifeq ('$(shell type -P pip3)','') 22 | PIP = pip 23 | endif 24 | 25 | 26 | install: build 27 | $(SUDO) $(PIP) install -e . 
28 | 29 | uninstall: 30 | cat files.txt | xargs rm -rf 31 | 32 | test_deps: 33 | $(SUDO) $(PIP) install -r requirements-test.txt 34 | 35 | test: test_deps 36 | $(eval TARGET := $(filter-out $@, $(MAKECMDGOALS))) 37 | @if [ -z $(TARGET) ]; then \ 38 | $(PYTHON) -m pytest -x toad; \ 39 | else \ 40 | $(PYTHON) -m pytest -s $(TARGET); \ 41 | fi 42 | 43 | build_deps: 44 | $(SUDO) $(PIP) install -r requirements.txt 45 | 46 | build: build_deps 47 | $(PYTHON) setup.py build_ext --inplace 48 | 49 | dist_deps: 50 | $(SUDO) $(PIP) install -U -r requirements-dist.txt 51 | 52 | dist: build dist_deps 53 | $(SUDO) $(PYTHON) setup.py sdist 54 | 55 | dist_wheel: build dist_deps 56 | $(SUDO) $(PYTHON) setup.py bdist_wheel --universal 57 | 58 | upload: 59 | twine check dist/* 60 | @twine upload dist/* -u $(TWINE_USER) -p $(TWINE_PASS) 61 | 62 | clean: 63 | @rm -rf build/ dist/ *.egg-info/ **/__pycache__/ 64 | @rm -rf toad/*.c toad/*.so 65 | 66 | docs: build 67 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 68 | 69 | %: 70 | @: 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | # TOAD 6 | 7 | 8 | [![PyPi version][pypi-image]][pypi-url] 9 | [![Python version][python-image]][docs-url] 10 | [![Build Status][actions-image]][actions-url] 11 | [![Downloads Status][downloads-image]][docs-url] 12 | 13 | 14 | Toad is dedicated to facilitating the model development process, especially for scorecards. It provides intuitive functions covering the entire workflow, from EDA, feature engineering, and feature selection to result validation and scorecard transformation. Its key functionality streamlines the most critical and time-consuming steps, such as feature selection and fine binning. 15 | 16 | Toad 是专为工业界模型开发设计的Python工具包,特别针对评分卡的开发。Toad 的功能覆盖了建模全流程,从 EDA、特征工程、特征筛选 到 模型验证和评分卡转化。Toad 的主要功能极大简化了建模中最重要最费时的流程,即特征筛选和分箱。 17 | 18 | ## Install and Upgrade · 安装与升级 19 | 20 | Pip 21 | 22 | ```bash 23 | pip install toad # to install 24 | pip install -U toad # to upgrade 25 | ``` 26 | 27 | Conda 28 | 29 | ```bash 30 | conda install toad --channel conda-forge # to install 31 | conda update toad --channel conda-forge # to upgrade 32 | ``` 33 | 34 | Source code 35 | 36 | ```bash 37 | python setup.py install 38 | ``` 39 | 40 | ## Key features · 主要功能 41 | 42 | The following showcases some of toad's most popular features; for more detailed demonstrations and user guidance, please refer to the tutorials. 43 | 44 | 以下部分简单介绍了toad最受欢迎的一些功能,具体的使用方法和使用教程,请详见文档部分。 45 | 46 | - Simple IV calculation for all features · 一键算IV: 47 | 48 | ```python 49 | toad.quality(data, 'target', indicators = ['iv']) 50 | ``` 51 | 52 | - Preliminary selection based on criteria · 根据特定条件的初步变量筛选; 53 | - and stepwise feature selection (with optimised algorithm) · 优化过的逐步回归: 54 | 55 | ```python 56 | selected_data = toad.selection.select(data, target = 'target', empty = 0.5, iv = 0.02, corr = 0.7, return_drop=True, exclude=['ID','month']) 57 | 58 | final_data = toad.selection.stepwise(data_woe, target = 'target', estimator='ols', direction = 'both', criterion = 'aic', exclude = to_drop) 59 | ``` 60 | 61 | - Reliable fine binning with visualisation · 分箱及可视化: 62 | 63 | ```python 64 | # Chi-squared fine binning 65 | c = toad.transform.Combiner() 66 | c.fit(data_selected.drop(to_drop, axis=1), y = 'target', method = 'chi', min_samples = 0.05) 67 | print(c.export()) 68 | 69 | # Visualisation to check binning results 70 | col = 'feature_name' 71 | toad.plot.bin_plot(c.transform(data_selected[[col, 'target']], labels=True), x=col, target='target') 72 | ``` 73 | 74 | - Intuitive model results presentation · 模型结果展示: 75 | 76 | ```python 77 | toad.metrics.KS_bucket(pred_proba, final_data['target'], bucket=10, method = 'quantile') 78 | ``` 79 | 80 | - One-click scorecard transformation · 评分卡转化: 81 | 82 | ```python 83 | card = toad.ScoreCard( 84 | combiner = c, 85 | transer = transer, 86 | class_weight = 'balanced', 87 | C=0.1, 88 | base_score = 600, 89 | base_odds = 35, 90 | pdo = 60, 91 | rate = 2 92 | ) 93 | 94 | card.fit(final_data[col], final_data['target']) 95 | print(card.export()) 96 | ``` 97 | 98 | ## Documents · 文档 99 | 100 | - [Tutorial](https://toad.readthedocs.io/en/latest/tutorial.html) 101 | 102 | - [中文指引](https://toad.readthedocs.io/en/latest/tutorial_chinese.html) 103 | 104 | - [docs][docs-url] 105 | 106 | - [Contributing](CONTRIBUTING.md) 107 | 108 | ## Community · 社区 109 | We welcome public feedback and new PRs. We hold a WeChat group for questions and suggestions.
110 | 111 | 欢迎各位提PR,同时我们有toad使用交流的微信群,欢迎询问加群。 112 | 113 | ## Contributors 114 | 115 | [![Contributors][contributor-image]][contributor-url] 116 | 117 | ------------ 118 | 119 | ## Dedicated by **The ESC Team** 120 | 121 | [pypi-image]: https://img.shields.io/pypi/v/toad?style=flat-square 122 | [pypi-url]: https://pypi.org/project/toad/ 123 | [python-image]: https://img.shields.io/pypi/pyversions/toad?style=flat-square 124 | [actions-image]: https://img.shields.io/github/actions/workflow/status/amphibian-dev/toad/release.yml?style=flat-square 125 | [actions-url]: https://github.com/amphibian-dev/toad/actions 126 | [downloads-image]: https://img.shields.io/pypi/dm/toad?style=flat-square 127 | [docs-url]: https://toad.readthedocs.io/ 128 | [contributor-image]: https://contrib.rocks/image?repo=amphibian-dev/toad 129 | [contributor-url]: https://github.com/amphibian-dev/toad/graphs/contributors 130 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | docutils==0.16 2 | recommonmark 3 | sphinx-readable-theme 4 | ipykernel 5 | nbsphinx 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import os 14 | import sys 15 | import inspect 16 | 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'toad' 23 | copyright = '2020, ESC Team' 24 | author = 'ESC Team' 25 | 26 | 27 | import toad 28 | version = toad.VERSION 29 | # The full version, including alpha/beta/rc tags 30 | release = version 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | import recommonmark 35 | import sphinx_readable_theme 36 | from recommonmark.transform import AutoStructify 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.doctest', 43 | 'sphinx.ext.intersphinx', 44 | 'sphinx.ext.todo', 45 | 'sphinx.ext.autodoc', 46 | "sphinx.ext.autosummary", 47 | 'sphinx.ext.linkcode', 48 | 'sphinx.ext.napoleon', 49 | 'nbsphinx', 50 | 'recommonmark', 51 | 'sphinx_readable_theme', 52 | ] 53 | 54 | 55 | 56 | autodoc_member_order = 'bysource' 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ['_templates'] 60 | 61 | # List of patterns, relative to source directory, that match files and 62 | # directories to ignore when looking for source files. 63 | # This pattern also affects html_static_path and html_extra_path. 64 | exclude_patterns = [ 65 | 'toad/commands', 66 | '_build', 67 | '**.ipynb_checkpoints', 68 | ] 69 | 70 | master_doc = 'index' 71 | 72 | 73 | def linkcode_resolve(domain, info): 74 | """linkcode extension config function 75 | """ 76 | if domain != "py": 77 | return None 78 | 79 | modname = info["module"] 80 | fullname = info["fullname"] 81 | 82 | submod = sys.modules.get(modname) 83 | if submod is None: 84 | return None 85 | 86 | obj = submod 87 | for part in fullname.split("."): 88 | try: 89 | obj = getattr(obj, part) 90 | except AttributeError: 91 | return None 92 | 93 | try: 94 | # inspect.unwrap() was added in Python version 3.4 95 | if sys.version_info >= (3, 5): 96 | fn = inspect.getsourcefile(inspect.unwrap(obj)) 97 | else: 98 | fn = inspect.getsourcefile(obj) 99 | except TypeError: 100 | fn = None 101 | if not fn: 102 | return None 103 | 104 | try: 105 | source, lineno = inspect.getsourcelines(obj) 106 | except OSError: 107 | lineno = None 108 | 109 | if lineno: 110 | linespec = "#L{:d}-L{:d}".format(lineno, lineno + len(source) - 1) 111 | else: 112 | linespec = "" 113 | 114 | fn = os.path.relpath(fn, start = os.path.dirname(toad.__file__)) 115 | 116 | return "http://github.com/amphibian-dev/toad/blob/master/toad/{}{}".format( 117 | fn, linespec 118 | ) 119 | 120 | 121 | # -- Options for HTML output ------------------------------------------------- 122 | 123 | # The theme to use for HTML and HTML Help pages. See the documentation for 124 | # a list of builtin themes. 
125 | # 126 | html_theme_path = [sphinx_readable_theme.get_html_theme_path()] 127 | html_theme = 'readable' 128 | 129 | # Add any paths that contain custom static files (such as style sheets) here, 130 | # relative to this directory. They are copied after the builtin static files, 131 | # so a file named "default.css" will overwrite the builtin "default.css". 132 | html_static_path = ['_static'] 133 | 134 | 135 | 136 | def setup(app): 137 | app.add_config_value( 138 | 'recommonmark_config', 139 | { 140 | 'enable_eval_rst': True, 141 | 'enable_auto_toc_tree': True, 142 | 'auto_toc_tree_section': 'Contents', 143 | }, 144 | True, 145 | ) 146 | 147 | app.add_transform(AutoStructify) 148 | -------------------------------------------------------------------------------- /docs/source/images/scorecard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/scorecard.png -------------------------------------------------------------------------------- /docs/source/images/stepwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/stepwise.png -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to toad's documentation! 2 | 3 | 4 | ## Installation 5 | 6 | via pip 7 | 8 | ```bash 9 | pip install toad 10 | ``` 11 | 12 | via anaconda 13 | ```bash 14 | conda install toad --channel conda-forge 15 | ``` 16 | 17 | via source code 18 | ```bash 19 | python setup.py install 20 | ``` 21 | 22 | ## Tutorial 23 | 24 | A [basic tutorial](tutorial) is provided. 25 | 26 | [中文指引](tutorial_chinese) 27 | 28 | ## Contents 29 | 30 | ```eval_rst 31 | .. toctree:: 32 | :maxdepth: 1 33 | 34 | toad 35 | ``` 36 | 37 | 38 | ## Indices and tables 39 | 40 | 41 | ```eval_rst 42 | * :ref:`genindex` 43 | * :ref:`modindex` 44 | * :ref:`search` 45 | ``` 46 | 47 | 48 | ## Links 49 | 50 | [FiboRule](http://open.fibo.cn/) 51 | -------------------------------------------------------------------------------- /docs/source/modules.md: -------------------------------------------------------------------------------- 1 | ## toad 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | :maxdepth: 4 7 | 8 | toad 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/reference.md: -------------------------------------------------------------------------------- 1 | # ChiMerge 2 | 3 | [https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf](https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf) 4 | 5 | The ChiMerge algorithm uses the Chi-squared statistic to discretize numeric attributes. In toad, Char/Object attributes are first transformed to numeric values with the WOE function. The algorithm itself is clearly described in the paper (i.e. the ChiMerge Algorithm part).
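
The snippet below is a minimal sketch of how ChiMerge-style binning is typically invoked through toad's `Combiner` (the DataFrame here is toy data made up purely for illustration; the fit arguments mirror the README usage):

```python
import pandas as pd
import toad

# toy data: one numeric feature and a binary target
df = pd.DataFrame({
    'A': list(range(1, 11)) * 10,
    'target': ([0] * 7 + [1] * 3) * 10,
})

c = toad.transform.Combiner()
c.fit(df, y = 'target', method = 'chi', min_samples = 0.05)  # chi-square (ChiMerge) binning

print(c.export())         # split points found for each feature
binned = c.transform(df)  # map raw values into bin indices
```
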
6 | 7 | # Stepwise Regression 8 | 9 | [https://link.springer.com/article/10.1007%2FBF02576123](https://link.springer.com/article/10.1007%2FBF02576123) [1] 10 | 11 | [https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub) [2] 12 | 13 | [http://www.jstor.org/stable/1434071](http://www.jstor.org/stable/1434071) [3] 14 | 15 | Stepwise Regression (Forward/Backward/Stepwise, i.e. [2] 3.6. Stepwise Linear Regression) is used to remove low-information-gain attributes and simplify the final model. 16 | 17 | The Stepwise Regression process [2]: 18 | 19 | ```eval_rst 20 | .. image:: images/stepwise.png 21 | :width: 80% 22 | :align: center 23 | ``` 24 | 25 | # Scorecard Transformation 26 | 27 | John Wiley & Sons, Inc., *Credit Risk Scorecards: Developing and Implementing Intelligent Credit Scoring* (Final Scorecard Production Part) 28 | 29 | 30 | 31 | Formula: 32 | 33 | Score = Offset + Factor * ln(odds) # odds = good : bad 34 | 35 | Score + pdo = Offset + Factor * ln(2 * odds) # pdo: points to double the odds 36 | 37 | ==> 38 | 39 | pdo = Factor * ln(2) 40 | 41 | Factor = pdo / ln(2) 42 | 43 | Offset = Score - Factor * ln(odds) 44 | 45 | For example, if a scorecard were being scaled where the user wanted odds of 50:1 at 600 points and wanted the odds to double every 20 points (i.e., pdo = 20), the factor and offset would be: 46 | 47 | Factor = 20 / ln(2) = 28.8539 48 | 49 | Offset = 600 – 28.8539 * ln(50) = 487.123 50 | 51 | ==> 52 | 53 | Each score then corresponds to a set of odds: 54 | 55 | Score = 487.123 + 28.8539 * ln(odds) 56 | 57 | Since the scorecard is developed with WOE as input, the formula can be rewritten as: 58 | 59 | ```eval_rst 60 | .. image:: images/scorecard.png 61 | :width: 80% 62 | :align: center 63 | ``` 64 | 65 | WOE = weight of evidence for each grouped attribute 66 | 67 | β = regression coefficient for each characteristic 68 | 69 | a = intercept term from logistic regression 70 | 71 | n = number of characteristics 72 | 73 | k = number of groups (of attributes) in each characteristic 74 | -------------------------------------------------------------------------------- /docs/source/toad.detector.md: -------------------------------------------------------------------------------- 1 | ## toad.detector module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.detector 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.md: -------------------------------------------------------------------------------- 1 | ## toad package 2 | 3 | 4 | 5 | ## Submodules 6 | 7 | 8 | ```eval_rst 9 | .. toctree:: 10 | 11 | toad.detector 12 | toad.merge 13 | toad.metrics 14 | toad.plot 15 | toad.scorecard 16 | toad.selection 17 | toad.stats 18 | toad.transform 19 | toad.preprocessing 20 | toad.nn 21 | toad.utils 22 | ``` 23 | 24 | ## Module contents 25 | 26 | ```eval_rst 27 | .. automodule:: toad 28 | :members: 29 | :special-members: __init__ 30 | :show-inheritance: 31 | ``` 32 | -------------------------------------------------------------------------------- /docs/source/toad.merge.md: -------------------------------------------------------------------------------- 1 | ## toad.merge module 2 | 3 | 4 | ```eval_rst 5 | .. 
automodule:: toad.merge 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.metrics.md: -------------------------------------------------------------------------------- 1 | ## toad.metrics module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.metrics 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.nn.functional.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.functional module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.functional 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.nn.md: -------------------------------------------------------------------------------- 1 | ## toad.nn module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.nn.module 7 | toad.nn.functional 8 | toad.nn.trainer 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.nn.module.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.module module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.module 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.nn.trainer.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.trainer module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.trainer 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.plot.md: -------------------------------------------------------------------------------- 1 | ## toad.plot module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.plot 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.preprocessing.process 7 | toad.preprocessing.partition 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.partition.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing.partition module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.preprocessing.partition 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.process.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing.process module 2 | 3 | ```eval_rst 4 | .. 
automodule:: toad.preprocessing.process 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.scorecard.md: -------------------------------------------------------------------------------- 1 | ## toad.scorecard module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.scorecard 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.selection.md: -------------------------------------------------------------------------------- 1 | ## toad.selection module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.selection 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.stats.md: -------------------------------------------------------------------------------- 1 | ## toad.stats module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.stats 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.transform.md: -------------------------------------------------------------------------------- 1 | ## toad.transform module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.transform 6 | :members: 7 | :special-members: __init__ 8 | :inherited-members: 9 | :show-inheritance: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/source/toad.utils.decorator.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.decorator module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.utils.decorator 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.utils.func.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.func module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.utils.func 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.utils.md: -------------------------------------------------------------------------------- 1 | ## toad.utils module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.utils.func 7 | toad.utils.decorator 8 | toad.utils.mixin 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.utils.mixin.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.mixin module 2 | 3 | ```eval_rst 4 | .. 
automodule:: toad.utils.mixin 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /images/toad_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/images/toad_logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "toad" 3 | description = "Toad is dedicated to facilitating model development process, especially for a scorecard." 4 | requires-python = ">=3.9" 5 | license = {file = "LICENSE"} 6 | authors = [{name = "Secbone", email = "secbone@gmail.com"}] 7 | classifiers = [ 8 | 'Operating System :: POSIX', 9 | 'Operating System :: Microsoft :: Windows', 10 | 'Operating System :: MacOS :: MacOS X', 11 | 'Programming Language :: Python :: 3.9', 12 | 'Programming Language :: Python :: 3.10', 13 | 'Programming Language :: Python :: 3.11', 14 | 'Programming Language :: Python :: 3.12', 15 | ] 16 | dynamic = [ 17 | "version", 18 | "readme", 19 | "dependencies", 20 | "optional-dependencies", 21 | "entry-points", 22 | ] 23 | 24 | [tool.setuptools.dynamic] 25 | readme = {file = ["README.md"], content-type = "text/markdown"} 26 | dependencies = {file = ["requirements.txt"]} 27 | optional-dependencies = {nn = {file = ["requirements-nn.txt"]}, tools = {file = ["requirements-tools.txt"]}, all = {file = ["requirements-nn.txt", "requirements-tools.txt"]} } 28 | 29 | [build-system] 30 | requires = [ 31 | "setuptools", 32 | "Cython >= 0.29.15", 33 | "numpy >= 1.20", 34 | "wheel", 35 | "twine", 36 | ] 37 | build-backend = "setuptools.build_meta" 38 | 39 | [console_scripts] 40 | toad = "toad.cli:main" 41 | 42 | [tool.setuptools.packages.find] 43 | exclude = ["tests"] 44 | 45 | [project.urls] 46 | Homepage = "https://github.com/amphibian-dev/toad" 47 | Documentation = "https://toad.readthedocs.io/en/stable/" 48 | Repository = "https://github.com/amphibian-dev/toad.git" 49 | Issues = "https://github.com/amphibian-dev/toad/issues" 50 | Changelog = "https://github.com/amphibian-dev/toad/blob/master/CHANGELOG.md" 51 | -------------------------------------------------------------------------------- /requirements-dist.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | twine 3 | -------------------------------------------------------------------------------- /requirements-nn.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.8.1 2 | torchvision >= 0.9.1 3 | numpy < 2.0 ; sys_platform == "darwin" 4 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-timeout 3 | -------------------------------------------------------------------------------- /requirements-tools.txt: -------------------------------------------------------------------------------- 1 | cloudpickle 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython <= 0.29.15 ; python_version < "3.10" 2 | Cython >= 0.29.15 ; python_version >= "3.10" 3 | numpy <= 1.24 ; python_version < 
"3.10" 4 | numpy >= 1.24 ; python_version >= "3.10" 5 | pandas >= 1.5 6 | scipy 7 | joblib >= 0.12 8 | scikit-learn >= 0.21 9 | seaborn >= 0.10.0 10 | setuptools 11 | -------------------------------------------------------------------------------- /scripts/build_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | 5 | # Compile wheels 6 | for PYBIN in /opt/python/cp3[5678]*/bin; do 7 | "${PYBIN}/pip" install -r /io/dev-requirements.txt 8 | "${PYBIN}/pip" wheel --no-deps /io/ -w /dist/ 9 | done 10 | 11 | # Bundle external shared libraries into the wheels 12 | for whl in /dist/toad*.whl; do 13 | auditwheel repair "$whl" --plat $PLAT -w /io/dist/ 14 | done -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_ext] 2 | inplace = 1 3 | 4 | [bdist_wheel] 5 | universal=1 6 | 7 | [aliases] 8 | test=pytest 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from setuptools import setup, find_packages, Extension 4 | 5 | 6 | NAME = 'toad' 7 | 8 | 9 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__)) 10 | VERSION_FILE = os.path.join(CURRENT_PATH, NAME, 'version.py') 11 | 12 | def get_version(): 13 | ns = {} 14 | with open(VERSION_FILE) as f: 15 | exec(f.read(), ns) 16 | return ns['__version__'] 17 | 18 | 19 | def get_ext_modules(): 20 | from Cython.Build import cythonize 21 | 22 | extensions = [ 23 | Extension('toad.c_utils', sources = ['toad/c_utils.pyx'], include_dirs = [np.get_include()]), 24 | Extension('toad.merge', sources = ['toad/merge.pyx'], include_dirs = [np.get_include()]), 25 | ] 26 | 27 | return cythonize(extensions) 28 | 29 | 30 | def get_requirements(stage = None): 31 | file_name = 'requirements' 32 | 33 | if stage is not None: 34 | file_name = f"{file_name}-{stage}" 35 | 36 | requirements = [] 37 | with open(f"{file_name}.txt", 'r') as f: 38 | for line in f: 39 | line = line.strip() 40 | if not line or line.startswith('-'): 41 | continue 42 | 43 | requirements.append(line) 44 | 45 | return requirements 46 | 47 | 48 | setup( 49 | name = NAME, 50 | version = get_version(), 51 | description = 'Toad is dedicated to facilitating model development process, especially for a scorecard.', 52 | long_description = open('README.md', encoding = 'utf-8').read(), 53 | long_description_content_type = 'text/markdown', 54 | url = 'https://github.com/amphibian-dev/toad', 55 | author = 'ESC Team', 56 | author_email = 'secbone@gmail.com', 57 | packages = find_packages(exclude = ['tests']), 58 | include_dirs = [np.get_include()], 59 | ext_modules = get_ext_modules(), 60 | include_package_data = True, 61 | python_requires = '>=3.7', 62 | setup_requires = ['numpy'], 63 | tests_require = get_requirements('test'), 64 | license = 'MIT', 65 | classifiers = [ 66 | 'Operating System :: POSIX', 67 | 'Operating System :: Microsoft :: Windows', 68 | 'Operating System :: MacOS :: MacOS X', 69 | 'Programming Language :: Python :: 3.8', 70 | 'Programming Language :: Python :: 3.9', 71 | 'Programming Language :: Python :: 3.10', 72 | 'Programming Language :: Python :: 3.11', 73 | 'Programming Language :: Python :: 3.12', 74 | ], 75 | entry_points = { 76 | 'console_scripts': [ 77 | 'toad = toad.cli:main', 78 | ], 79 | }, 80 | ) 81 | 
-------------------------------------------------------------------------------- /toad/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .merge import merge, DTMerge, ChiMerge, StepMerge, QuantileMerge, KMeansMerge 3 | except ImportError: 4 | import warnings 5 | warnings.warn( 6 | """`merge` module needs to be built""", 7 | ImportWarning, 8 | ) 9 | 10 | from .detector import detect 11 | from .metrics import KS, KS_bucket, F1 12 | from .stats import quality, IV, VIF, WOE, entropy, entropy_cond, gini, gini_cond 13 | from .transform import Combiner, WOETransformer 14 | from .selection import select 15 | from .scorecard import ScoreCard 16 | from .utils import Progress, performance 17 | from .version import __version__ 18 | 19 | VERSION = __version__ 20 | -------------------------------------------------------------------------------- /toad/c_utils.pxd: -------------------------------------------------------------------------------- 1 | ctypedef fused number: 2 | int 3 | double 4 | long 5 | 6 | 7 | cdef number c_min(number[:] arr) 8 | 9 | cdef number c_sum(number[:,:] arr) 10 | 11 | cdef number[:] c_sum_axis_0(number[:,:] arr) 12 | 13 | cdef number[:] c_sum_axis_1(number[:,:] arr) 14 | -------------------------------------------------------------------------------- /toad/c_utils.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level = 3, infer_types = True, boundscheck = False 2 | 3 | import numpy as np 4 | cimport numpy as np 5 | cimport cython 6 | 7 | 8 | 9 | cdef number c_min(number[:] arr): 10 | cdef number res = np.inf 11 | 12 | for i in range(arr.shape[0]): 13 | if res > arr[i]: 14 | res = arr[i] 15 | return res 16 | 17 | 18 | cdef number c_sum(number[:,:] arr): 19 | cdef number res = 0 20 | 21 | cdef Py_ssize_t i,j 22 | for i in range(arr.shape[0]): 23 | for j in range(arr.shape[1]): 24 | res += arr[i, j] 25 | 26 | return res 27 | 28 | 29 | cdef number[:] c_sum_axis_0(number[:,:] arr): 30 | cdef number[:] res = np.zeros(arr.shape[1], dtype=float) 31 | 32 | for i in range(arr.shape[0]): 33 | for j in range(arr.shape[1]): 34 | res[j] += arr[i, j] 35 | 36 | return res 37 | 38 | 39 | cdef number[:] c_sum_axis_1(number[:,:] arr): 40 | cdef number[:] res = np.zeros(arr.shape[0], dtype=float) 41 | 42 | for i in range(arr.shape[0]): 43 | for j in range(arr.shape[1]): 44 | res[i] += arr[i, j] 45 | 46 | return res 47 | -------------------------------------------------------------------------------- /toad/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | toad command line application 3 | """ 4 | import argparse 5 | from .commands import get_plugins 6 | 7 | 8 | def add_sub(parsers, config): 9 | """add sub parser by config 10 | """ 11 | info = config.get('info', {}) 12 | args = config.get('args', []) 13 | defaults = config.get('defaults', None) 14 | 15 | sub_parser = parsers.add_parser(**info) 16 | 17 | for detail in args: 18 | flag = detail.pop('flag') 19 | sub_parser.add_argument(*flag, **detail) 20 | 21 | if defaults: 22 | sub_parser.set_defaults(**defaults) 23 | 24 | 25 | def get_parser(): 26 | """get parser 27 | """ 28 | parser = argparse.ArgumentParser( 29 | prog = 'toad', 30 | description = 'toad command line application', 31 | ) 32 | 33 | subparsers = parser.add_subparsers() 34 | 35 | plugins = get_plugins() 36 | for plug in plugins: 37 | add_sub(subparsers, plug.ARGS) 38 | 39 | return parser 40 | 41 | 42 | def main(): 43 | 
| """ 44 | """ 45 | parser = get_parser() 46 | 47 | args = parser.parse_args() 48 | if hasattr(args, 'func'): 49 | args.func(args) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /toad/cli_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from toad.cli import get_parser 10 | 11 | def disable_stdout(fn): 12 | 13 | def wrapper(*args): 14 | import os 15 | import sys 16 | 17 | with open(os.devnull, 'w') as f: 18 | so = sys.stdout 19 | sys.stdout = f 20 | 21 | fn(*args) 22 | 23 | sys.stdout = so 24 | 25 | return wrapper 26 | 27 | 28 | parser = get_parser() 29 | 30 | 31 | 32 | @disable_stdout 33 | def test_detect(): 34 | args = parser.parse_args(['detect', '-i', 'tests/test_data.csv']) 35 | rep = args.func(args) 36 | assert rep.loc['E', 'unique'] == 20 37 | 38 | @pytest.mark.skip("tree command will generate a pic in travis-ci log") 39 | @disable_stdout 40 | def test_tree(): 41 | args = parser.parse_args(['tree', '-i', 'tests/test_data.csv']) 42 | args.func(args) 43 | pass 44 | -------------------------------------------------------------------------------- /toad/commands/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkgutil 3 | from importlib import import_module 4 | 5 | COMMAND_DIR = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | def get_plugins(): 8 | plugins = [] 9 | 10 | for _, name, ispkg in pkgutil.iter_modules([COMMAND_DIR]): 11 | if ispkg: 12 | module = import_module('toad.commands.{}'.format(name)) 13 | plugins.append(module) 14 | 15 | return plugins 16 | -------------------------------------------------------------------------------- /toad/commands/detect/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad detect -i xxx.csv -o report.csv 11 | """ 12 | from toad.detector import detect 13 | 14 | sys.stdout.write('reading data....\n') 15 | with args.input as input: 16 | data = pd.read_csv(input) 17 | 18 | sys.stdout.write('detecting...\n') 19 | report = detect(data) 20 | 21 | if args.output: 22 | sys.stdout.write('saving report...\n') 23 | report.to_csv(args.output) 24 | sys.stdout.write('report saved!\n') 25 | else: 26 | sys.stdout.write(str(report)) 27 | sys.stdout.write('\n') 28 | 29 | return report 30 | 31 | ARGS = { 32 | 'info': { 33 | 'name': 'detect', 34 | 'description': 'detect data from a csv file', 35 | }, 36 | 'defaults': { 37 | 'func': func, 38 | }, 39 | 'args': [ 40 | { 41 | 'flag': ('-i', '--input'), 42 | 'type': argparse.FileType(), 43 | 'help': 'the csv file which will be detected', 44 | 'required': True, 45 | }, 46 | { 47 | 'flag': ('-o', '--output'), 48 | 'type': argparse.FileType('w'), 49 | 'help': 'path of the csv report will be saved', 50 | }, 51 | ] 52 | } 53 | -------------------------------------------------------------------------------- /toad/commands/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad evaluate -i xxx.csv 11 | 
""" 12 | from .evaluate import evaluate 13 | 14 | 15 | sys.stdout.write('reading data....\n') 16 | 17 | test_data = pd.read_csv(args.input) 18 | if args.base is not None: 19 | self_data = pd.read_csv(args.base) 20 | else: 21 | self_data = None 22 | 23 | arguments = { 24 | 'excel_name': args.name, 25 | 'num': args.top, 26 | 'iv_threshold_value': args.iv, 27 | 'unique_num': args.unique, 28 | 'self_data': self_data, 29 | 'overdue_days': args.overdue, 30 | } 31 | 32 | evaluate(test_data, **arguments) 33 | 34 | 35 | ARGS = { 36 | 'info': { 37 | 'name': 'evaluate', 38 | 'description': '第三方数据评估', 39 | }, 40 | 'defaults': { 41 | 'func': func, 42 | }, 43 | 'args': [ 44 | { 45 | 'flag': ('-i', '--input'), 46 | 'type': argparse.FileType('r', encoding='utf-8'), 47 | 'help': '需要评估的 csv 文件', 48 | 'required': True, 49 | }, 50 | { 51 | 'flag': ('--base',), 52 | 'type': argparse.FileType('r', encoding='utf-8'), 53 | 'help': '用于测试提升效果的基准 csv 数据文件', 54 | 'default': None, 55 | }, 56 | { 57 | 'flag': ('--overdue',), 58 | 'help': '是否启用逾期天数分析', 59 | 'action': 'store_true', 60 | }, 61 | { 62 | 'flag': ('--top',), 63 | 'type': int, 64 | 'help': '选择 IV 最高的 n 个变量分析', 65 | 'default': 10, 66 | }, 67 | { 68 | 'flag': ('--iv',), 69 | 'type': float, 70 | 'help': '选择 IV 大于阈值的变量进行分析', 71 | 'default': 0.02, 72 | }, 73 | { 74 | 'flag': ('--unique',), 75 | 'type': int, 76 | 'help': '将连续变量合并成 n 组进行分析', 77 | 'default': 10, 78 | }, 79 | { 80 | 'flag': ('--name',), 81 | 'type': str, 82 | 'help': '生成报告的文件名', 83 | 'default': 'report.xlsx', 84 | }, 85 | ] 86 | } 87 | -------------------------------------------------------------------------------- /toad/commands/tree/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad tree -i xxx.csv 11 | """ 12 | import toad 13 | from .tree import split_data, dtree 14 | args = vars(args) 15 | 16 | # remove func attribute 17 | args.pop('func') 18 | 19 | input = args.pop('input') 20 | target = args.pop('target') 21 | include = args.pop('include') 22 | exclude = args.pop('exclude') 23 | 24 | sys.stdout.write('reading data....\n') 25 | data = pd.read_csv(input) 26 | 27 | X, *tars = split_data(data, target = target) 28 | 29 | if include is not None: 30 | X = X[include] 31 | 32 | if exclude is not None: 33 | X = X.drop(columns = exclude) 34 | 35 | X = toad.utils.get_dummies(X) 36 | 37 | 38 | for t in tars: 39 | sys.stdout.write('analyse '+ t.name +' ...\n') 40 | dtree(X, t, **args) 41 | 42 | 43 | ARGS = { 44 | 'info': { 45 | 'name': 'tree', 46 | 'description': 'analyse bad rate from a csv file', 47 | }, 48 | 'defaults': { 49 | 'func': func, 50 | }, 51 | 'args': [ 52 | { 53 | 'flag': ('-i', '--input'), 54 | 'type': argparse.FileType('r', encoding='utf-8'), 55 | 'help': 'the csv file which will be analysed', 56 | 'required': True, 57 | }, 58 | { 59 | 'flag': ('-t', '--target'), 60 | 'nargs': '+', 61 | 'help': 'the target(s) will be analysed', 62 | 'default': 'target', 63 | }, 64 | { 65 | 'flag': ('-c', '--criterion'), 66 | 'type': str, 67 | 'help': 'criterion to measure the quality of a split. 
Support "gini" (default), "entropy"', 68 | 'default': 'gini', 69 | }, 70 | { 71 | 'flag': ('-d', '--depth'), 72 | 'type': int, 73 | 'help': 'the maximum depth of the tree', 74 | 'default': None, 75 | }, 76 | { 77 | 'flag': ('-s', '--sample'), 78 | 'type': float, 79 | 'help': 'minimum number of sample in each node', 80 | 'default': 0.01, 81 | }, 82 | { 83 | 'flag': ('-r', '--ratio'), 84 | 'type': float, 85 | 'help': 'threshold of ratio that will be highlighted', 86 | 'default': 0.15, 87 | }, 88 | { 89 | 'flag': ('--exclude',), 90 | 'nargs': '+', 91 | 'help': 'feature names that will not use to analyse', 92 | 'default': None, 93 | }, 94 | { 95 | 'flag': ('--include',), 96 | 'nargs': '+', 97 | 'help': 'feature names that will be used to analyse', 98 | 'default': None, 99 | }, 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /toad/commands/tree/tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Windows: 3 | conda install python-graphviz 4 | Mac: 5 | brew install graphviz 6 | pip install graphviz 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | import graphviz 13 | 14 | import sklearn 15 | from sklearn.tree import DecisionTreeClassifier 16 | 17 | 18 | def tree_to_dot(tree, features, high_light = 0.15): 19 | from io import StringIO 20 | from sklearn.tree import _tree 21 | 22 | out = StringIO() 23 | tree_ = tree.tree_ 24 | 25 | features = np.array([ 26 | features[i] if i != _tree.TREE_UNDEFINED else "undefined!" 27 | for i in tree_.feature 28 | ]) 29 | 30 | out.write('digraph Tree {\n') 31 | out.write('edge [fontname="FangSong"];\n') 32 | out.write('node [shape=box];\n') 33 | 34 | def recurse(node, parent = None, label = None): 35 | sample = tree_.n_node_samples[node] 36 | bad_rate = tree_.value[node][0,1] / sample 37 | 38 | out.write('{} [label="'.format(node)) 39 | 40 | out.write('bad rate: {:.2%}\n'.format(bad_rate)) 41 | out.write('sample: {:.2%}\n'.format(sample / tree_.n_node_samples[0])) 42 | 43 | # end of label 44 | out.write('"') 45 | 46 | if bad_rate > high_light: 47 | out.write(', color="red"') 48 | 49 | # end of node 50 | out.write('];\n') 51 | 52 | if tree_.feature[node] != _tree.TREE_UNDEFINED: 53 | name = features[node] 54 | threshold = tree_.threshold[node] 55 | recurse(tree_.children_left[node], node, '{} <= {:.2f}'.format(name, threshold)) 56 | recurse(tree_.children_right[node], node, '{} > {:.2f}'.format(name, threshold)) 57 | 58 | if parent is not None: 59 | out.write('{} -> {} [label="{}"];\n'.format(parent, node, label)) 60 | 61 | recurse(0, None) 62 | 63 | out.write('}') 64 | s = out.getvalue() 65 | out.close() 66 | return s 67 | 68 | 69 | def dot_to_img(dot, file = 'report.png'): 70 | import os 71 | 72 | name, ext = os.path.splitext(file) 73 | 74 | graph = graphviz.Source(dot) 75 | graph.format = ext[1:] 76 | graph.view(name, cleanup = True) 77 | 78 | 79 | def split_data(frame, target = 'target'): 80 | X = frame.drop(columns = target) 81 | 82 | res = (X,) 83 | if isinstance(target, str): 84 | target = [target] 85 | 86 | for col in target: 87 | res += (frame[col],) 88 | 89 | return res 90 | 91 | 92 | def dtree(frame, target, criterion = 'gini', depth = None, sample = 0.01, ratio = 0.15): 93 | tree = DecisionTreeClassifier( 94 | criterion = criterion, 95 | min_samples_leaf = sample, 96 | max_depth = depth, 97 | ) 98 | 99 | tree.fit(frame.fillna(-1), target) 100 | 101 | dot_string = tree_to_dot(tree, frame.columns.values, high_light = ratio) 102 | 103 
-------------------------------------------------------------------------------- /toad/detector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """Command line tools for detecting csv data 4 | 5 | Team: ESC 6 | 7 | Examples: 8 | 9 | python detector.py -i xxx.csv -o report.csv 10 | 11 | """ 12 | 13 | import pandas as pd 14 | 15 | def getTopValues(series, top = 5, reverse = False): 16 | """Get top/bottom n values 17 | 18 | Args: 19 | series (Series): data series 20 | top (number): number of top/bottom n values 21 | reverse (bool): it will return bottom n values if True is given 22 | 23 | Returns: 24 | Series: Series of top/bottom n values and percentage. ['value:percent', None] 25 | """ 26 | itype = 'top' 27 | counts = series.value_counts() 28 | counts = list(zip(counts.index, counts, counts.divide(series.size))) 29 | 30 | if reverse: 31 | counts.reverse() 32 | itype = 'bottom' 33 | 34 | template = "{0[0]}:{0[2]:.2%}" 35 | indexs = [itype + str(i + 1) for i in range(top)] 36 | values = [template.format(counts[i]) if i < len(counts) else None for i in range(top)] 37 | 38 | return pd.Series(values, index = indexs) 39 | 40 | 41 | def getDescribe(series, percentiles = [.25, .5, .75]): 42 | """Get describe of series 43 | 44 | Args: 45 | series (Series): data series 46 | percentiles: the percentiles to include in the output 47 | 48 | Returns: 49 | Series: the description of the data, including mean, std, min, max and percentiles 50 | """ 51 | d = series.describe(percentiles) 52 | return d.drop('count') 53 | 54 | 55 | def countBlank(series, blanks = []): 56 | """Count number and percentage of blank values in series 57 | 58 | Args: 59 | series (Series): data series 60 | blanks (list): list of blank values 61 | 62 | Returns: 63 | number: number of blanks 64 | str: the percentage of blank values 65 | """ 66 | if len(blanks)>0: 67 | isnull = series.replace(blanks, None).isnull() 68 | else: 69 | isnull = series.isnull() 70 | n = isnull.sum() 71 | ratio = isnull.mean() 72 | 73 | return (n, "{0:.2%}".format(ratio)) 74 | 75 | 76 | def isNumeric(series): 77 | """Check if the series's type is numeric 78 | 79 | Args: 80 | series (Series): data series 81 | 82 | Returns: 83 | bool 84 | """ 85 | return series.dtype.kind in 'ifc' 86 | 87 | 88 | def detect(dataframe): 89 | """ Detect data 90 | 91 | Args: 92 | dataframe (DataFrame): data that will be detected 93 | 94 | Returns: 95 | DataFrame: report of detecting 96 | """ 97 | 98 | rows = [] 99 | for name, series in dataframe.items(): 100 | numeric_index = ['mean', 'std', 'min', '1%', '10%', '50%', '75%', '90%', '99%', 'max'] 101 | discrete_index = ['top1', 'top2', 'top3', 'top4', 'top5', 'bottom5', 'bottom4', 'bottom3', 'bottom2', 'bottom1'] 102 | 103 | details_index = [numeric_index[i] + '_or_' + discrete_index[i] for i in range(len(numeric_index))] 104 | details = [] 105 | 106 | if isNumeric(series): 107 | desc = getDescribe( 108 | series, 109 | percentiles = [.01, .1, .5, .75, .9, .99] 110 | ) 111 | details = desc.tolist() 112 | else: 113 | top5 = getTopValues(series) 114 | bottom5 = getTopValues(series, reverse = True) 115 | details = top5.tolist() + bottom5[::-1].tolist() 116 | 117 | # print(details_index) 118 | nblank, pblank = countBlank(series) 119 | 120 | row = pd.Series( 121 | index = ['type', 'size', 'missing', 'unique'] + details_index, 122 | data = [series.dtype, series.size, pblank, series.nunique()] + details 123 | ) 124 | 125 | row.name = name 126 | rows.append(row) 127 | 128 | return pd.DataFrame(rows) 129 |
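# --- Example: a minimal sketch of detect() on a toy frame (column names and
# values are made up). Numeric columns get describe-style stats, object
# columns get top/bottom value counts.
import numpy as np
import pandas as pd
from toad.detector import detect

df = pd.DataFrame({
    'age': [20, 25, 30, np.nan],
    'city': ['a', 'b', 'a', 'c'],
})
report = detect(df)
print(report[['type', 'size', 'missing', 'unique']])  # one row per column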
-------------------------------------------------------------------------------- /toad/impute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pandas.api.types import is_numeric_dtype 4 | from sklearn.experimental import enable_iterative_imputer 5 | from sklearn.impute import IterativeImputer 6 | from sklearn.ensemble import RandomForestRegressor 7 | from sklearn.preprocessing import LabelEncoder 8 | 9 | 10 | 11 | def impute(df): 12 | imputer = Imputer( 13 | estimator = RandomForestRegressor(), 14 | random_state = 1, 15 | ) 16 | 17 | return imputer.fit_transform(df) 18 | 19 | 20 | class Imputer(IterativeImputer): 21 | def __init__(self, missing_values = np.nan, **kwargs): 22 | super().__init__(missing_values = np.nan, **kwargs) 23 | 24 | if not isinstance(missing_values, list): 25 | missing_values = [missing_values] 26 | 27 | self.missing_values_list = missing_values 28 | self.encoder_dict = dict() 29 | 30 | def _impute_one_feature(self, X_filled, mask_missing_values, feat_idx, 31 | neighbor_feat_idx, **kwargs): 32 | 33 | return super()._impute_one_feature(X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, **kwargs) 34 | 35 | def fit_transform(self, X, **kwargs): 36 | X, mask = self._replace_empty(X) 37 | X = self._fit_encode(X, mask) 38 | 39 | res = super().fit_transform(X, **kwargs) 40 | res = pd.DataFrame(res, columns = X.columns) 41 | return self._decode(res) 42 | 43 | 44 | def transform(self, X, **kwargs): 45 | X, mask = self._replace_empty(X) 46 | X = self._encode(X, mask) 47 | 48 | res = super().transform(X, **kwargs) 49 | res = pd.DataFrame(res, columns = X.columns) 50 | return self._decode(res) 51 | 52 | 53 | def _replace_empty(self, X): 54 | mask = X.isin(self.missing_values_list) 55 | X = X.where(~mask, np.nan) 56 | return X, mask 57 | 58 | def _fit_encode(self, X, mask): 59 | """fit encoder for object data 60 | 61 | Args: 62 | X (DataFrame) 63 | mask (Mask): empty mask for X 64 | """ 65 | category_data = X.select_dtypes(exclude = np.number).columns 66 | 67 | for col in category_data: 68 | unique, X[col].loc[~mask[col]] = np.unique(X[col][~mask[col]], return_inverse = True) 69 | 70 | self.encoder_dict[col] = unique 71 | 72 | return X 73 | 74 | def _encode(self, X, mask): 75 | """encode object data to number 76 | 77 | Args: 78 | X (DataFrame) 79 | mask (Mask): empty mask for X 80 | """ 81 | for col, unique in self.encoder_dict.items(): 82 | table = dict(zip(unique, np.arange(len(unique)))) 83 | X[col].loc[~mask[col]] = np.array([table[v] for v in X[col][~mask[col]]]) 84 | 85 | return X 86 | 87 | def _decode(self, X): 88 | """decode object data from number back to the original data 89 | 90 | Args: 91 | X (DataFrame) 92 | 93 | """ 94 | for col, unique in self.encoder_dict.items(): 95 | ix = X[col].values.astype(int) 96 | X[col] = unique[ix] 97 | 98 | return X 99 | 100 | 101 | 102 | 103 |
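# --- Example: a minimal sketch of impute() on toy mixed-type data; the
# categorical column is label-encoded, iteratively imputed with a random
# forest, then decoded back (values are made up).
import numpy as np
import pandas as pd
from toad.impute import impute

df = pd.DataFrame({
    'x': [1.0, np.nan, 3.0, 4.0, 2.0],
    'k': ['a', 'b', np.nan, 'a', 'b'],
})
filled = impute(df)
assert filled.isna().sum().sum() == 0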
-------------------------------------------------------------------------------- /toad/impute_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from .impute import impute 5 | 6 | 7 | ab = np.array(list('ABCDEFG')) 8 | int_feat = np.random.randint(10, size = 500) 9 | float_feat = np.random.rand(500) 10 | str_feat = ab[np.random.choice(7, 500)] 11 | uni_feat = np.ones(500) 12 | # empty_feat = np.full(500, np.nan) 13 | 14 | target = np.random.randint(2, size = 500) 15 | 16 | df = pd.DataFrame({ 17 | 'A': int_feat, 18 | 'B': str_feat, 19 | 'C': uni_feat, 20 | 'D': float_feat, 21 | # 'E': empty_feat, 22 | }) 23 | 24 | mask = np.random.choice([True, False], size = 500 * 4, p = [0.95, 0.05]).reshape(500, 4) 25 | df = df.where(mask, np.nan) 26 | 27 | 28 | def test_impute_with_number(): 29 | res = impute(df.drop(columns = 'B')) 30 | 31 | assert res.isna().sum().sum() == 0 32 | 33 | 34 | def test_impute_with_str(): 35 | res = impute(df) 36 | 37 | assert res.isna().sum().sum() == 0 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /toad/merge_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from .merge import merge, ChiMerge, DTMerge, QuantileMerge, StepMerge, KMeansMerge 10 | 11 | np.random.seed(1) 12 | feature = np.random.rand(500) 13 | target = np.random.randint(2, size = 500) 14 | A = np.random.randint(100, size = 500) 15 | B = np.random.randint(3, size = 500) 16 | 17 | df = pd.DataFrame({ 18 | 'feature': feature, 19 | 'target': target, 20 | 'A': A, 21 | }) 22 | 23 | 24 | 25 | def test_chimerge(): 26 | splits = ChiMerge(feature, target, n_bins = 10) 27 | assert len(splits) == 9 28 | 29 | def test_chimerge_bins_not_enough(): 30 | splits = ChiMerge(B, target, n_bins = 10) 31 | assert len(splits) == 2 32 | 33 | def test_chimerge_bins_with_min_samples(): 34 | splits = ChiMerge(feature, target, min_samples = 0.02) 35 | assert len(splits) == 10 36 | 37 | def test_dtmerge(): 38 | splits = DTMerge(feature, target, n_bins = 10) 39 | assert len(splits) == 9 40 | 41 | def test_quantilemerge(): 42 | splits = QuantileMerge(feature, n_bins = 10) 43 | assert len(splits) == 9 44 | 45 | def test_quantilemerge_not_enough(): 46 | splits = QuantileMerge(B, n_bins = 10) 47 | assert len(splits) == 2 48 | 49 | def test_stepmerge(): 50 | splits = StepMerge(feature, n_bins = 10) 51 | assert len(splits) == 9 52 | 53 | def test_kmeansmerge(): 54 | splits = KMeansMerge(feature, n_bins = 10) 55 | assert len(splits) == 9 56 | 57 | def test_merge(): 58 | res = merge(feature, target = target, method = 'chi', n_bins = 10) 59 | assert len(np.unique(res)) == 10 60 | 61 | def test_merge_frame(): 62 | res = merge(df, target = 'target', method = 'chi', n_bins = 10) 63 | assert len(np.unique(res['A'])) == 10 64 | -------------------------------------------------------------------------------- /toad/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import ks_2samp 4 | 5 | from sklearn.metrics import f1_score, roc_auc_score, roc_curve 6 | 7 | from .utils import ( 8 | feature_splits, 9 | iter_df, 10 | unpack_tuple, 11 | bin_by_splits, 12 | ) 13 | 14 | 15 | def KS(score, target): 16 | """calculate ks value 17 | 18 | Args: 19 | score (array-like): list of scores or probabilities that the model predicts 20 | target (array-like): list of real targets 21 | 22 | Returns: 23 | float: the max KS value 24 | """ 25 | mask = target == 1 26 | res = ks_2samp(score[mask], score[~mask]) 27 | return res[0] 28 | 29 |
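# --- Example: a minimal sketch of KS on random data, shown as a standalone
# snippet (the statistic on pure noise is close to zero; numbers are
# illustrative only).
import numpy as np
from toad.metrics import KS

score = np.random.rand(1000)
target = np.random.randint(2, size = 1000)
print(KS(score, target))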
30 | def KS_bucket(score, target, bucket = 10, method = 'quantile', return_splits = False, **kwargs): 31 | """calculate ks value by bucket 32 | 33 | Args: 34 | score (array-like): list of scores or probabilities that the model predicts 35 | target (array-like): list of real targets 36 | bucket (int): number of groups to bin into 37 | method (str): method to bin score. `quantile` (default), `step` 38 | return_splits (bool): whether to return the split points of the buckets 39 | 40 | Returns: 41 | DataFrame 42 | """ 43 | df = pd.DataFrame({ 44 | 'score': score, 45 | 'bad': target, 46 | }) 47 | 48 | df['good'] = 1 - df['bad'] 49 | 50 | bad_total = df['bad'].sum() 51 | good_total = df['good'].sum() 52 | all_total = bad_total + good_total 53 | 54 | splits = None 55 | df['bucket'] = 0 56 | 57 | if bucket is False: 58 | df['bucket'] = score 59 | elif isinstance(bucket, (list, np.ndarray, pd.Series)): 60 | # list of split pointers 61 | if len(bucket) < len(score): 62 | bucket = bin_by_splits(score, bucket) 63 | 64 | df['bucket'] = bucket 65 | elif isinstance(bucket, int): 66 | from .merge import merge 67 | df['bucket'], splits = merge(score, n_bins = bucket, method = method, return_splits = True, **kwargs) 68 | 69 | grouped = df.groupby('bucket', as_index = False) 70 | 71 | agg1 = pd.DataFrame() 72 | agg1['min'] = grouped.min()['score'] 73 | agg1['max'] = grouped.max()['score'] 74 | agg1['bads'] = grouped.sum()['bad'] 75 | agg1['goods'] = grouped.sum()['good'] 76 | agg1['total'] = agg1['bads'] + agg1['goods'] 77 | 78 | agg2 = (agg1.sort_values(by = 'min')).reset_index(drop = True) 79 | 80 | agg2['bad_rate'] = agg2['bads'] / agg2['total'] 81 | agg2['good_rate'] = agg2['goods'] / agg2['total'] 82 | 83 | agg2['odds'] = agg2['bads'] / agg2['goods'] 84 | 85 | agg2['bad_prop'] = agg2['bads'] / bad_total 86 | agg2['good_prop'] = agg2['goods'] / good_total 87 | agg2['total_prop'] = agg2['total'] / all_total 88 | 89 | 90 | cum_bads = agg2['bads'].cumsum() 91 | cum_goods = agg2['goods'].cumsum() 92 | cum_total = agg2['total'].cumsum() 93 | 94 | cum_bads_rev = agg2.loc[::-1, 'bads'].cumsum()[::-1] 95 | cum_goods_rev = agg2.loc[::-1, 'goods'].cumsum()[::-1] 96 | cum_total_rev = agg2.loc[::-1, 'total'].cumsum()[::-1] 97 | 98 | agg2['cum_bad_rate'] = cum_bads / cum_total 99 | agg2['cum_bad_rate_rev'] = cum_bads_rev / cum_total_rev 100 | 101 | agg2['cum_bads_prop'] = cum_bads / bad_total 102 | agg2['cum_bads_prop_rev'] = cum_bads_rev / bad_total 103 | agg2['cum_goods_prop'] = cum_goods / good_total 104 | agg2['cum_goods_prop_rev'] = cum_goods_rev / good_total 105 | agg2['cum_total_prop'] = cum_total / all_total 106 | agg2['cum_total_prop_rev'] = cum_total_rev / all_total 107 | 108 | 109 | agg2['ks'] = agg2['cum_bads_prop'] - agg2['cum_goods_prop'] 110 | 111 | reverse_suffix = '' 112 | # fix negative ks value 113 | if agg2['ks'].sum() < 0: 114 | agg2['ks'] = -agg2['ks'] 115 | reverse_suffix = '_rev' 116 | 117 | agg2['lift'] = agg2['bad_prop'] / agg2['total_prop'] 118 | agg2['cum_lift'] = agg2['cum_bads_prop' + reverse_suffix] / agg2['cum_total_prop' + reverse_suffix] 119 | 120 | if return_splits and splits is not None: 121 | return agg2, splits 122 | 123 | return agg2 124 |
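# --- Example: a minimal sketch of KS_bucket, shown as a standalone snippet;
# it returns one row per bucket with the columns computed above.
import numpy as np
from toad.metrics import KS_bucket

score = np.random.rand(1000)
target = np.random.randint(2, size = 1000)
tab = KS_bucket(score, target, bucket = 10)
print(tab[['min', 'max', 'bads', 'goods', 'bad_rate', 'ks', 'lift']])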
125 | def KS_by_col(df, by='feature', score='score', target='target'): 126 | """ 127 | """ 128 | 129 | pass 130 | 131 | 132 | def SSE(y_pred, y): 133 | """sum of squares due to error 134 | """ 135 | return np.sum((y_pred - y) ** 2) 136 | 137 | 138 | def MSE(y_pred, y): 139 | """mean of squares due to error 140 | """ 141 | return np.mean((y_pred - y) ** 2) 142 | 143 | 144 | def AIC(y_pred, y, k, llf = None): 145 | """Akaike Information Criterion 146 | 147 | Args: 148 | y_pred (array-like) 149 | y (array-like) 150 | k (int): number of features 151 | llf (float): result of log-likelihood function 152 | """ 153 | if llf is None: 154 | llf = np.log(SSE(y_pred, y)) 155 | 156 | return 2 * k - 2 * llf 157 | 158 | 159 | def BIC(y_pred, y, k, llf = None): 160 | """Bayesian Information Criterion 161 | 162 | Args: 163 | y_pred (array-like) 164 | y (array-like) 165 | k (int): number of features 166 | llf (float): result of log-likelihood function 167 | """ 168 | n = len(y) 169 | if llf is None: 170 | llf = np.log(SSE(y_pred, y)) 171 | 172 | return np.log(n) * k - 2 * llf 173 | 174 | 175 | def F1(score, target, split = 'best', return_split = False): 176 | """calculate f1 value 177 | 178 | Args: 179 | score (array-like) 180 | target (array-like) 181 | 182 | Returns: 183 | float: best f1 score 184 | float: best split point 185 | """ 186 | dataframe = pd.DataFrame({ 187 | 'score': score, 188 | 'target': target, 189 | }) 190 | 191 | if split == 'best': 192 | # find best split for score 193 | splits = feature_splits(dataframe['score'], dataframe['target']) 194 | else: 195 | splits = [split] 196 | 197 | best = 0 198 | sp = None 199 | for df, pointer in iter_df(dataframe, 'score', 'target', splits): 200 | v = f1_score(df['target'], df['score']) 201 | 202 | if v > best: 203 | best = v 204 | sp = pointer 205 | 206 | if return_split: 207 | return best, sp 208 | 209 | return best 210 | 211 | 212 | def AUC(score, target, return_curve = False): 213 | """AUC Score 214 | 215 | Args: 216 | score (array-like): list of scores or probabilities that the model predicts 217 | target (array-like): list of real targets 218 | return_curve (bool): whether to return curve data for a ROC plot 219 | 220 | Returns: 221 | float: auc score 222 | """ 223 | # fix score order 224 | if np.nanmax(score) > 1: 225 | score = -score 226 | 227 | auc = roc_auc_score(target, score) 228 | 229 | if not return_curve: 230 | return auc 231 | 232 | return (auc,) + roc_curve(target, score) 233 | 234 | 235 | def _PSI(test, base): 236 | test_prop = pd.Series(test).value_counts(normalize = True, dropna = False) 237 | base_prop = pd.Series(base).value_counts(normalize = True, dropna = False) 238 | 239 | psi = np.sum((test_prop - base_prop) * np.log(test_prop / base_prop)) 240 | 241 | frame = pd.DataFrame({ 242 | 'test': test_prop, 243 | 'base': base_prop, 244 | }) 245 | frame.index.name = 'value' 246 | 247 | return psi, frame.reset_index() 248 | 249 | 250 | 251 | def PSI(test, base, combiner = None, return_frame = False): 252 | """calculate PSI 253 | 254 | Args: 255 | test (array-like): data to test PSI 256 | base (array-like): base data for calculating PSI 257 | combiner (Combiner|list|dict): combiner to combine data 258 | return_frame (bool): whether to return the frame of proportions 259 | 260 | Returns: 261 | float|Series 262 | """ 263 | 264 | if combiner is not None: 265 | if isinstance(combiner, (dict, list)): 266 | from .transform import Combiner 267 | combiner = Combiner().load(combiner) 268 | 269 | test = combiner.transform(test, labels = True) 270 | base = combiner.transform(base, labels = True) 271 | 272 | psi = list() 273 | frame = list() 274 | 275 | if isinstance(test, pd.DataFrame): 276 | for col in test: 277 | p, f = _PSI(test[col], base[col]) 278 | psi.append(p) 279 | frame.append(f) 280 | 281 | psi = pd.Series(psi, index = test.columns) 282 | 283 | frame = pd.concat( 284 | frame, 285 | keys = test.columns, 286 | names = ['columns', 'id'], 287 | ).reset_index() 288 | frame = frame.drop(columns = 'id') 289 | else: 290 | psi, frame = _PSI(test, base) 291 | 292 | 293 | res = (psi,) 294 | 295 | if return_frame: 296 | res += (frame,) 297 | 298 | return unpack_tuple(res) 299 | 300 |
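# --- Example: a minimal sketch of PSI with an explicit split list; the list
# is loaded into a Combiner, both arrays are binned, and the binned
# proportions are compared (values are random, so the PSI will be small).
import numpy as np
from toad.metrics import PSI

test = np.random.rand(1000)
base = np.random.rand(1000)
print(PSI(test, base, combiner = [0.3, 0.5, 0.7]))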
301 | def matrix(y_pred, y, splits = None): 302 | """confusion matrix of target 303 | 304 | Args: 305 | y_pred (array-like) 306 | y (array-like) 307 | splits (float|list): split points of y_pred 308 | 309 | Returns: 310 | DataFrame: confusion matrix with true labels in rows and predicted labels in columns 311 | 312 | """ 313 | if splits is not None: 314 | y_pred = bin_by_splits(y_pred, splits) 315 | 316 | labels = np.unique(y) 317 | from sklearn.metrics import confusion_matrix 318 | m = confusion_matrix(y, y_pred, labels = labels) 319 | 320 | return pd.DataFrame( 321 | m, 322 | index = pd.Index(labels, name = 'Actual'), 323 | columns = pd.Index(labels, name = 'Predicted'), 324 | ) 325 |
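# --- Example: a minimal sketch of matrix(); continuous predictions are
# binned at the split point, then cross-tabulated against the true labels
# (mirrors the usage in metrics_test.py below).
import numpy as np
from toad.metrics import matrix

y_pred = np.random.rand(500)
y = np.random.randint(2, size = 500)
print(matrix(y_pred, y, splits = 0.5))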
-------------------------------------------------------------------------------- /toad/metrics_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .metrics import KS, KS_bucket, F1, PSI, AUC, matrix 6 | 7 | np.random.seed(1) 8 | 9 | feature = np.random.rand(500) 10 | target = np.random.randint(2, size = 500) 11 | base_feature = np.random.rand(500) 12 | 13 | test_df = pd.DataFrame({ 14 | 'A': np.random.rand(500), 15 | 'B': np.random.rand(500), 16 | }) 17 | base_df = pd.DataFrame({ 18 | 'A': np.random.rand(500), 19 | 'B': np.random.rand(500), 20 | }) 21 | 22 | FUZZ_THRESHOLD = 1e-10 23 | 24 | def test_KS(): 25 | result = KS(feature, target) 26 | assert result == 0.05536775661256989 27 | 28 | def test_KS_bucket(): 29 | result = KS_bucket(feature, target) 30 | assert result.loc[4, 'ks'] == -0.028036335090276976 31 | 32 | def test_KS_bucket_use_step(): 33 | result = KS_bucket(feature, target, method = 'step', clip_q = 0.01) 34 | assert result.loc[4, 'ks'] == -0.0422147102645028 35 | 36 | def test_KS_bucket_for_all_score(): 37 | result = KS_bucket(feature, target, bucket = False) 38 | assert len(result) == 500 39 | 40 | def test_KS_bucket_return_splits(): 41 | result, splits = KS_bucket(feature, target, return_splits = True) 42 | assert len(splits) == 9 43 | 44 | def test_KS_bucket_use_split_pointers(): 45 | result = KS_bucket(feature, target, bucket = [0.2, 0.6]) 46 | assert len(result) == 3 47 | 48 | def test_KS_bucket_with_lift(): 49 | result = KS_bucket(feature, target) 50 | assert result.loc[3, 'lift'] == 1.0038610038610036 51 | 52 | def test_KS_bucket_with_cum_lift(): 53 | result = KS_bucket(feature, target) 54 | assert result.loc[3, 'cum_lift'] == 1.003861003861004 55 | 56 | 57 | def test_F1(): 58 | result, split = F1(feature, target, return_split = True) 59 | assert result == pytest.approx(0.6844207723035951, FUZZ_THRESHOLD) 60 | 61 | def test_F1_split(): 62 | result = F1(feature, target, split = 0.5) 63 | assert result == pytest.approx(0.51417004048583, FUZZ_THRESHOLD) 64 | 65 | def test_AUC(): 66 | result = AUC(feature, target) 67 | assert result == 0.5038690142424582 68 | 69 | def test_AUC_with_curve(): 70 | auc, fpr, tpr, thresholds = AUC(feature, target, return_curve = True) 71 | assert thresholds[200] == 0.15773006987053328 72 | 73 | def test_PSI(): 74 | result = PSI(feature, base_feature, combiner = [0.3, 0.5, 0.7]) 75 | assert result == 0.018630024627491467 76 | 77 | def test_PSI_frame(): 78 | result = PSI( 79 | test_df, 80 | base_df, 81 | combiner = { 82 | 'A': [0.3, 0.5, 0.7], 83 | 'B': [0.4, 0.8], 84 | }, 85 | ) 86 | 87 | assert result['B'] == pytest.approx(0.014528279995858708, FUZZ_THRESHOLD) 88 | 89 | def test_PSI_return_frame(): 90 | result, frame = PSI( 91 | test_df, 92 | base_df, 93 | combiner = { 94 | 'A': [0.3, 0.5, 0.7], 95 | 'B': [0.4, 0.8], 96 | }, 97 | return_frame = True, 98 | ) 99 | 100 | assert frame.loc[4, 'test'] == 0.38 101 | 102 | def test_matrix(): 103 | df = matrix(feature, target, splits = 0.5) 104 | assert df.iloc[0,1] == 133 105 | -------------------------------------------------------------------------------- /toad/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .trainer import * 3 | -------------------------------------------------------------------------------- /toad/nn/functional.py: -------------------------------------------------------------------------------- 1 | from toad.utils.decorator import support_numpy 2 | 3 | def flooding(loss, b): 4 | """flooding loss 5 | """ 6 | return (loss - b).abs() + b 7 | 8 | 9 | @support_numpy 10 | def focal_loss(input, target, alpha = 1., gamma = 2., reduction = 'mean'): 11 | """focal loss 12 | 13 | Args: 14 | input (Tensor): N x C, C is the number of classes 15 | target (Tensor): N, each value is the index of classes 16 | alpha (Variable): balanced variant of focal loss, range is in [0, 1] 17 | gamma (float): focal loss parameter 18 | reduction (str): `mean`, `sum`, `none`, how to reduce the loss over classes 19 | """ 20 | import numpy as np 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | prob = torch.sigmoid(input) 25 | weight = torch.pow(1. - prob, gamma) 26 | focal = -alpha * weight * torch.log(prob) 27 | loss = F.nll_loss(focal, target, reduction = reduction) 28 | 29 | return loss 30 | 31 | 32 | @support_numpy 33 | def binary_focal_loss(input, target, **kwargs): 34 | """binary focal loss 35 | """ 36 | # convert 1d tensor to 2d 37 | if input.ndim == 1: 38 | import torch 39 | input = input.view(-1, 1) 40 | input = torch.hstack([1 - input, input]) 41 | 42 | return focal_loss(input, target, **kwargs) 43 | 44 | 45 | def focal_loss_for_numpy(input, target, alpha = 1., gamma = 2., reduction = 'mean'): 46 | """focal loss for numpy array 47 | """ 48 | import numpy as np 49 | 50 | prob = 1 / (1 + np.exp(-input)) 51 | weight = np.power(1. - prob, gamma) 52 | focal = -alpha * weight * np.log(prob) 53 | loss = -focal[np.arange(len(focal)), target] 54 | 55 | if reduction == 'mean': 56 | loss = loss.mean() 57 | elif reduction == 'sum': 58 | loss = loss.sum() 59 | elif reduction == 'none': 60 | pass 61 | 62 | return loss 63 | 64 | 65 | def label_smoothing(labels, smoothing = 0.1): 66 | """label smoothing 67 | """ 68 | assert len(labels.shape) == 2, "labels must be 2 dim where shape should be (N, C)" 69 | 70 | return (1.
- smoothing) * labels + smoothing / labels.shape[1] 71 | -------------------------------------------------------------------------------- /toad/nn/functional_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | 8 | from .functional import focal_loss, binary_focal_loss 9 | 10 | DATASET_SIZE = 20000 11 | NUM_CLASSES = 4 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def seed(): 16 | torch.manual_seed(0) 17 | yield 18 | 19 | 20 | def test_focal_loss(seed): 21 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float) 22 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 23 | loss = focal_loss(y_pred, y) 24 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6) 25 | 26 | 27 | def test_loss_with_grad(seed): 28 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float, requires_grad=True) 29 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 30 | loss = focal_loss(y_pred, y) 31 | loss.backward() 32 | assert y_pred.grad is not None 33 | 34 | 35 | def test_binary_focal_loss(seed): 36 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float) 37 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long) 38 | loss = binary_focal_loss(y_pred, y) 39 | assert loss.item() == pytest.approx(-0.07776755839586258, 1e-6) 40 | 41 | 42 | def test_numpy_support_focal_loss(seed): 43 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float).numpy() 44 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long).numpy() 45 | loss = focal_loss(y_pred, y) 46 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6) 47 | 48 | 49 | def test_binary_focal_loss_for_xgb(seed): 50 | from toad.utils.decorator import xgb_loss 51 | 52 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float).numpy() 53 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long).numpy() 54 | loss_func = xgb_loss(gamma=5.0, alpha=0.5)(binary_focal_loss) 55 | grad, hess = loss_func(y_pred, y) 56 | 57 | assert grad == pytest.approx(-0.00023283064365386963) 58 | assert hess == pytest.approx(465.66128730773926) 59 | 60 | 61 | # TODO 62 | # focal loss sum/none 63 | # label_smoothing 64 | -------------------------------------------------------------------------------- /toad/nn/loss.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | from .functional import focal_loss 7 | 8 | 9 | class FocalLoss(Module): 10 | def __init__(self, alpha = 1., gamma = 2., reduction = 'mean'): 11 | super(FocalLoss, self).__init__() 12 | 13 | self.alpha = alpha 14 | self.gamma = gamma 15 | self.reduction = reduction 16 | 17 | def forward(self, input, target): 18 | return focal_loss( 19 | input, 20 | target, 21 | alpha = self.alpha, 22 | gamma = self.gamma, 23 | reduction = self.reduction, 24 | ) 25 | 26 | 27 | class DictLoss(Module): 28 | def __init__(self, torch_loss, weights: Dict[str, float] = None): 29 | super(DictLoss, self).__init__() 30 | self.torch_loss = torch_loss 31 | self.weights = weights or {} 32 | 33 | def forward(self, input: Dict[str, torch.Tensor], target: Dict[str, torch.Tensor]): 34 | loss = 0 35 | weight_sum = 0 36 | for key, _target in target.items(): 37 | if key not in input: 38 | continue 39 | weight = self.weights.get(key, 1) 40 
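# NaN entries in `_target` mark samples that have no label for this key;
# the mask below drops them so they contribute nothing to the loss
# (ListLoss below uses the same trick).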
| mask = torch.bitwise_not(torch.isnan(_target)) 41 | _target = _target.to(input[key].device) 42 | loss += weight * self.torch_loss(input[key][mask], _target[mask]) 43 | weight_sum += weight 44 | 45 | return loss / weight_sum 46 | 47 | 48 | class ListLoss(Module): 49 | def __init__(self, torch_loss, weights: List[float] = None): 50 | super(ListLoss, self).__init__() 51 | self.torch_loss = torch_loss 52 | self.weights = weights 53 | 54 | def forward(self, input: List[torch.Tensor], target: List[torch.Tensor]): 55 | loss = 0 56 | weight_sum = 0 57 | for i, (_input, _target) in enumerate(zip(input, target)): 58 | if self.weights: 59 | weight = self.weights[i] 60 | else: 61 | weight = 1 62 | _target = _target.to(_input.device) 63 | mask = torch.bitwise_not(torch.isnan(_target)) 64 | loss += weight * self.torch_loss(_input[mask], _target[mask]) 65 | weight_sum += weight 66 | 67 | return loss / weight_sum 68 | -------------------------------------------------------------------------------- /toad/nn/loss_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import Dataset, DataLoader 5 | 6 | from .module import Module 7 | from .loss import DictLoss, ListLoss 8 | 9 | DATASET_SIZE = 20000 10 | NUM_FEATS = 784 11 | NUM_CLASSES = 2 12 | 13 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype=torch.float) 14 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 15 | 16 | 17 | class DictDataset(Dataset): 18 | def __init__(self, x, y): 19 | super().__init__() 20 | self.x = x 21 | self.y = y 22 | 23 | def __len__(self): 24 | return self.x.shape[0] 25 | 26 | def __getitem__(self, item): 27 | return self.x[item], {'y': self.y[item]} 28 | 29 | 30 | class ListDataset(Dataset): 31 | def __init__(self, x, y): 32 | super().__init__() 33 | self.x = x 34 | self.y = y 35 | 36 | def __len__(self): 37 | return self.x.shape[0] 38 | 39 | def __getitem__(self, item): 40 | return self.x[item], [self.y[item]] 41 | 42 | 43 | class TestDictModel(Module): 44 | def __init__(self, in_feats, out_feats): 45 | super().__init__() 46 | 47 | self.linear = nn.Linear(in_feats, out_feats) 48 | 49 | def forward(self, x): 50 | x = self.linear(x) 51 | return {'y': F.relu(x)} 52 | 53 | def fit_step(self, batch, loss=None): 54 | x, y = batch 55 | y_hat = self(x) 56 | return loss(y_hat, y) 57 | 58 | 59 | class TestListModel(Module): 60 | def __init__(self, in_feats, out_feats): 61 | super().__init__() 62 | 63 | self.linear = nn.Linear(in_feats, out_feats) 64 | 65 | def forward(self, x): 66 | x = self.linear(x) 67 | return [F.relu(x)] 68 | 69 | def fit_step(self, batch, loss=None): 70 | x, y = batch 71 | y_hat = self(x) 72 | return loss(y_hat, y) 73 | 74 | 75 | def test_dict_loss(): 76 | model = TestDictModel(NUM_FEATS, NUM_CLASSES) 77 | loader = DataLoader( 78 | DictDataset(X, y), 79 | batch_size=128, 80 | shuffle=True, 81 | ) 82 | model.fit(loader, epoch=1, loss=DictLoss(F.cross_entropy)) 83 | 84 | 85 | def test_list_loss(): 86 | model = TestListModel(NUM_FEATS, NUM_CLASSES) 87 | loader = DataLoader( 88 | ListDataset(X, y), 89 | batch_size=128, 90 | shuffle=True, 91 | ) 92 | model.fit(loader, epoch=1, loss=ListLoss(F.cross_entropy)) 93 | -------------------------------------------------------------------------------- /toad/nn/module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import nn, optim 4 
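# --- Example: a minimal usage sketch for DictLoss from loss.py above
# (toy tensors; any torch loss with the `loss(input, target)` shape works,
# and the weights dict here is an arbitrary choice).
import torch
import torch.nn.functional as F
from toad.nn.loss import DictLoss

loss_fn = DictLoss(F.mse_loss, weights = {'y': 1.0})
pred = {'y': torch.rand(8)}
target = {'y': torch.rand(8)}
print(loss_fn(pred, target))  # single weighted scalar loss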
| from torch.nn.parallel import DistributedDataParallel 5 | 6 | from .trainer.history import get_current_history 7 | from ..utils.progress import Progress 8 | 9 | 10 | 11 | class Module(nn.Module): 12 | """base module for every model 13 | 14 | Examples: 15 | >>> from toad.nn import Module 16 | ... from torch import nn 17 | ... 18 | ... class Net(Module): 19 | ... def __init__(self, inputs, hidden, outputs): 20 | ... super().__init__() 21 | ... self.model = nn.Sequential( 22 | ... nn.Linear(inputs, hidden), 23 | ... nn.ReLU(), 24 | ... nn.Linear(hidden, outputs), 25 | ... nn.Sigmoid(), 26 | ... ) 27 | ... 28 | ... def forward(self, x): 29 | ... return self.model(x) 30 | ... 31 | ... def fit_step(self, batch): 32 | ... x, y = batch 33 | ... y_hat = self(x) 34 | ... 35 | ... # log into history 36 | ... self.log('y', y) 37 | ... self.log('y_hat', y_hat) 38 | ... 39 | ... return nn.functional.mse_loss(y_hat, y) 40 | ... 41 | ... model = Net(10, 4, 1) 42 | ... 43 | ... model.fit(train_loader) 44 | 45 | """ 46 | def __init__(self): 47 | """define model structure 48 | """ 49 | super().__init__() 50 | 51 | 52 | @property 53 | def device(self): 54 | """device of model 55 | """ 56 | return next(self.parameters()).device 57 | 58 | 59 | def fit(self, loader, trainer = None, optimizer = None, loss = None, early_stopping = None, **kwargs): 60 | """train model 61 | 62 | Args: 63 | loader (DataLoader): loader for training model 64 | trainer (Trainer): trainer for training model 65 | optimizer (torch.Optimizer): the default optimizer is `Adam(lr = 1e-3)` 66 | loss (Callable): could be called as 'loss(y_hat, y)' 67 | early_stopping (earlystopping): the default value is `loss_stopping`, 68 | you can set it to `False` to disable early stopping 69 | epoch (int): number of epochs for the training loop 70 | callback (callable): callable function will be called every epoch 71 | """ 72 | if trainer is None: 73 | from .trainer import Trainer 74 | trainer = Trainer(self, loader, optimizer = optimizer, loss = loss, early_stopping = early_stopping) 75 | 76 | trainer.train(**kwargs) 77 | 78 | 79 | def evaluate(self, loader, trainer = None): 80 | """evaluate model 81 | 82 | Args: 83 | loader (DataLoader): loader for evaluating model 84 | trainer (Trainer): trainer for evaluating model 85 | """ 86 | if trainer is None: 87 | from .trainer import Trainer 88 | trainer = Trainer(self) 89 | 90 | return trainer.evaluate(loader) 91 | 92 | 93 | 94 | def fit_step(self, batch, loss = None, *args, **kwargs): 95 | """step for fitting 96 | 97 | Args: 98 | batch (Any): batch data from dataloader 99 | loss (Callable): could be called as 'loss(y_hat, y)' 100 | 101 | Returns: 102 | Tensor: loss of this step 103 | """ 104 | x, y = batch 105 | y_hat = self.__call__(x) 106 | if loss is None: 107 | loss = nn.functional.mse_loss 108 | return loss(y_hat, y) 109 | 110 | 111 | def save(self, path): 112 | """save model 113 | """ 114 | torch.save(self.state_dict(), path) 115 | 116 | 117 | def load(self, path): 118 | """load model 119 | """ 120 | state = torch.load(path) 121 | self.load_state_dict(state) 122 | 123 | 124 | def log(self, key, value): 125 | """log values to history 126 | 127 | Args: 128 | key (str): name of message 129 | value (Tensor): tensor of values 130 | """ 131 | history = get_current_history() 132 | if history is None: 133 | return 134 | 135 | return history.log(key, value) 136 | 137 | 138 | def distributed(self, backend = None, **kwargs): 139 | """get distributed model 140 | """ 141 | if not torch.distributed.is_initialized(): 142 | if
backend is None: 143 | # choose a backend 144 | backend = 'nccl' if torch.distributed.is_nccl_available() else 'gloo' 145 | 146 | torch.distributed.init_process_group(backend, **kwargs) 147 | 148 | return DistModule(self) 149 | 150 | 151 | 152 | class DistModule(DistributedDataParallel): 153 | """distributed module class 154 | """ 155 | def fit(self, *args, **kwargs): 156 | return self.module.fit(*args, **kwargs) 157 | 158 | def save(self, *args, **kwargs): 159 | return self.module.save(*args, **kwargs) 160 | 161 | def load(self, *args, **kwargs): 162 | return self.module.load(*args, **kwargs) 163 | 164 | def log(self, *args, **kwargs): 165 | return self.module.log(*args, **kwargs) 166 | 167 | 168 | -------------------------------------------------------------------------------- /toad/nn/module_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import TensorDataset, DataLoader 5 | 6 | from .module import Module 7 | 8 | DATASET_SIZE = 20000 9 | NUM_FEATS = 784 10 | NUM_CLASSES = 2 11 | 12 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float) 13 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long) 14 | 15 | loader = DataLoader( 16 | TensorDataset(X, y), 17 | batch_size = 128, 18 | shuffle = True, 19 | ) 20 | 21 | class TestModel(Module): 22 | def __init__(self, in_feats, out_feats): 23 | super().__init__() 24 | 25 | self.linear = nn.Linear(in_feats, out_feats) 26 | 27 | def forward(self, x): 28 | x = self.linear(x) 29 | return F.relu(x) 30 | 31 | def fit_step(self, batch): 32 | x, y = batch 33 | y_hat = self(x) 34 | return F.cross_entropy(y_hat, y) 35 | 36 | def test_model(): 37 | model = TestModel(NUM_FEATS, NUM_CLASSES) 38 | model.fit(loader, epoch = 1) 39 | 40 | 41 | def test_fit_callback(): 42 | h_list = [] 43 | 44 | def func(history, epoch): 45 | h_list.append(history) 46 | 47 | model = TestModel(NUM_FEATS, NUM_CLASSES) 48 | model.fit(loader, epoch = 2, callback = func) 49 | assert len(h_list) == 2 50 | 51 | 52 | class TestModel2(TestModel): 53 | def fit_step(self, batch, loss=None): 54 | x, y = batch 55 | y_hat = self(x) 56 | return loss(y_hat, y) 57 | 58 | 59 | def test_model_loss(): 60 | model = TestModel2(NUM_FEATS, NUM_CLASSES) 61 | model.fit(loader, epoch=1, loss=F.cross_entropy) 62 | -------------------------------------------------------------------------------- /toad/nn/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .history import History, get_current_history 2 | from .callback import callback 3 | from .earlystop import earlystopping 4 | from .trainer import Trainer 5 | 6 | __all__ = [ 7 | 'History', 8 | 'get_current_history', 9 | 'callback', 10 | 'earlystopping', 11 | 'Trainer', 12 | ] 13 | -------------------------------------------------------------------------------- /toad/nn/trainer/callback.py: -------------------------------------------------------------------------------- 1 | from ...utils.decorator import Decorator 2 | 3 | class callback(Decorator): 4 | """callback for trainer 5 | 6 | Examples: 7 | >>> @callback 8 | ... def savemodel(model): 9 | ... model.save("path_to_file") 10 | ... 11 | ... 
trainer.train(model, callback = savemodel) 12 | 13 | """ 14 | def __init__(self, *args, **kwargs): 15 | if hasattr(self, 'wrapped'): 16 | # use `wrapped` func as core func 17 | super().__init__(getattr(self, 'wrapped')) 18 | # setup configuration 19 | self.setup(*args, **kwargs) 20 | return 21 | 22 | # init normal decorator 23 | super().__init__(*args, **kwargs) 24 | 25 | 26 | def setup_func(self, func): 27 | import inspect 28 | self._params = inspect.signature(func).parameters 29 | 30 | return func 31 | 32 | 33 | def wrapper(self, **kwargs): 34 | params = {k: v for k, v in kwargs.items() if k in self._params.keys()} 35 | 36 | return self.call(**params) 37 | 38 | 39 | 40 | class checkpoint(callback): 41 | """ 42 | Args: 43 | dirpath (string): dir name for saving checkpoints 44 | every (int): save every n epochs 45 | filename (string): checkpoint file name format 46 | """ 47 | dirpath = "model_checkpoints" 48 | every = 1 49 | filename = "{name}-{epoch}.pt" 50 | 51 | 52 | def wrapper(self, **kwargs): 53 | model = kwargs.get("model") 54 | epoch = kwargs.get("epoch") 55 | 56 | name = type(model).__name__ 57 | 58 | from pathlib import Path 59 | dirpath = Path(self.dirpath) 60 | dirpath.mkdir(parents = True, exist_ok = True) 61 | 62 | filename = self.filename.format( 63 | name = name, 64 | epoch = epoch, 65 | ) 66 | 67 | path = dirpath / filename 68 | 69 | if epoch % self.every == 0: 70 | super().wrapper( 71 | path = path, 72 | **kwargs 73 | ) 74 | 75 | 76 | class savemodel(checkpoint): 77 | """ 78 | Args: 79 | dirpath (string): dir name for saving checkpoints 80 | every (int): save every n epochs 81 | filename (string): checkpoint file name format, default is `{name}-{epoch}.pt` 82 | """ 83 | def wrapped(self, model, path): 84 | import torch 85 | torch.save(model.state_dict(), path) 86 | -------------------------------------------------------------------------------- /toad/nn/trainer/callback_test.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .callback import callback, savemodel 4 | from ..module import Module 5 | 6 | class TestModel(Module): 7 | def __init__(self, in_feats, out_feats): 8 | super().__init__() 9 | 10 | self.linear = nn.Linear(in_feats, out_feats) 11 | 12 | 13 | def test_callback(): 14 | @callback 15 | def hook(history, trainer): 16 | return history['a'] 17 | 18 | res = hook(epoch = 1, trainer = None, history = {"a": 3}) 19 | 20 | assert res == 3 21 | 22 | def test_checkpoint(): 23 | model = TestModel(10, 2) 24 | hook = savemodel(dirpath = '/dev', filename = "null") 25 | hook(model = model, epoch = 1) 26 | -------------------------------------------------------------------------------- /toad/nn/trainer/earlystop.py: -------------------------------------------------------------------------------- 1 | from .callback import callback 2 | from ...utils.decorator import Decorator 3 | 4 | 5 | class earlystopping(callback): 6 | """ 7 | Examples: 8 | >>> @earlystopping(delta = 1e-3, patience = 5) 9 | ... def auc(history): 10 | ...
return AUC(history['y_hat'], history['y']) 11 | """ 12 | delta = -1e-3 13 | patience = 10 14 | skip = 0 15 | 16 | def setup(self, delta = -1e-3, patience = 10, skip = 0): 17 | """ 18 | Args: 19 | delta (float): stop training if diff of new score is smaller than delta 20 | patience (int): patience of rounds to stop training 21 | skip (int): n rounds from starting training to warm up 22 | """ 23 | self.direction = 1.0 if delta > 0 else -1.0 24 | self.delta = delta * self.direction 25 | self.patience = patience 26 | self.skip = skip 27 | 28 | self.reset() 29 | 30 | 31 | def get_best_state(self): 32 | """get best state of model 33 | """ 34 | return self.best_state 35 | 36 | 37 | def reset(self): 38 | """ 39 | """ 40 | self.best_score = float('inf') * (-self.direction) 41 | self.best_state = None 42 | self._times = 0 43 | 44 | 45 | def wrapper(self, model, trainer = None, epoch = 0, **kwargs): 46 | # set skip round 47 | if epoch < self.skip: 48 | return False 49 | 50 | score = super().wrapper(model = model, epoch = epoch, **kwargs) 51 | diff = (score - self.best_score) * self.direction 52 | 53 | if diff > self.delta: 54 | self.best_state = model.state_dict() 55 | self.best_score = score 56 | self._times = 0 57 | return False 58 | 59 | self._times += 1 60 | if self._times >= self.patience: 61 | model.load_state_dict(self.best_state) 62 | 63 | if trainer: 64 | trainer.terminate() 65 | 66 | return True 67 | 68 | 69 | class loss_stopping(earlystopping): 70 | """scoring function 71 | """ 72 | def wrapped(self, history): 73 | return history['loss'].mean() 74 | -------------------------------------------------------------------------------- /toad/nn/trainer/earlystop_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .earlystop import earlystopping 3 | 4 | 5 | 6 | def test_earlystopping(): 7 | model = torch.nn.Linear(10, 10) 8 | 9 | @earlystopping(delta = -1, patience = 3) 10 | def scoring(history): 11 | return history['loss'] 12 | 13 | rounds = [] 14 | for i in range(10): 15 | if scoring(model = model, history = {"loss": 1}): 16 | break 17 | 18 | rounds.append(i) 19 | 20 | assert len(rounds) == 3 21 | 22 | 23 | def test_best_state(): 24 | model = torch.nn.Linear(10, 1) 25 | 26 | @earlystopping(delta = -1, patience = 1) 27 | def scoring(history): 28 | return history['loss'] 29 | 30 | with torch.no_grad(): 31 | model.weight.fill_(1.) 32 | 33 | # save init weight 34 | scoring(model = model, history = {"loss": 10}) 35 | assert scoring.best_state["weight"].sum().item() == 10 36 | 37 | # change weight 38 | with torch.no_grad(): 39 | model.weight.fill_(0.) 
40 | 41 | # save best weight 42 | scoring(model = model, history = {"loss": 5}) 43 | assert scoring.best_state["weight"].sum().item() == 0 44 | 45 | -------------------------------------------------------------------------------- /toad/nn/trainer/event.py: -------------------------------------------------------------------------------- 1 | from .callback import callback as Callback 2 | 3 | 4 | class Event: 5 | def __init__(self): 6 | self._events = {} 7 | 8 | def register(self, event, handler, every = 1): 9 | """register an event handler 10 | """ 11 | if not isinstance(handler, Callback): 12 | handler = Callback(handler) 13 | 14 | if event not in self._events: 15 | self._events[event] = [] 16 | 17 | handler._event_count = 0 18 | handler._event_every = every 19 | 20 | self._events[event].append(handler) 21 | 22 | 23 | def on(self, event, **kwargs): 24 | def wrapper(handler): 25 | self.register(event, handler, **kwargs) 26 | return handler 27 | 28 | return wrapper 29 | 30 | 31 | def emit(self, event, *args, **kwargs): 32 | """emit event 33 | """ 34 | if event not in self._events: 35 | return 36 | 37 | # trigger handler 38 | for handler in self._events[event]: 39 | # increase count 40 | handler._event_count += 1 41 | 42 | # trigger event 43 | if handler._event_count % handler._event_every == 0: 44 | handler(*args, **kwargs) 45 | 46 | 47 | def mute(self, event): 48 | """remove all handlers of an event 49 | """ 50 | if event in self._events: 51 | self._events.pop(event) 52 | -------------------------------------------------------------------------------- /toad/nn/trainer/event_test.py: -------------------------------------------------------------------------------- 1 | from .event import Event 2 | 3 | 4 | def test_event_trigger(): 5 | e = Event() 6 | 7 | counts = 0 8 | 9 | @e.on("test:trigger") 10 | def func(): 11 | nonlocal counts 12 | counts += 1 13 | 14 | e.emit("test:trigger") 15 | 16 | assert counts == 1 17 | 18 | 19 | def test_event_trigger_every(): 20 | e = Event() 21 | 22 | counts = 0 23 | 24 | @e.on("test:trigger", every = 2) 25 | def func(): 26 | nonlocal counts 27 | counts += 1 28 | 29 | for i in range(10): 30 | e.emit("test:trigger") 31 | 32 | assert counts == 5 33 |
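# --- Example: a minimal sketch of registering a handler directly with
# register() instead of the on() decorator; `epoch:end` is a made-up event
# name. emit() passes kwargs through, and the callback wrapper above filters
# them down to the handler's signature.
from toad.nn.trainer.event import Event

e = Event()

def on_epoch_end(epoch):
    print('epoch', epoch, 'done')

e.register('epoch:end', on_epoch_end, every = 2)

for i in range(4):
    e.emit('epoch:end', epoch = i, model = None)  # extra kwargs are dropped
# prints only for i = 1 and i = 3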
np.isscalar(value): 62 | value = np.array([value]) 63 | 64 | if not isinstance(value, np.ndarray): 65 | raise TypeError("value should be `torch.Tensor` or `scalar`") 66 | 67 | self._push(key, value) 68 | 69 | 70 | def start(self): 71 | global _history_stack 72 | _history_stack.append(self) 73 | 74 | return self 75 | 76 | 77 | def end(self): 78 | global _history_stack 79 | return _history_stack.pop() 80 | 81 | 82 | def __enter__(self): 83 | return self.start() 84 | 85 | 86 | def __exit__(self, exc_type, exc_val, exc_tb): 87 | return self.end() 88 | -------------------------------------------------------------------------------- /toad/nn/trainer/history_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .history import History, get_current_history 4 | 5 | def test_history_log(): 6 | history = History() 7 | 8 | for i in range(10): 9 | history.log('tensor', torch.rand(3, 5)) 10 | 11 | assert history['tensor'].shape == (30, 5) 12 | 13 | 14 | def test_current_history(): 15 | history = History() 16 | 17 | with history: 18 | h = get_current_history() 19 | h.log('tensor', torch.rand(3, 5)) 20 | 21 | assert history['tensor'].shape == (3, 5) 22 | -------------------------------------------------------------------------------- /toad/nn/trainer/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/nn/trainer/metrics.py -------------------------------------------------------------------------------- /toad/nn/trainer/trainer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from torch.utils.data import TensorDataset, DataLoader 7 | 8 | from .history import History 9 | from ..module import Module 10 | from .trainer import Trainer 11 | from .callback import callback 12 | from .earlystop import earlystopping 13 | 14 | 15 | DATASET_SIZE = 20000 16 | NUM_FEATS = 784 17 | NUM_CLASSES = 2 18 | 19 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float) 20 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long) 21 | 22 | loader = DataLoader( 23 | TensorDataset(X, y), 24 | batch_size = 128, 25 | shuffle = True, 26 | ) 27 | 28 | class TestModel(Module): 29 | def __init__(self, in_feats, out_feats): 30 | super().__init__() 31 | 32 | self.linear = nn.Linear(in_feats, out_feats) 33 | 34 | def forward(self, x): 35 | x = self.linear(x) 36 | return F.relu(x) 37 | 38 | def fit_step(self, batch): 39 | x, y = batch 40 | y_hat = self(x) 41 | return F.cross_entropy(y_hat, y) 42 | 43 | 44 | def test_trainer(): 45 | model = TestModel(NUM_FEATS, NUM_CLASSES) 46 | trainer = Trainer(model, loader) 47 | trainer.train(epoch = 2) 48 | assert len(trainer.history) == 2 49 | 50 | 51 | def test_trainer_early_stopping(): 52 | model = TestModel(NUM_FEATS, NUM_CLASSES) 53 | 54 | @earlystopping(delta = -1.0, patience = 3) 55 | def scoring(history): 56 | return history['loss'].mean() 57 | 58 | trainer = Trainer(model, loader, early_stopping = scoring) 59 | trainer.train(epoch = 200) 60 | assert len(trainer.history) == 4 61 | 62 | 63 | def test_trainer_fit_step(): 64 | model = TestModel(NUM_FEATS, NUM_CLASSES) 65 | trainer = Trainer(model, loader) 66 | step_count = 0 67 | 68 | @trainer.fit_step 69 | def step(model, batch): 70 | x, y = batch 71 | y_hat = 
model(x) 72 | nonlocal step_count 73 | step_count += 1 74 | return F.cross_entropy(y_hat, y) 75 | 76 | trainer.train(epoch = 2) 77 | assert step_count > 1 78 | 79 | 80 | def test_multi_callbacks(): 81 | log = {} 82 | 83 | @callback 84 | def log_epoch(epoch): 85 | log['epoch'] = epoch 86 | 87 | @callback 88 | def log_loss(history): 89 | log['loss'] = history['loss'] 90 | 91 | model = TestModel(NUM_FEATS, NUM_CLASSES) 92 | trainer = Trainer(model) 93 | trainer.train(loader, epoch = 2, callback = [log_epoch, log_loss]) 94 | 95 | assert log['epoch'] == 1 96 | assert len(log['loss']) == 157 97 | 98 | 99 | def test_trainer_evaluate(): 100 | model = TestModel(NUM_FEATS, NUM_CLASSES) 101 | trainer = Trainer(model, loader) 102 | 103 | @trainer.fit_step 104 | def step(model, batch): 105 | x, y = batch 106 | y_hat = model(x) 107 | return F.cross_entropy(y_hat, y) 108 | 109 | history = trainer.evaluate(loader) 110 | 111 | assert len(history["loss"]) == 157 112 | 113 | 114 | 115 | class TestModel2(TestModel): 116 | def fit_step(self, batch, loss=None): 117 | x, y = batch 118 | y_hat = self(x) 119 | return loss(y_hat, y) 120 | 121 | 122 | def test_trainer_loss(): 123 | model = TestModel2(NUM_FEATS, NUM_CLASSES) 124 | trainer = Trainer(model, loader, loss = F.cross_entropy) 125 | trainer.train(epoch = 2) 126 | assert len(trainer.history) == 2 127 | 128 | 129 | # def test_trainer_distributed(): 130 | # model = TestModel(NUM_FEATS, NUM_CLASSES) 131 | # trainer = Trainer(model, loader) 132 | # trainer.distributed(workers = 2) 133 | # trainer.train(epoch = 5) 134 | 135 | 136 | 137 | ### distributed model test 138 | # from toad.nn.trainer.trainer import Trainer 139 | # from torchvision.transforms import ToTensor 140 | # import torch 141 | # from torch import nn 142 | # from torchvision import datasets 143 | # from toad.nn import Module 144 | # from torch.utils.data import DataLoader 145 | # import ray 146 | 147 | # class NeuralNetwork(Module): 148 | # def __init__(self): 149 | # super(NeuralNetwork, self).__init__() 150 | # self.flatten = nn.Flatten() 151 | # self.linear_relu_stack = nn.Sequential( 152 | # nn.Linear(28 * 28, 512), 153 | # nn.ReLU(), 154 | # nn.Linear(512, 512), 155 | # nn.ReLU(), 156 | # nn.Linear(512, 10), 157 | # nn.ReLU(), 158 | # ) 159 | # def forward(self, x): 160 | # x = self.flatten(x) 161 | # logits = self.linear_relu_stack(x) 162 | # return logits 163 | # def fit_step(self, batch): 164 | # X, y = batch 165 | # pred = self(X) 166 | # loss_fn=nn.CrossEntropyLoss() 167 | # return loss_fn(pred, y) 168 | 169 | 170 | # @pytest.mark.skip("distributed trainer skip") 171 | # def test_distribute_example(): 172 | # training_data = datasets.FashionMNIST( 173 | # root="~/data", 174 | # train=True, 175 | # download=True, 176 | # transform=ToTensor(), 177 | # ) 178 | # # Download test data from open datasets. 179 | # test_data = datasets.FashionMNIST( 180 | # root="~/data", 181 | # train=False, 182 | # download=True, 183 | # transform=ToTensor(), 184 | # ) 185 | # worker_batch_size = 64 // 4 186 | # # Create data loaders. 
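# (illustrative note, not in the original: `worker_batch_size` above assumes the global batch of 64 is split across 4 distributed workers, 64 // 4 = 16, matching the `batch_size=16` used below)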
 187 | # train_dataloader = DataLoader(training_data, batch_size=16) 188 | # test_dataloader = DataLoader(test_data, batch_size=16) 189 | # model=NeuralNetwork() 190 | # optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 191 | # trainer=Trainer(model,train_dataloader,optimizer) 192 | # trainer.distributed(address="ray://172.20.144.21:10001",num_works=4,use_gpu=False) 193 | # trainer.train(epoch=1) 194 | # trainer.evaluate(test_dataloader) 195 | -------------------------------------------------------------------------------- /toad/nn/zoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoencoder import BaseAutoEncoder, VAE 2 | -------------------------------------------------------------------------------- /toad/nn/zoo/autoencoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn.functional import relu, binary_cross_entropy 4 | 5 | from ..module import Module 6 | 7 | 8 | 9 | class BaseAutoEncoder(Module): 10 | def __init__(self, input, hidden, zipped): 11 | super().__init__() 12 | 13 | self.encoder = nn.Sequential( 14 | nn.Linear(input, hidden), 15 | nn.ReLU(), 16 | nn.Linear(hidden, zipped), 17 | ) 18 | 19 | self.decoder = nn.Sequential( 20 | nn.Linear(zipped, hidden), 21 | nn.ReLU(), 22 | nn.Linear(hidden, input), 23 | ) 24 | 25 | self.loss = nn.MSELoss() 26 | 27 | 28 | def encode(self, x): 29 | return self.encoder(x) 30 | 31 | def decode(self, x): 32 | return self.decoder(x) 33 | 34 | def forward(self, x): 35 | z = self.encode(x) 36 | return self.decode(z) 37 | 38 | def fit_step(self, x): 39 | return self.loss(self(x), x) 40 | 41 | 42 | 43 | class VAE(Module): 44 | def __init__(self, input, hidden, zipped): 45 | super().__init__() 46 | 47 | self.hidden_layer = nn.Linear(input, hidden) 48 | 49 | self.mu_layer = nn.Linear(hidden, zipped) 50 | self.var_layer = nn.Linear(hidden, zipped) 51 | 52 | self.decoder = nn.Sequential( 53 | nn.Linear(zipped, hidden), 54 | nn.ReLU(), 55 | nn.Linear(hidden, input), 56 | ) 57 | 58 | self.loss = nn.MSELoss() 59 | 60 | def encode(self, x): 61 | h = relu(self.hidden_layer(x)) 62 | mu = self.mu_layer(h) 63 | var = self.var_layer(h) # log-variance of the latent distribution 64 | 65 | std = torch.exp(var / 2) 66 | eps = torch.randn_like(std) # standard normal noise for the reparameterization trick 67 | 68 | z = mu + eps * std 69 | return z, mu, var 70 | 71 | def decode(self, x): 72 | return self.decoder(x) 73 | 74 | def forward(self, x): 75 | z, mu, var = self.encode(x) 76 | x_hat = self.decode(z) 77 | return x_hat, mu, var 78 | 79 | def fit_step(self, x): 80 | x_hat, mu, var = self(x) 81 | l = self.loss(x_hat, x) 82 | kld = -0.5 * torch.sum(1 + var - torch.pow(mu, 2) - torch.exp(var)) 83 | 84 | loss = l + kld 85 | return loss 86 | -------------------------------------------------------------------------------- /toad/nn/zoo/autoencoder_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import pytest 4 | import numpy as np 5 | from torch.utils.data import TensorDataset, DataLoader 6 | 7 | from .autoencoder import BaseAutoEncoder, VAE 8 | 9 | # skip testing with python 3.9 on linux 10 | if sys.version_info >= (3, 9) and sys.platform.startswith('linux'): 11 | pytest.skip("failed with python 3.9 on linux, need fix!", allow_module_level = True) 12 | 13 | 14 | X = torch.Tensor(np.random.rand(20000, 784)) 15 | 16 | loader = DataLoader( 17 | X, 18 | batch_size = 128, 19 | shuffle = True, 20 | ) 21 | 22 | def test_ae(): 23 | ae = 
BaseAutoEncoder(784, 200, 10) 24 | ae.fit(loader, epoch = 1) 25 | 26 | def test_vae(): 27 | vae = VAE(784, 200, 10) 28 | vae.fit(loader, epoch = 1) 29 | -------------------------------------------------------------------------------- /toad/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import roc_curve 5 | 6 | from .stats import IV, feature_bin_stats 7 | from .metrics import AUC 8 | from .tadpole import tadpole 9 | from .tadpole.utils import HEATMAP_CMAP, MAX_STYLE, add_annotate, add_text, reset_ylim 10 | from .utils import unpack_tuple, generate_str 11 | 12 | def badrate_plot(frame, x = None, target = 'target', by = None, 13 | freq = None, format = None, return_counts = False, 14 | return_proportion = False, return_frame = False): 15 | """plot for badrate 16 | 17 | Args: 18 | frame (DataFrame) 19 | x (str): column in frame that will be used as x axis 20 | target (str): target column in frame 21 | by (str): column in frame to group by when calculating badrate 22 | freq (str): offset aliases string by pandas 23 | http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases 24 | format (str): format string for time 25 | return_counts (bool): whether to return the counts plot 26 | return_proportion (bool): whether to return the proportion plot 27 | return_frame (bool): whether to return the grouping detail frame 28 | Returns: 29 | Axes: badrate plot 30 | Axes: counts plot 31 | Axes: proportion plot 32 | Dataframe: grouping detail data 33 | """ 34 | frame = frame.copy() 35 | markers = True 36 | 37 | if not isinstance(target, str): 38 | temp_name = generate_str() 39 | frame[temp_name] = target 40 | target = temp_name 41 | 42 | grouper = x 43 | if freq is not None: 44 | frame.loc[:, x] = pd.to_datetime(frame[x], format = format) 45 | grouper = pd.Grouper(key = x, freq = freq) 46 | 47 | if by is not None: 48 | grouper = [by, grouper] 49 | 50 | styles_count = frame[by].nunique() 51 | if styles_count > MAX_STYLE: 52 | markers = ['o'] * styles_count 53 | 54 | group = frame.groupby(grouper) 55 | table = group[target].agg(['sum', 'count']).reset_index() 56 | table['badrate'] = table['sum'] / table['count'] 57 | 58 | # set number dtype to object 59 | if np.issubdtype(table[x].dtype, np.number): 60 | table[x] = table[x].astype(str) 61 | 62 | 63 | rate_plot = tadpole.lineplot( 64 | x = x, 65 | y = 'badrate', 66 | hue = by, 67 | style = by, 68 | data = table, 69 | legend = 'full', 70 | markers = markers, 71 | dashes = False, 72 | ) 73 | 74 | # set y axis start with 0 75 | rate_plot.set_ylim(0, None) 76 | 77 | res = (rate_plot,) 78 | 79 | if return_counts: 80 | count_plot = tadpole.barplot( 81 | x = x, 82 | y = 'count', 83 | hue = by, 84 | data = table, 85 | ) 86 | res += (count_plot,) 87 | 88 | 89 | if return_proportion: 90 | table['prop'] = 0 91 | for v in table[x].unique(): 92 | mask = (table[x] == v) 93 | table.loc[mask, 'prop'] = table[mask]['count'] / table[mask]['count'].sum() 94 | 95 | prop_plot = tadpole.barplot( 96 | x = x, 97 | y = 'prop', 98 | hue = by, 99 | data = table, 100 | ) 101 | res += (prop_plot,) 102 | 103 | 104 | if return_frame: 105 | res += (table,) 106 | 107 | return unpack_tuple(res) 108 | 109 | 110 | def corr_plot(frame, figure_size = (20, 15), ax = None): 111 | """plot for correlation 112 | 113 | Args: 114 | frame (DataFrame): frame to draw plot 115 | Returns: 116 | Axes 117 | """ 118 | corr = frame.corr() 119 | 120 | mask = np.zeros_like(corr, dtype = bool) 121 | mask[np.triu_indices_from(mask)] = True 122 
| 123 | map_plot = tadpole.heatmap( 124 | corr, 125 | mask = mask, 126 | cmap = HEATMAP_CMAP, 127 | vmax = 1, 128 | vmin = -1, 129 | center = 0, 130 | square = True, 131 | cbar_kws = {"shrink": .5}, 132 | linewidths = .5, 133 | annot = True, 134 | fmt = '.2f', 135 | figure_size = figure_size, 136 | ax = ax, 137 | ) 138 | 139 | return map_plot 140 | 141 | 142 | def proportion_plot(x = None, keys = None, ax = None): 143 | """plot for comparing proportion in different dataset 144 | 145 | Args: 146 | x (Series|list): series or list of series data for plot 147 | keys (str|list): keys for each data 148 | 149 | Returns: 150 | Axes 151 | """ 152 | if not isinstance(x, list): 153 | x = [x] 154 | 155 | if keys is None: 156 | keys = [ 157 | x[ix].name 158 | if hasattr(x[ix], 'name') and x[ix].name is not None 159 | else ix 160 | for ix in range(len(x)) 161 | ] 162 | elif isinstance(keys, str): 163 | keys = [keys] 164 | 165 | x = map(pd.Series, x) 166 | data = pd.concat(x, keys = keys, names = ['keys']).reset_index() 167 | data = data.rename(columns = {data.columns[2]: 'value'}) 168 | 169 | prop_data = data.groupby('keys')['value'].value_counts( 170 | normalize = True, 171 | dropna = False, 172 | ).rename('proportion').reset_index() 173 | 174 | prop_plot = tadpole.barplot( 175 | x = 'value', 176 | y = 'proportion', 177 | hue = 'keys', 178 | data = prop_data, 179 | ax = ax, 180 | ) 181 | 182 | return prop_plot 183 | 184 | 185 | def roc_plot(score, target, compare = None, figsize = (14, 10), ax = None): 186 | """plot for roc 187 | 188 | Args: 189 | score (array-like): predicted score 190 | target (array-like): true target 191 | compare (array-like): another score for comparing with score 192 | 193 | Returns: 194 | Axes 195 | """ 196 | auc, fpr, tpr, thresholds = AUC(score, target, return_curve = True) 197 | 198 | if ax is None: 199 | fig, ax = plt.subplots(1, 1, figsize = figsize) 200 | 201 | ax.plot(fpr, tpr, label = 'ROC curve (area = %0.5f)' % auc) 202 | ax.fill_between(fpr, tpr, alpha = 0.3) 203 | if compare is not None: 204 | c_auc, c_fpr, c_tpr, _ = AUC(compare, target, return_curve = True) 205 | ax.plot(c_fpr, c_tpr, label = 'ROC compare (area = %0.5f)' % c_auc) 206 | ax.fill_between(c_fpr, c_tpr, alpha = 0.3) 207 | 208 | ax.plot([0, 1], [0, 1], color = 'red', linestyle = '--') 209 | plt.legend(loc = "lower right") 210 | 211 | return ax 212 | 213 | def ks_plot(score, target, figsize = (14, 10), ax = None): 214 | """plot for ks 215 | 216 | Args: 217 | score (array-like): predicted score 218 | target (array-like): true target 219 | figsize (tuple): size of the figure (width, height) 220 | 221 | Returns: 222 | Axes 223 | """ 224 | fpr, tpr, thresholds = roc_curve(target, score) 225 | 226 | if ax is None: 227 | fig, ax = plt.subplots(1, 1, figsize = figsize) 228 | 229 | ax.plot(thresholds[1 : ], tpr[1 : ], label = 'tpr') 230 | ax.plot(thresholds[1 : ], fpr[1 : ], label = 'fpr') 231 | ax.plot(thresholds[1 : ], (tpr - fpr)[1 : ], label = 'ks') 232 | 233 | ax.invert_xaxis() 234 | ax.legend() 235 | 236 | ks_value = max(tpr - fpr) 237 | x = np.argwhere(abs(fpr - tpr) == ks_value)[0, 0] 238 | thred_value = thresholds[x] 239 | ax.axvline(thred_value, color = 'r', linestyle = '--') 240 | plt.title(f'ks:{ks_value:.5f} threshold:{thred_value:.5f}') 241 | 242 | return ax 243 | 244 | def bin_plot(frame, x = None, target = 'target', iv = True, annotate_format = ".2f", 245 | return_frame = False, figsize = (12, 6), ax = None): 246 | """plot for bins 247 | 248 | Args: 249 | frame (DataFrame) 250 | x (str): 
column in frame that will be used as x axis 251 | target (str): target column in frame 252 | iv (bool): whether to show the IV value in the plot 253 | annotate_format (str): format string for the annotations of the chart 254 | return_frame (bool): whether to return the bin stats frame 255 | figsize (tuple): size of the figure (width, height) 256 | 257 | Returns: 258 | Axes: bin plot; DataFrame (when return_frame is True): contains good, bad, badrate, prop, y_prop, n_prop, woe, iv 259 | """ 260 | frame = frame.copy() 261 | 262 | if not isinstance(target, str): 263 | temp_name = generate_str() 264 | frame[temp_name] = target 265 | target = temp_name 266 | 267 | table = feature_bin_stats(frame, x, target) 268 | 269 | if ax is None: 270 | fig, ax = plt.subplots(figsize=figsize) 271 | 272 | ax = tadpole.barplot( 273 | x = x, 274 | y = 'prop', 275 | data = table, 276 | color = '#82C6E2', 277 | ax = ax, 278 | ) 279 | 280 | ax = add_annotate(ax, format = annotate_format) 281 | 282 | badrate_ax = ax.twinx() 283 | badrate_ax.grid(False) 284 | 285 | badrate_ax = tadpole.lineplot( 286 | x = x, 287 | y = 'badrate', 288 | data = table, 289 | color = '#D65F5F', 290 | ax = badrate_ax, 291 | ) 292 | 293 | badrate_ax.set_ylim([0, None]) 294 | badrate_ax = add_annotate(badrate_ax, format = annotate_format) 295 | 296 | if iv: 297 | ax = reset_ylim(ax) 298 | ax = add_text(ax, 'IV: {:.5f}'.format(table['iv'].sum())) 299 | 300 | res = (ax,) 301 | 302 | if return_frame: 303 | res += (table,) 304 | 305 | return unpack_tuple(res) 306 | -------------------------------------------------------------------------------- /toad/plot_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .plot import ( 6 | badrate_plot, 7 | corr_plot, 8 | proportion_plot, 9 | roc_plot, 10 | bin_plot, 11 | ) 12 | 13 | np.random.seed(1) 14 | 15 | LENGTH = 500 16 | 17 | A = np.random.rand(LENGTH) 18 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan 19 | 20 | B = np.random.randint(100, size = LENGTH) 21 | C = A + np.random.normal(0, 0.2, LENGTH) 22 | D = A + np.random.normal(0, 0.1, LENGTH) 23 | 24 | E = np.random.rand(LENGTH) 25 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan 26 | 27 | F = B + np.random.normal(0, 10, LENGTH) 28 | 29 | target = np.random.randint(2, size = LENGTH) 30 | 31 | frame = pd.DataFrame({ 32 | 'A': A, 33 | 'B': B, 34 | 'C': C, 35 | 'D': D, 36 | 'E': E, 37 | 'F': F, 38 | }) 39 | 40 | frame['target'] = target 41 | 42 | 43 | def test_badrate_plot(): 44 | g = badrate_plot( 45 | frame, 46 | x = 'A', 47 | target = 'target', 48 | return_counts = True, 49 | return_proportion = True, 50 | ) 51 | 52 | def test_badrate_plot_y_axis(): 53 | g = badrate_plot( 54 | frame, 55 | x = 'A', 56 | target = 'target', 57 | ) 58 | bottom, _ = g.get_ylim() 59 | assert bottom == 0 60 | 61 | def test_corr_plot(): 62 | g = corr_plot(frame) 63 | 64 | 65 | def test_proportion_plot(): 66 | g = proportion_plot(x = frame['target']) 67 | 68 | 69 | def test_roc_plot(): 70 | g = roc_plot(frame['B'], frame['target']) 71 | 72 | 73 | def test_bin_plot(): 74 | g = bin_plot(frame, x = 'B', target = 'target') 75 | 76 | 77 | def test_bin_plot_return_frame(): 78 | g, df = bin_plot(frame, x = 'B', target = 'target', return_frame = True) 79 | assert df.shape == (100, 10) 80 | -------------------------------------------------------------------------------- /toad/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .process import Processing, Mask, 
F 2 | from .partition import Partition, TimePartition -------------------------------------------------------------------------------- /toad/preprocessing/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class Partition: 6 | def partition(self, data): 7 | """partition data 8 | 9 | Args: 10 | data (DataFrame): dataframe 11 | 12 | Yields: 13 | ndarray[bool]: boolean mask selecting the rows of the current partition 14 | str: suffix string for the current partition 15 | """ 16 | yield np.ones(len(data)).astype(bool), '' 17 | 18 | 19 | 20 | class TimePartition(Partition): 21 | """partition data by time delta 22 | 23 | Args: 24 | base (str): column name of base time 25 | filter (str): column name of the time to compare against base 26 | times (list): list of time deltas 27 | 28 | Example: 29 | 30 | >>> TimePartition('apply_time', 'query_time', ['30d', '90d', 'all']) 31 | 32 | """ 33 | def __init__(self, base, filter, times): 34 | self.base = base 35 | self.filter = filter 36 | self.times = times 37 | 38 | 39 | def partition(self, data): 40 | base = pd.to_datetime(data[self.base]) 41 | filter = pd.to_datetime(data[self.filter]) 42 | 43 | for t in self.times: 44 | if t != 'all': 45 | delta = pd.Timedelta(t) 46 | mask = filter > (base - delta) 47 | else: 48 | mask = np.ones(len(filter)).astype(bool) 49 | 50 | yield mask, '_' + t 51 | 52 | 53 | class ValuePartition(Partition): 54 | """partition data by column values 55 | 56 | Args: 57 | column (str): name of the column whose values define the partitions 58 | 59 | Example: 60 | 61 | >>> ValuePartition('status') 62 | 63 | """ 64 | def __init__(self, column): 65 | self.column = column 66 | 67 | 68 | def partition(self, data): 69 | data = data[self.column] 70 | unique = data.unique() 71 | 72 | for u in unique: 73 | if pd.isna(u): 74 | mask = data.isna() 75 | else: 76 | mask = (data == u) 77 | 78 | yield mask, '_' + str(u) -------------------------------------------------------------------------------- /toad/preprocessing/partition_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | from .partition import TimePartition, ValuePartition 7 | 8 | 9 | np.random.seed(1) 10 | 11 | ab = np.array(list('ABCDEFG')) 12 | 13 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500) 14 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500) 15 | A = ab[np.random.choice(7, 500)] 16 | B = np.random.randint(10, size = 500).astype(float) 17 | B[np.random.choice(500, 10)] = np.nan 18 | 19 | 20 | df = pd.DataFrame({ 21 | 'history': history, 22 | 'open_time': open_time, 23 | 'A': A, 24 | 'B': B, 25 | }) 26 | 27 | 28 | def test_timepartition(): 29 | tp = TimePartition('open_time', 'history', ['90d', '180d']) 30 | mask, suffix = next(tp.partition(df)) 31 | assert mask.sum() == 93 32 | 33 | 34 | def test_timepartition_all(): 35 | tp = TimePartition('open_time', 'history', ['all']) 36 | mask, suffix = next(tp.partition(df)) 37 | assert mask.sum() == 500 38 | 39 | def test_valuepartition(): 40 | vp = ValuePartition('A') 41 | mask, suffix = next(vp.partition(df)) 42 | assert mask.sum() == 67 43 | 44 | def test_valuepartition_with_na(): 45 | vp = ValuePartition('B') 46 | s = 0 47 | for mask, suffix in vp.partition(df): 48 | s += mask.sum() 49 | 50 | assert s == 500 
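# --- illustrative sketch (not part of the original suite) ---
# A custom partition only needs to subclass `Partition` and yield
# `(mask, suffix)` pairs: `mask` is a boolean row selector and `suffix`
# is appended to the output column names by `Processing`.
# `EvenOddPartition` is a hypothetical example.
from .partition import Partition

class EvenOddPartition(Partition):
    def partition(self, data):
        # split rows by even/odd positional index
        mask = np.arange(len(data)) % 2 == 0
        yield mask, '_even'
        yield ~mask, '_odd'

def test_evenodd_partition_covers_all_rows():
    total = 0
    for mask, suffix in EvenOddPartition().partition(df):
        total += mask.sum()
    assert total == len(df)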
-------------------------------------------------------------------------------- /toad/preprocessing/process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | _ALL_SYMBOL_ = '__all_symbol__' 5 | 6 | class Processing: 7 | """ 8 | 9 | Examples: 10 | 11 | >>> (Processing(data) 12 | ... .groupby('id') 13 | ... .partitionby(TimePartition( 14 | ... 'base_time', 15 | ... 'filter_time', 16 | ... ['30d', '60d', '180d', '365d', 'all'] 17 | ... )) 18 | ... .apply({'A': ['max', 'min', 'mean']}) 19 | ... .apply({'B': ['max', 'min', 'mean']}) 20 | ... .apply({'C': 'nunique'}) 21 | ... .apply({'D': { 22 | ... 'f': len, 23 | ... 'name': 'normal_count', 24 | ... 'mask': Mask('D').isin(['normal']), 25 | ... }}) 26 | ... .apply({'id': 'count'}) 27 | ... .exec() 28 | ... ) 29 | """ 30 | def __init__(self, data): 31 | self.data = data 32 | self.funcs = {} 33 | self.partitions = None 34 | 35 | def groupby(self, name): 36 | """group data by name 37 | 38 | Args: 39 | name (str): column name in data 40 | """ 41 | self._groupby = name 42 | return self 43 | 44 | def apply(self, f): 45 | """apply functions to data 46 | 47 | Args: 48 | f (dict|function): a config dict that keys are the column names and 49 | values are the functions, it will take the column series as the 50 | functions argument. if `f` is a function, it will take the whole 51 | dataframe as the argument. 52 | 53 | """ 54 | if not isinstance(f, dict): 55 | f = { 56 | _ALL_SYMBOL_: f 57 | } 58 | 59 | for k, v in f.items(): 60 | self.append_func(k, v) 61 | 62 | return self 63 | 64 | 65 | def append_func(self, col, func): 66 | if not isinstance(func, (list, tuple)): 67 | func = [func] 68 | 69 | if col not in self.funcs: 70 | self.funcs[col] = [] 71 | 72 | for f in func: 73 | self.funcs[col].append(self._convert_func(f)) 74 | 75 | 76 | def _convert_func(self, f): 77 | if isinstance(f, F): 78 | return f 79 | 80 | if not isinstance(f, dict): 81 | f = {'f': f} 82 | 83 | return F(**f) 84 | 85 | 86 | def partitionby(self, p): 87 | """partition data to multiple pieces, processing will process to all the pieces 88 | 89 | Args: 90 | p (Partition) 91 | """ 92 | self.partitions = p 93 | return self 94 | 95 | def exec(self): 96 | if self.partitions is None: 97 | return self.process(self.data) 98 | 99 | res = None 100 | for mask, suffix in self.partitions.partition(self.data): 101 | data = self.process(self.data[mask]) 102 | data = data.add_suffix(suffix) 103 | 104 | if res is None: 105 | res = data 106 | continue 107 | 108 | res = res.join(data, how = 'outer') 109 | 110 | return res 111 | 112 | 113 | 114 | def process(self, data): 115 | group = data.groupby(self._groupby) 116 | 117 | res = [] 118 | for col, l in self.funcs.items(): 119 | for f in l: 120 | g = group 121 | 122 | if f.need_filter: 123 | g = f.filter(data).groupby(self._groupby) 124 | 125 | if f.is_buildin: 126 | r = getattr(g[col], f.name)() 127 | r.name = f.name 128 | else: 129 | if col == _ALL_SYMBOL_: 130 | col = None 131 | 132 | r = g.apply(f, col = col) 133 | 134 | if isinstance(r, pd.Series): 135 | r = pd.DataFrame(r) 136 | 137 | res.append(r.add_prefix(col + '_')) 138 | 139 | return pd.concat(res, axis=1) 140 | 141 | 142 | 143 | class Mask: 144 | """a placeholder to select dataframe 145 | """ 146 | def __init__(self, column = None): 147 | self.column = column 148 | self.operators = [] 149 | 150 | def push(self, op, value): 151 | self.operators.append({ 152 | 'op': op, 153 | 'value': value, 154 | }) 155 | 156 | def replay(self, 
data): 157 | base = data 158 | if self.column is not None: 159 | base = data[self.column] 160 | 161 | for item in self.operators: 162 | v = item['value'] 163 | 164 | if isinstance(v, Mask): 165 | v = v.replay(data) 166 | 167 | f = getattr(base, item['op']) 168 | 169 | if v is None: 170 | base = f() 171 | continue 172 | 173 | base = f(v) 174 | 175 | return base 176 | 177 | def __eq__(self, other): 178 | self.push('__eq__', other) 179 | return self 180 | 181 | def __lt__(self, other): 182 | self.push('__lt__', other) 183 | return self 184 | 185 | def __gt__(self, other): 186 | self.push('__gt__', other) 187 | return self 188 | 189 | def __le__(self, other): 190 | self.push('__le__', other) 191 | return self 192 | 193 | def __ge__(self, other): 194 | self.push('__ge__', other) 195 | return self 196 | 197 | def __invert__(self): 198 | self.push('__invert__', None) 199 | return self 200 | 201 | def __and__(self, other): 202 | self.push('__and__', other) 203 | return self 204 | 205 | def __or__(self, other): 206 | self.push('__or__', other) 207 | return self 208 | 209 | def __xor__(self, other): 210 | self.push('__xor__', other) 211 | return self 212 | 213 | def isin(self, other): 214 | self.push('isin', other) 215 | return self 216 | 217 | def isna(self): 218 | self.push('isna', None) 219 | return self 220 | 221 | 222 | 223 | class F: 224 | """function class for processing 225 | """ 226 | def __init__(self, f, name = None, mask = None): 227 | self.f = f 228 | 229 | if name is None: 230 | if self.is_buildin: 231 | name = f 232 | else: 233 | name = f.__name__ 234 | 235 | self.__name__ = name 236 | 237 | self.mask = mask 238 | 239 | @property 240 | def name(self): 241 | return self.__name__ 242 | 243 | @property 244 | def is_buildin(self): 245 | return isinstance(self.f, str) 246 | 247 | @property 248 | def need_filter(self): 249 | return self.mask is not None 250 | 251 | def __call__(self, data, *args, col = None, **kwargs): 252 | if col in data: 253 | data = data[col] 254 | 255 | r = self.f(data, *args, **kwargs) 256 | 257 | if not isinstance(r, dict): 258 | r = { 259 | self.name: r 260 | } 261 | 262 | return pd.Series(r) 263 | 264 | 265 | def filter(self, data): 266 | if self.mask is None: 267 | return data 268 | 269 | mask = self.mask 270 | if isinstance(self.mask, Mask): 271 | mask = self.mask.replay(data) 272 | 273 | return data[mask] 274 | 275 | -------------------------------------------------------------------------------- /toad/preprocessing/process_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | from .process import Processing, Mask, F 7 | 8 | 9 | np.random.seed(1) 10 | 11 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500) 12 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500) 13 | A = np.random.randint(10, size = 500) 14 | B = np.random.rand(500) 15 | B[np.random.choice(500, 10)] = np.nan 16 | 17 | 18 | df = pd.DataFrame({ 19 | 'history': history, 20 | 'open_time': open_time, 21 | 'A': A, 22 | 'B': B, 23 | }) 24 | 25 | 26 | def test_mask(): 27 | m = Mask('A') > 3 28 | assert m.replay(df).sum() == (A > 3).sum() 29 | 30 | 31 | def test_mask_without_name(): 32 | m = Mask() > 3 33 | assert m.replay(A).sum() == (A > 3).sum() 34 | 35 | def test_mask_isin(): 36 | m = Mask('A').isin([1,2,3]) 37 | assert m.replay(df).sum() == df['A'].isin([1,2,3]).sum() 38 | 39 | def test_mask_isna(): 40 | m = 
Mask('A').isna() 41 | assert m.replay(df).sum() == df['A'].isna().sum() 42 | 43 | def test_f(): 44 | assert F(len)(A)[0] == 500 45 | 46 | def test_processing(): 47 | res = ( 48 | Processing(df) 49 | .groupby('open_time') 50 | .apply({'A': ['min', 'mean']}) 51 | .apply({'B': [ 52 | { 53 | 'f': 'size', 54 | 'mask': Mask('A') > 1, 55 | }, 56 | { 57 | 'f': len, 58 | }, 59 | ]}) 60 | .exec() 61 | ) 62 | 63 | assert res.size == 120 and res.loc['2020-02-29', 'B_size'] == 23 64 | 65 | 66 | def test_processing_with_partition(): 67 | from .partition import ValuePartition 68 | res = ( 69 | Processing(df) 70 | .groupby('open_time') 71 | .partitionby(ValuePartition('A')) 72 | .apply({'B': ['mean', 'size']}) 73 | .exec() 74 | ) 75 | 76 | assert res.size == 600 and res.loc['2020-02-29', 'B_size_1'] == 2 -------------------------------------------------------------------------------- /toad/scorecard_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | from .scorecard import ScoreCard, WOETransformer, Combiner 7 | 8 | np.random.seed(1) 9 | 10 | # Create a testing dataframe and a scorecard model. 11 | 12 | ab = np.array(list('ABCDEFG')) 13 | feature = np.random.randint(10, size = 500) 14 | target = np.random.randint(2, size = 500) 15 | str_feat = ab[np.random.choice(7, 500)] 16 | 17 | df = pd.DataFrame({ 18 | 'A': feature, 19 | 'B': str_feat, 20 | 'C': ab[np.random.choice(2, 500)], 21 | 'D': np.ones(500), 22 | }) 23 | 24 | card_config = { 25 | 'A': { 26 | '[-inf ~ 3)': 100, 27 | '[3 ~ 5)': 200, 28 | '[5 ~ 8)': 300, 29 | '[8 ~ inf)': 400, 30 | 'nan': 500, 31 | }, 32 | 'B': { 33 | ','.join(list('ABCD')): 200, 34 | ','.join(list('EF')): 400, 35 | 'else': 500, 36 | }, 37 | 'C': { 38 | 'A': 200, 39 | 'B': 100, 40 | }, 41 | } 42 | 43 | combiner = Combiner() 44 | bins = combiner.fit_transform(df, target, n_bins = 5) 45 | woe_transer = WOETransformer() 46 | woe = woe_transer.fit_transform(bins, target) 47 | 48 | # create a score card 49 | card = ScoreCard( 50 | combiner = combiner, 51 | transer = woe_transer, 52 | ) 53 | card.fit(woe, target) 54 | 55 | 56 | FUZZ_THRESHOLD = 1e-6 57 | TEST_SCORE = pytest.approx(453.5702462572068, FUZZ_THRESHOLD) 58 | TEST_PROBA = pytest.approx(0.4673322872985267, FUZZ_THRESHOLD) 59 | 60 | 61 | def test_representation(): 62 | repr(card) 63 | 64 | 65 | def test_load(): 66 | card = ScoreCard().load(card_config) 67 | score = card.predict(df) 68 | assert score[200] == 600 69 | 70 | 71 | def test_load_after_init_combiner(): 72 | card = ScoreCard( 73 | combiner = combiner, 74 | transer = woe_transer, 75 | ) 76 | card.load(card_config) 77 | score = card.predict(df) 78 | assert score[200] == 600 79 | 80 | 81 | def test_proba_to_score(): 82 | model = LogisticRegression() 83 | model.fit(woe, target) 84 | 85 | proba = model.predict_proba(woe)[:, 1] 86 | score = card.proba_to_score(proba) 87 | assert score[404] == TEST_SCORE 88 | 89 | 90 | def test_score_to_prob(): 91 | score = card.predict(df) 92 | proba = card.score_to_proba(score) 93 | assert proba[404] == TEST_PROBA 94 | 95 | 96 | def test_predict(): 97 | score = card.predict(df) 98 | assert score[404] == TEST_SCORE 99 | 100 | 101 | def test_predict_proba(): 102 | proba = card.predict_proba(df) 103 | assert proba[404, 1] == TEST_PROBA 104 | 105 | 106 | def test_card_feature_effect(): 107 | """ 108 | verify the `base effect of each feature` is consistent with assumption 109 | 
FEATURE_EFFECT is manually calculated with following logic: 110 | FEATURE_EFFECT = np.median(card.woe_to_score(df),axis = 0) 111 | """ 112 | FEATURE_EFFECT = pytest.approx(np.array([142.26368948220417, 152.82747912111066, 148.82665746001695, 0.]), FUZZ_THRESHOLD) 113 | assert card.base_effect.values == FEATURE_EFFECT 114 | 115 | 116 | def test_predict_sub_score(): 117 | score, sub = card.predict(df, return_sub=True) 118 | assert sub.loc[250, 'B'] == pytest.approx(162.09822360428146, FUZZ_THRESHOLD) 119 | 120 | 121 | def test_woe_to_score(): 122 | score = card.woe_to_score(woe) 123 | score = np.sum(score, axis=1) 124 | assert score[404] == TEST_SCORE 125 | 126 | 127 | def test_bin_to_score(): 128 | score = card.bin_to_score(bins) 129 | assert score[404] == TEST_SCORE 130 | 131 | 132 | def test_export_map(): 133 | card_map = card.export() 134 | assert card_map['B']['D'] == 159.26 135 | 136 | 137 | def test_card_map(): 138 | config = card.export() 139 | card_from_map = ScoreCard().load(config) 140 | score = card_from_map.predict(df) 141 | assert score[404] == 453.57 142 | 143 | 144 | def test_card_map_with_else(): 145 | card_from_map = ScoreCard().load(card_config) 146 | score = card_from_map.predict(df) 147 | assert score[80] == 1000 148 | 149 | 150 | def test_generate_testing_frame(): 151 | card = ScoreCard().load(card_config) 152 | frame = card.testing_frame() 153 | assert frame.loc[4, 'B'] == 'E' 154 | 155 | 156 | def test_export_frame(): 157 | card = ScoreCard().load(card_config) 158 | frame = card.export(to_frame=True) 159 | rows = frame[(frame['name'] == 'B') & (frame['value'] == 'else')].reset_index() 160 | assert rows.loc[0, 'score'] == 500 161 | 162 | 163 | def test_card_combiner_number_not_match(): 164 | c = combiner.export() 165 | c['A'] = [0, 3, 6, 8] 166 | com = Combiner().load(c) 167 | bins = com.transform(df) 168 | woe_transer = WOETransformer() 169 | woe = woe_transer.fit_transform(bins, target) 170 | 171 | card = ScoreCard( 172 | combiner=com, 173 | transer=woe_transer, 174 | ) 175 | 176 | with pytest.raises(Exception) as e: 177 | # will raise an exception when fitting a card 178 | card.fit(woe, target) 179 | 180 | assert '\'A\' is not matched' in str(e.value) 181 | 182 | 183 | def test_card_combiner_str_not_match(): 184 | c = combiner.export() 185 | c['C'] = [['A'], ['B'], ['C']] 186 | com = Combiner().load(c) 187 | bins = com.transform(df) 188 | woe_transer = WOETransformer() 189 | woe = woe_transer.fit_transform(bins, target) 190 | 191 | card = ScoreCard( 192 | combiner=com, 193 | transer=woe_transer, 194 | ) 195 | 196 | with pytest.raises(Exception) as e: 197 | # will raise an exception when fitting a card 198 | card.fit(woe, target) 199 | 200 | assert '\'C\' is not matched' in str(e.value) 201 | 202 | 203 | def test_card_with_less_X(): 204 | x = woe.drop(columns='A') 205 | card = ScoreCard( 206 | combiner=combiner, 207 | transer=woe_transer, 208 | ) 209 | 210 | card.fit(x, target) 211 | assert card.predict(df)[200] == pytest.approx(457.5903160102142, FUZZ_THRESHOLD) 212 | 213 | 214 | def test_card_predict_with_unknown_feature(): 215 | np.random.seed(9) 216 | unknown_df = df.copy() 217 | unknown_df.loc[200, 'C'] = 'U' 218 | assert card.predict(unknown_df)[200] == pytest.approx(456.41288777297257, FUZZ_THRESHOLD) 219 | 220 | 221 | def test_card_predict_with_unknown_feature_default_max(): 222 | np.random.seed(9) 223 | unknown_df = df.copy() 224 | unknown_df.loc[200, 'C'] = 'U' 225 | score, sub = card.predict(unknown_df, default = 'max', return_sub = True) 226 | 227 | 
assert sub.loc[200, 'C'] == card['C']['scores'].max() 228 | assert score[200] == pytest.approx(462.2871531373114, FUZZ_THRESHOLD) 229 | 230 | 231 | def test_card_predict_with_unknown_feature_default_with_value(): 232 | np.random.seed(9) 233 | unknown_df = df.copy() 234 | unknown_df.loc[200, 'C'] = 'U' 235 | score, sub = card.predict(unknown_df, default = 42, return_sub = True) 236 | 237 | assert sub.loc[200, 'C'] == 42 238 | assert score[200] == pytest.approx(355.46049567729443, FUZZ_THRESHOLD) 239 | 240 | 241 | def test_get_reason_vector(): 242 | """ 243 | verify the score reason of df is consistent with assumption 244 | DF_REASON is manually calculated with following logic: 245 | if score is lower than base_odds, select top k feature with lowest subscores where their corresponding subscores are lower than the base effect of features. 246 | if score is higher than base_odds, select top k feature with highest subscores where their corresponding subscores are higher than the base effect of features. 247 | 248 | e.g. xx.iloc[404] 249 | sub_scores: 151 159 143 0 250 | base_effect: 142 153 149 0 251 | diff_effect: +9 +6 -6 0 252 | 253 | total_score: 453(151+159+143+0) > base_odds(35) 254 | which is larger than base, hence, we try to find top `keep` features who contributed most to positivity 255 | find_largest_top_3: A(+9) B(+6) D(+0) 256 | """ 257 | reason = card.get_reason(df) 258 | assert reason.iloc[404]['top1'].tolist() == ['C', pytest.approx(142.9523920956781, FUZZ_THRESHOLD), 'B'] 259 | 260 | 261 | @pytest.mark.timeout(0.007) 262 | def test_predict_dict(): 263 | """ a test for scalar inference time cost """ 264 | proba = card.predict(df.iloc[404].to_dict()) 265 | assert proba == TEST_SCORE 266 | 267 | -------------------------------------------------------------------------------- /toad/selection_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .selection import drop_empty, drop_var, drop_corr, drop_iv, drop_vif, select, stepwise 6 | 7 | np.random.seed(1) 8 | 9 | LENGTH = 500 10 | 11 | A = np.random.rand(LENGTH) 12 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan 13 | 14 | B = np.random.randint(100, size = LENGTH) 15 | C = A + np.random.normal(0, 0.2, LENGTH) 16 | D = A + np.random.normal(0, 0.1, LENGTH) 17 | 18 | E = np.random.rand(LENGTH) 19 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan 20 | 21 | F = B + np.random.normal(0, 10, LENGTH) 22 | 23 | target = np.random.randint(2, size = LENGTH) 24 | 25 | frame = pd.DataFrame({ 26 | 'A': A, 27 | 'B': B, 28 | 'C': C, 29 | 'D': D, 30 | 'E': E, 31 | 'F': F, 32 | }) 33 | 34 | frame['target'] = target 35 | 36 | 37 | def test_drop_empty(): 38 | df = drop_empty(frame, threshold = 0.8) 39 | assert 'E' not in df 40 | 41 | def test_drop_var(): 42 | df = drop_var(frame, threshold = 0.1) 43 | assert 'A' not in df 44 | 45 | def test_drop_var_exclude(): 46 | df = drop_var(frame, threshold = 0.1, exclude = 'A') 47 | assert 'A' in df 48 | 49 | def test_drop_corr(): 50 | df = drop_corr(frame, target = 'target') 51 | assert set(['D', 'E', 'F', 'target']) == set(df.columns.tolist()) 52 | 53 | def test_drop_corr_with_string(): 54 | ab = np.array(list('ABCDEFG')) 55 | str_feat = pd.Series(ab[np.random.choice(7, 500)]) 56 | 57 | df = drop_corr(pd.concat((frame, str_feat.rename('str_f')), axis = 1), target = 'target') 58 | assert set(['D', 'E', 'F', 'target', 'str_f']) == set(df.columns.tolist()) 59 | 60 | def 
test_drop_iv(): 61 | df = drop_iv(frame, target = 'target', threshold = 0.25) 62 | assert 'B' not in df 63 | 64 | def test_select(): 65 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7) 66 | assert ['D', 'F', 'target'] == df.columns.tolist() 67 | 68 | def test_select_exclude(): 69 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7, exclude = ['A']) 70 | assert ['A', 'D', 'F', 'target'] == df.columns.tolist() 71 | 72 | def test_stepwise(): 73 | df = stepwise(frame.fillna(-1), target = 'target') 74 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 75 | 76 | def test_stepwise_backward(): 77 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'backward') 78 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 79 | 80 | def test_stepwise_forward(): 81 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'forward') 82 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 83 | 84 | def test_stepwise_exclude(): 85 | df = stepwise(frame.fillna(-1), target = 'target', exclude = 'B') 86 | assert ['B', 'C', 'E', 'F', 'target'] == df.columns.tolist() 87 | 88 | def test_stepwise_return_drop(): 89 | df, drop_list = stepwise(frame.fillna(-1), target = 'target', return_drop = True) 90 | assert ['B', 'A', 'D'] == drop_list 91 | 92 | def test_stepwise_lr(): 93 | df = stepwise(frame.fillna(-1), target = 'target', estimator = 'lr', direction = 'forward') 94 | assert ['C', 'target'] == df.columns.tolist() 95 | 96 | def test_stepwise_ks(): 97 | df = stepwise(frame.fillna(-1), target = 'target', criterion = 'ks', direction = 'forward') 98 | assert ['A', 'C', 'target'] == df.columns.tolist() 99 | 100 | def test_stepwise_zero(): 101 | df = pd.DataFrame({ 102 | 'X': np.zeros(500), 103 | 'Z': np.random.rand(500), 104 | 'Y': np.random.randint(2, size = 500), 105 | }) 106 | df = stepwise(df, target = 'Y') 107 | assert set(['Z', 'Y']) == set(df.columns.tolist()) 108 | 109 | def test_stepwise_forward_when_best_is_first(): 110 | df = frame[['E', 'F', 'B', 'A', 'D', 'C', 'target']] 111 | df = stepwise(df.fillna(-1), target = 'target', direction = 'forward') 112 | assert ['E', 'F', 'C', 'target'] == df.columns.tolist() 113 | 114 | def test_drop_vif(): 115 | df = drop_vif(frame.fillna(-1), exclude = 'target') 116 | assert ['C', 'F', 'target'] == df.columns.tolist() 117 | -------------------------------------------------------------------------------- /toad/stats_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .stats import IV, WOE, gini, gini_cond, entropy_cond, quality, _IV, VIF 6 | 7 | 8 | np.random.seed(1) 9 | 10 | feature = np.random.rand(500) 11 | target = np.random.randint(2, size = 500) 12 | A = np.random.randint(100, size = 500) 13 | B = np.random.randint(100, size = 500) 14 | mask = np.random.randint(8, size = 500) 15 | 16 | df = pd.DataFrame({ 17 | 'feature': feature, 18 | 'target': target, 19 | 'A': A, 20 | 'B': B, 21 | }) 22 | 23 | 24 | def test_woe(): 25 | value = WOE(0.2, 0.3) 26 | assert value == pytest.approx(-0.4054651081081643) 27 | 28 | def test_iv_priv(): 29 | value, _ = _IV(df['feature'], df['target']) 30 | assert value == pytest.approx(0.010385942643745403) 31 | 32 | def test_iv(): 33 | value = IV(df['feature'], df['target'], n_bins = 10, method = 'dt') 34 | assert value == pytest.approx(0.2735917707743619) 35 | 36 | def test_iv_return_sub(): 37 | _, sub = IV(mask, df['target'], return_sub = True, 
n_bins = 10, method = 'dt') 38 | assert len(sub) == 8 39 | assert sub[4] == pytest.approx(0.006449386778057019) 40 | 41 | def test_iv_frame(): 42 | res = IV(df, 'target', n_bins = 10, method = 'chi') 43 | assert res.loc[0, 'A'] == pytest.approx(0.226363832867123) 44 | 45 | def test_gini(): 46 | value = gini(df['target']) 47 | assert value == 0.499352 48 | 49 | def test_gini_cond(): 50 | value = gini_cond(df['feature'], df['target']) 51 | assert value == pytest.approx(0.4970162601626016) 52 | 53 | def test_entropy_cond(): 54 | value = entropy_cond(df['feature'], df['target']) 55 | assert value == pytest.approx(0.6924990371522171) 56 | 57 | def test_quality(): 58 | result = quality(df, 'target') 59 | assert result.loc['feature', 'iv'] == 0.2735917707743619 60 | assert result.loc['A', 'gini'] == 0.49284164671885444 61 | assert result.loc['B', 'entropy'] == pytest.approx(0.6924956879070063, 5e-5) 62 | assert result.loc['feature', 'unique'] == 500 63 | 64 | def test_quality_iv_only(): 65 | result = quality(df, 'target', iv_only = True) 66 | assert np.isnan(result.loc['feature', 'gini']) 67 | 68 | def test_quality_with_merge(): 69 | result = quality(df, 'target', n_bins = 5, method = 'chi') 70 | assert result.loc['feature', 'iv'] == 0.13367825777558 71 | 72 | def test_quality_object_type_array_with_nan(): 73 | feature = np.array([np.nan, 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype = 'O')[mask] 74 | 75 | df = pd.DataFrame({ 76 | 'feature': feature, 77 | 'target': target, 78 | }) 79 | result = quality(df) 80 | assert result.loc['feature', 'iv'] == 0.016379338180530334 81 | 82 | def test_vif(): 83 | vif = VIF(df) 84 | assert vif['A'] == 2.969336442640111 85 | -------------------------------------------------------------------------------- /toad/tadpole/__init__.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | 3 | sns.set_palette('muted') 4 | 5 | from .base import Tadpole 6 | from .utils import tadpole_axes 7 | 8 | 9 | tadpole = Tadpole() -------------------------------------------------------------------------------- /toad/tadpole/base.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | from .utils import ( 3 | get_axes, 4 | tadpole_axes, 5 | FIG_SIZE, 6 | ) 7 | 8 | class Tadpole: 9 | def __getattr__(self, name): 10 | t = getattr(sns, name) 11 | if callable(t): 12 | return self.wrapsns(t) 13 | 14 | return t 15 | 16 | def wrapsns(self, f): 17 | @tadpole_axes 18 | def wrapper(*args, figure_size = FIG_SIZE, **kwargs): 19 | kw = kwargs.copy() 20 | if 'ax' not in kw: 21 | kw['ax'] = get_axes(size = figure_size) 22 | 23 | try: 24 | return f(*args, **kw) 25 | except: 26 | return f(*args, **kwargs) 27 | 28 | return wrapper 29 | -------------------------------------------------------------------------------- /toad/tadpole/fonts/NotoSansCJKsc-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/tadpole/fonts/NotoSansCJKsc-Regular.otf -------------------------------------------------------------------------------- /toad/tadpole/func.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /toad/tadpole/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import seaborn as sns 3 
| from functools import wraps 4 | import matplotlib.pyplot as plt 5 | from matplotlib.axes import Axes 6 | from matplotlib.font_manager import FontProperties 7 | 8 | sns.set_palette('muted') 9 | 10 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__)) 11 | FONT_FILE = 'NotoSansCJKsc-Regular.otf' 12 | FONTS_PATH = os.path.join(CURRENT_PATH, 'fonts', FONT_FILE) 13 | myfont = FontProperties(fname = os.path.abspath(FONTS_PATH)) 14 | sns.set(font = myfont.get_family()) 15 | 16 | HEATMAP_CMAP = sns.diverging_palette(240, 10, as_cmap = True) 17 | MAX_STYLE = 6 18 | FIG_SIZE = (12, 6) 19 | 20 | def get_axes(size = FIG_SIZE): 21 | _, ax = plt.subplots(figsize = size) 22 | return ax 23 | 24 | def reset_legend(axes): 25 | if axes.get_legend() is not None: 26 | axes.legend( 27 | loc='center left', 28 | bbox_to_anchor=(1, 0.5), 29 | framealpha = 0, 30 | prop = myfont, 31 | ) 32 | 33 | return axes 34 | 35 | def reset_ticklabels(axes): 36 | labels = [] 37 | if axes.get_xticklabels(): 38 | labels += axes.get_xticklabels() 39 | 40 | if axes.get_yticklabels(): 41 | labels += axes.get_yticklabels() 42 | 43 | for label in labels: 44 | label.set_fontproperties(myfont) 45 | 46 | return axes 47 | 48 | def reset_xticks(axes): 49 | for label in axes.get_xticklabels(): 50 | label.set_ha('left') 51 | label.set_rotation(-25) 52 | 53 | return axes 54 | 55 | 56 | def reset_title(axes): 57 | title = axes.get_title() 58 | 59 | if title: 60 | axes.set_title(title, fontproperties = myfont) 61 | 62 | return axes 63 | 64 | 65 | def reset_xylabels(axes): 66 | y_label = axes.get_ylabel() 67 | if y_label: 68 | axes.set_ylabel(y_label, fontproperties = myfont) 69 | 70 | x_label = axes.get_xlabel() 71 | if x_label: 72 | axes.set_xlabel(x_label, fontproperties = myfont) 73 | 74 | return axes 75 | 76 | 77 | def reset_ylim(axes): 78 | # for axes and twins 79 | for ax in axes.figure.axes: 80 | if ax.bbox.bounds == axes.bbox.bounds: 81 | bottom, top = ax.get_ylim() 82 | top += (top - bottom) * 0.1 83 | ax.set_ylim(bottom, top) 84 | 85 | return axes 86 | 87 | 88 | def fix_axes(axes): 89 | if not isinstance(axes, Axes): 90 | return axes 91 | 92 | functions = [reset_title, reset_xylabels, reset_ticklabels, reset_legend, reset_xticks] 93 | 94 | for func in functions: 95 | func(axes) 96 | return axes 97 | 98 | def tadpole_axes(fn): 99 | @wraps(fn) 100 | def func(*args, **kwargs): 101 | res = fn(*args, **kwargs) 102 | 103 | if not isinstance(res, tuple): 104 | return fix_axes(res) 105 | 106 | r = tuple() 107 | for i in res: 108 | r += (fix_axes(i),) 109 | 110 | return r 111 | 112 | return func 113 | 114 | 115 | 116 | def annotate(ax, x, y, space = 5, format = ".2f"): 117 | """ 118 | """ 119 | va = 'bottom' 120 | 121 | if y < 0: 122 | space *= -1 123 | va = 'top' 124 | 125 | ax.annotate( 126 | ("{:"+ format +"}").format(y), 127 | (x, y), 128 | xytext = (0, space), 129 | textcoords = "offset points", 130 | ha = 'center', 131 | va = va, 132 | ) 133 | 134 | 135 | 136 | def add_bar_annotate(ax, **kwargs): 137 | """ 138 | """ 139 | for rect in ax.patches: 140 | y_value = rect.get_height() 141 | x_value = rect.get_x() + rect.get_width() / 2 142 | 143 | annotate(ax, x_value, y_value, **kwargs) 144 | 145 | return ax 146 | 147 | 148 | def add_line_annotate(ax, **kwargs): 149 | """ 150 | """ 151 | for line in ax.lines: 152 | points = line.get_xydata() 153 | 154 | for point in points: 155 | annotate(ax, point[0], point[1], **kwargs) 156 | 157 | return ax 158 | 159 | 160 | def add_annotate(ax, **kwargs): 161 | if len(ax.lines) > 0: 162 | 
add_line_annotate(ax, **kwargs) 163 | 164 | if len(ax.patches) > 0: 165 | add_bar_annotate(ax, **kwargs) 166 | 167 | return ax 168 | 169 | 170 | def add_text(ax, text, loc = 'top left', offset = (0.01, 0.04)): 171 | x_min, x_max = ax.get_xlim() 172 | y_min, y_max = ax.get_ylim() 173 | 174 | x_offset = (x_max - x_min) * offset[0] 175 | y_offset = (y_max - y_min) * offset[1] 176 | 177 | if loc == 'top left': 178 | loc = (x_min + x_offset, y_max - y_offset) 179 | elif loc == 'top right': 180 | loc = (x_max - x_offset, y_max - y_offset) 181 | elif loc == 'bottom left': 182 | loc = (x_min + x_offset, y_min + y_offset) 183 | elif loc == 'bottom right': 184 | loc = (x_max - x_offset, y_min + y_offset) 185 | 186 | ax.text(*loc, text, fontsize = 'x-large') 187 | 188 | return ax 189 | -------------------------------------------------------------------------------- /toad/transform_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from .transform import WOETransformer, Combiner, GBDTTransformer 10 | 11 | np.random.seed(1) 12 | 13 | ab = np.array(list('ABCDEFG')) 14 | feature = np.random.randint(10, size = 500) 15 | target = np.random.randint(2, size = 500) 16 | str_feat = ab[np.random.choice(7, 500)] 17 | uni_feat = np.ones(500) 18 | empty_feat = feature.astype(float) 19 | empty_feat[np.random.choice(500, 50, replace = False)] = np.nan 20 | 21 | df = pd.DataFrame({ 22 | 'A': feature, 23 | 'B': str_feat, 24 | 'C': uni_feat, 25 | 'D': empty_feat, 26 | 'target': target, 27 | }) 28 | 29 | 30 | 31 | def test_duplicated_keys(): 32 | dup_df = df.rename(columns = {"C": "A"}) 33 | with pytest.raises(Exception, match=r"X has duplicate keys `.*`"): 34 | WOETransformer().fit_transform(dup_df, target) 35 | 36 | def test_woe_transformer(): 37 | f = WOETransformer().fit_transform(feature, target) 38 | assert f[451] == pytest.approx(-0.17061154127869285) 39 | 40 | def test_woe_transformer_with_str(): 41 | f = WOETransformer().fit_transform(str_feat, target) 42 | assert f[451] == pytest.approx(-0.2198594761130199) 43 | 44 | def test_woe_transformer_with_unknown_group(): 45 | transer = WOETransformer().fit(str_feat, target) 46 | res = transer.transform(['Z'], default = 'min') 47 | assert res[0] == pytest.approx(-0.2198594761130199) 48 | 49 | def test_woe_transformer_frame(): 50 | res = WOETransformer().fit_transform(df, target) 51 | assert res.iloc[451, 1] == pytest.approx(-0.2198594761130199) 52 | 53 | def test_woe_transformer_dict(): 54 | transer = WOETransformer().fit(df, 'target') 55 | res = transer.transform({ 56 | "A": 6, 57 | "B": "C", 58 | "C": 1, 59 | "D": 2, 60 | }) 61 | assert res['B'].item() == pytest.approx(-0.09149433112609942) 62 | 63 | def test_woe_transformer_select_dtypes(): 64 | res = WOETransformer().fit_transform(df, target, select_dtypes = 'object') 65 | assert res.loc[451, 'A'] == 3 66 | 67 | def test_woe_transformer_exclude(): 68 | res = WOETransformer().fit_transform(df, target, exclude = 'A') 69 | assert res.loc[451, 'A'] == 3 70 | 71 | def test_woe_transformer_export_single(): 72 | transer = WOETransformer().fit(feature, target) 73 | t = transer.export() 74 | assert t[transer._default_name][5] == pytest.approx(0.3938235330926786) 75 | 76 | def test_woe_transformer_export(): 77 | transer = WOETransformer().fit(df, target) 78 | t = transer.export() 79 | assert t['C'][1] == 0 80 | 81 | 
def test_woe_transformer_load(): 82 | rules = { 83 | 'A': { 84 | 1: 0.1, 85 | 2: 0.2, 86 | 3: 0.3, 87 | } 88 | } 89 | 90 | transer = WOETransformer().load(rules) 91 | assert transer._rules['A']['woe'][1] == 0.2 92 | 93 | 94 | def test_combiner(): 95 | f = Combiner().fit_transform(feature, target, method = 'chi') 96 | assert f[451] == 3 97 | 98 | def test_combiner_with_str(): 99 | f = Combiner().fit_transform(str_feat, target, method = 'chi') 100 | assert f[451] == 0 101 | 102 | def test_combiner_unique_feature(): 103 | f = Combiner().fit_transform(uni_feat, target, method = 'chi') 104 | assert f[451] == 0 105 | 106 | def test_combiner_frame(): 107 | res = Combiner().fit_transform(df, target) 108 | assert res.iloc[404, 1] == 2 109 | 110 | def test_combiner_select_dtypes(): 111 | res = Combiner().fit_transform(df, target, select_dtypes = 'number') 112 | assert res.loc[451, 'B'] == 'G' 113 | 114 | def test_combiner_exclude(): 115 | res = Combiner().fit_transform(df, target, exclude = 'B') 116 | assert res.loc[451, 'B'] == 'G' 117 | 118 | def test_combiner_labels(): 119 | combiner = Combiner().fit(df, target) 120 | res = combiner.transform(df, labels = True) 121 | assert res.loc[451, 'A'] == '03.[3 ~ 4)' 122 | 123 | def test_combiner_single_feature(): 124 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 5) 125 | res = combiner.transform(df['A']) 126 | assert res[451] == 1 127 | 128 | def test_combiner_export(): 129 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4) 130 | bins = combiner.export() 131 | assert isinstance(bins['B'][0], list) 132 | 133 | def test_combiner_update(): 134 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4) 135 | combiner.update({'A': [1,2,3,4,5,6]}) 136 | bins = combiner.export() 137 | assert len(bins['A']) == 6 138 | 139 | def test_combiner_step(): 140 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 4) 141 | bins = combiner.export() 142 | assert bins['A'][1] == 4.5 143 | 144 | def test_combiner_target_in_frame(): 145 | combiner = Combiner().fit(df, 'target', n_bins = 4) 146 | bins = combiner.export() 147 | assert bins['A'][1] == 6 148 | 149 | def test_combiner_target_in_frame_kwargs(): 150 | combiner = Combiner().fit(df, y = 'target', n_bins = 4) 151 | bins = combiner.export() 152 | assert bins['A'][1] == 6 153 | 154 | def test_combiner_empty_separate(): 155 | combiner = Combiner() 156 | bins = combiner.fit_transform(df, 'target', n_bins = 4, empty_separate = True) 157 | mask = pd.isna(df['D']) 158 | assert (bins['D'][~mask] != 4).all() 159 | 160 | def test_combiner_labels_with_empty(): 161 | combiner = Combiner().fit(df, 'target', n_bins = 4, empty_separate = True) 162 | res = combiner.transform(df, labels = True) 163 | assert res.loc[2, 'D'] == '04.nan' 164 | 165 | def test_gbdt_transformer(): 166 | np.random.seed(1) 167 | 168 | df = pd.DataFrame({ 169 | 'A': np.random.rand(500), 170 | 'B': np.random.randint(10, size = 500), 171 | }) 172 | f = GBDTTransformer().fit_transform(df, target, n_estimators = 10, max_depth = 2) 173 | assert f.shape == (500, 40) 174 | -------------------------------------------------------------------------------- /toad/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .func import * 2 | from .decorator import * 3 | from .progress import Progress 4 | -------------------------------------------------------------------------------- /toad/utils/decorator.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from time import time 4 | from .func import save_json, read_json 5 | from functools import wraps, WRAPPER_ASSIGNMENTS 6 | 7 | 8 | 9 | class Decorator: 10 | """base decorator class 11 | """ 12 | _cls = None 13 | is_class = False 14 | 15 | def __init__(self, *args, is_class = False, **kwargs): 16 | self.is_class = is_class 17 | self.args = [] 18 | self.kwargs = {} 19 | 20 | if len(args) == 1 and callable(args[0]): 21 | self.fn = args[0] 22 | else: 23 | self.setup(*args, **kwargs) 24 | 25 | 26 | @property 27 | def fn(self): 28 | if hasattr(self, '__wrapped__'): 29 | return self.__wrapped__ 30 | 31 | return None 32 | 33 | @fn.setter 34 | def fn(self, func): 35 | if hasattr(self, 'setup_func'): 36 | func = self.setup_func(func) 37 | 38 | self.__wrapped__ = func 39 | 40 | def __call__(self, *args, **kwargs): 41 | if self.fn is None: 42 | self.fn = args[0] 43 | return self 44 | 45 | if self.is_class: 46 | self._cls = args[0] 47 | args = args[1:] 48 | 49 | return self.wrapper(*args, **kwargs) 50 | 51 | 52 | def __get__(self, instance, type = None): 53 | self.is_class = True 54 | self._cls = instance 55 | 56 | @wraps(self.__wrapped__) 57 | def func(*args, **kwargs): 58 | return self.__call__(instance, *args, **kwargs) 59 | 60 | return func 61 | 62 | 63 | def __getattribute__(self, name): 64 | if name in WRAPPER_ASSIGNMENTS: 65 | return getattr(self.__wrapped__, name) 66 | 67 | return object.__getattribute__(self, name) 68 | 69 | 70 | def setup(self, *args, **kwargs): 71 | self.args = args 72 | self.kwargs = kwargs 73 | 74 | for key in kwargs: 75 | setattr(self, key, kwargs[key]) 76 | 77 | 78 | def call(self, *args, **kwargs): 79 | if self._cls is not None: 80 | args = (self._cls, *args) 81 | 82 | return self.fn(*args, **kwargs) 83 | 84 | def wrapper(self, *args, **kwargs): 85 | return self.call(*args, **kwargs) 86 | 87 | 88 | class frame_exclude(Decorator): 89 | """decorator for excluding columns from a dataframe 90 | """ 91 | 92 | def wrapper(self, X, *args, exclude = None, **kwargs): 93 | if exclude is not None and isinstance(X, pd.DataFrame): 94 | X = X.drop(columns = exclude) 95 | 96 | return self.call(X, *args, **kwargs) 97 | 98 | 99 | class select_dtypes(Decorator): 100 | """decorator for selecting dataframe columns by dtype 101 | """ 102 | 103 | def wrapper(self, X, *args, select_dtypes = None, **kwargs): 104 | if select_dtypes is not None and isinstance(X, pd.DataFrame): 105 | X = X.select_dtypes(include = select_dtypes) 106 | 107 | return self.call(X, *args, **kwargs) 108 | 109 | 110 | class save_to_json(Decorator): 111 | """support saving the result to a json file 112 | """ 113 | def wrapper(self, *args, to_json = None, **kwargs): 114 | res = self.call(*args, **kwargs) 115 | 116 | if to_json is not None: 117 | save_json(res, to_json) 118 | 119 | return res 120 | 121 | 122 | class load_from_json(Decorator): 123 | """support loading data from a json file 124 | """ 125 | require_first = False 126 | 127 | def wrapper(self, *args, from_json = None, **kwargs): 128 | if from_json is not None: 129 | obj = read_json(from_json) 130 | args = (obj, *args) 131 | 132 | elif self.require_first and len(args) > 0 and isinstance(args[0], str): 133 | obj = read_json(args[0]) 134 | args = (obj, *args[1:]) 135 | 136 | return self.call(*args, **kwargs) 137 | 138 | 139 | class support_dataframe(Decorator): 140 | """decorator for supporting dataframe input, applying the function column by column 141 | """ 142 | require_target = True 143 | target = 'target' 144 | 145 | def 
wrapper(self, frame, *args, **kwargs): 146 | if not isinstance(frame, pd.DataFrame): 147 | return self.call(frame, *args, **kwargs) 148 | 149 | frame = frame.copy() 150 | if self.require_target and len(args) > 0 and isinstance(args[0], str): 151 | target = frame.pop(args[0]) 152 | args = (target,) + args[1:] 153 | elif self.target in kwargs and isinstance(kwargs[self.target], str): 154 | kwargs[self.target] = frame.pop(kwargs[self.target]) 155 | 156 | res = dict() 157 | for col in frame: 158 | r = self.call(frame[col], *args, **kwargs) 159 | 160 | if not isinstance(r, np.ndarray): 161 | r = [r] 162 | 163 | res[col] = r 164 | return pd.DataFrame(res) 165 | 166 | 167 | class proxy_docstring(Decorator): 168 | method_name = None 169 | 170 | def __get__(self, *args): 171 | func = super().__get__(*args) 172 | 173 | if self.method_name is not None and hasattr(self._cls, self.method_name): 174 | setattr(func, '__doc__', getattr(self._cls, self.method_name).__doc__) 175 | 176 | return func 177 | 178 | 179 | class support_numpy(Decorator): 180 | """decorator that lets a torch function accept numpy arrays 181 | """ 182 | def wrapper(self, *args, **kwargs): 183 | import torch 184 | 185 | has_numpy = False 186 | l_args = [] 187 | for a in args: 188 | if not isinstance(a, torch.Tensor): 189 | a = torch.tensor(a) 190 | has_numpy = True 191 | 192 | l_args.append(a) 193 | 194 | res = self.call(*l_args, **kwargs) 195 | 196 | # convert the result back to a numpy array only when the arguments contained one 197 | if has_numpy and isinstance(res, torch.Tensor): 198 | res = res.numpy() 199 | 200 | return res 201 | 202 | 203 | class xgb_loss(Decorator): 204 | """decorator for converting a function into an xgboost-compatible loss function 205 | 206 | Args: 207 | loss_func (callable): loss function 208 | **kwargs: other arguments for the loss function except `pred` and `label` 209 | 210 | Examples: 211 | 212 | >>> @xgb_loss(**kwargs) 213 | >>> def loss_func(pred, label, **kwargs): 214 | >>> ... 215 | >>> return loss 216 | >>> 217 | >>> # or use `xgb_loss` directly 218 | >>> xgb_func = xgb_loss(**kwargs)(loss_func) 219 | >>> 220 | >>> # use in xgb 221 | >>> model = xgb.XGBClassifier(objective = xgb_func) 222 | """ 223 | def wrapper(self, pred, label): 224 | from .func import derivative 225 | 226 | def partial_func(x): 227 | return self.call(x, label, **self.kwargs) 228 | 229 | grad = derivative(partial_func, pred, n=1, dx=1e-6) 230 | hess = derivative(partial_func, pred, n=2, dx=1e-6) 231 | 232 | return grad, hess 233 | 234 | 235 | class performance(Decorator): 236 | """decorator for analyzing code performance 237 | 238 | Args: 239 | loop (int): loop times, default `1` 240 | 241 | Examples: 242 | >>> @performance(loop = 100) 243 | >>> def func(): 244 | >>> ... # code 245 | >>> return res 246 | >>> 247 | >>> func() 248 | >>> 249 | >>> # or use `performance` in a `with` statement 250 | >>> with performance(): 251 | >>> ... 
# code 252 | """ 253 | loop = 1 254 | 255 | def wrapper(self, *args, **kwargs): 256 | costs = [] 257 | for _ in range(self.loop): 258 | start = time() 259 | res = self.call(*args, **kwargs) 260 | end = time() 261 | costs.append(end - start) 262 | 263 | self.analysis(costs) 264 | return res 265 | 266 | 267 | def analysis(self, costs): 268 | import numpy as np 269 | 270 | print('total cost: {:.5f}s'.format(np.sum(costs))) 271 | print("-"*40) 272 | data = { 273 | "Mean": np.mean(costs), 274 | "Min": np.min(costs), 275 | "Max": np.max(costs), 276 | "90%": np.percentile(costs, 90), 277 | "95%": np.percentile(costs, 95), 278 | "99%": np.percentile(costs, 99), 279 | } 280 | HEADER = "{:>8}"*len(data) 281 | BODY = "{:>7.3f}s"*len(data) 282 | print(HEADER.format(*data.keys())) 283 | print(BODY.format(*data.values())) 284 | 285 | 286 | def __enter__(self): 287 | self.start = time() 288 | return self 289 | 290 | def __exit__(self, exc_type, exc_value, traceback): 291 | self.end = time() 292 | self.analysis([self.end - self.start]) 293 | -------------------------------------------------------------------------------- /toad/utils/decorator_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .decorator import ( 6 | Decorator, 7 | frame_exclude, 8 | xgb_loss, 9 | performance, 10 | ) 11 | 12 | np.random.seed(1) 13 | 14 | 15 | def func(): 16 | "This is a doc for method" 17 | pass 18 | 19 | 20 | def test_decorator_doc(): 21 | f = frame_exclude(func) 22 | 23 | assert f.__doc__ == 'This is a doc for method' 24 | 25 | 26 | def test_decorator_init_func(): 27 | class a(Decorator): 28 | def setup_func(self, func): 29 | return sum 30 | 31 | f = a(func) 32 | 33 | assert f([10, 20]) == 30 34 | 35 | 36 | def test_decorator_inherit(): 37 | class a(Decorator): 38 | bias = 0 39 | def wrapper(self, *args, a = 0, **kwargs): 40 | return self.call(a + self.bias) 41 | 42 | class b(a): 43 | def wrapper(self, *args, b = 0, **kwargs): 44 | a = super().wrapper(*args, **kwargs) 45 | b = self.call(b) 46 | return a + b 47 | 48 | f = b(bias = 2)(lambda x: x+1) 49 | assert f(a = 1, b = 2) == 7 50 | 51 | 52 | def test_xgb_loss(): 53 | def loss(x, y): 54 | return np.abs(x - y).sum() 55 | 56 | xgb_l = xgb_loss(loss) 57 | grad, hess = xgb_l(np.arange(3), np.arange(3, 6)) 58 | 59 | assert grad == pytest.approx(-3.0) 60 | assert hess == pytest.approx(0.0) 61 | 62 | 63 | def test_performance(): 64 | @performance(loop = 10) 65 | def func(x): 66 | from time import sleep 67 | sleep(0.01) 68 | return x**x 69 | 70 | assert func(2) == 4 71 | 72 | 73 | def test_performance_with_clause(): 74 | def func(x): 75 | from time import sleep 76 | sleep(0.01) 77 | return x**x 78 | 79 | with performance(): 80 | res = func(2) 81 | 82 | assert res == 4 83 | -------------------------------------------------------------------------------- /toad/utils/func_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from .func import ( 7 | np_unique, 8 | fillna, 9 | clip, 10 | diff_time_frame, 11 | bin_to_number, 12 | generate_target, 13 | generate_str, 14 | get_dummies, 15 | feature_splits, 16 | ) 17 | 18 | np.random.seed(1) 19 | feature = np.random.rand(500) 20 | target = np.random.randint(2, size = 500) 21 | 22 | 23 | 24 | def test_fillna(): 25 | res = fillna(np.array([1, 2, 3, np.nan, 4, 5])) 26 | assert res[3] == -1 27 | 28 | 29 
| def test_np_unique(): 30 | res = np_unique(np.array([np.nan, np.nan, np.nan])) 31 | assert len(res) == 1 32 | 33 | 34 | def test_clip(): 35 | res1 = clip(feature, quantile = (.05, .95)) 36 | res2 = clip(feature, quantile = 0.05) 37 | np.testing.assert_array_equal(res1, res2) 38 | 39 | 40 | def test_feature_splits(): 41 | value = feature_splits(feature, target) 42 | assert len(value) == 243 43 | 44 | 45 | @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher") 46 | def test_diff_time_frame(): 47 | time_data = [ 48 | { 49 | 'base': '2018-01', 50 | 'time1': '2018-04', 51 | 'time2': '2018-04-02', 52 | }, 53 | { 54 | 'base': '2018-01', 55 | 'time1': '2018-05', 56 | 'time2': '2018-04-05', 57 | }, 58 | { 59 | 'base': '2018-02', 60 | 'time1': '2018-04', 61 | 'time2': '2018-04-10', 62 | }, 63 | ] 64 | 65 | frame = pd.DataFrame(time_data) 66 | res = diff_time_frame(frame['base'], frame[['time1', 'time2']]) 67 | assert res.iloc[0, 1] == 91 68 | 69 | 70 | def test_bin_to_number(): 71 | s = pd.Series([ 72 | '1', 73 | '1-100', 74 | '-', 75 | '100-200', 76 | np.nan, 77 | '200-300', 78 | '300', 79 | '100-200', 80 | '>500', 81 | ]) 82 | 83 | res = s.apply(bin_to_number()) 84 | assert res[3] == 150 85 | 86 | def test_bin_to_number_for_frame(): 87 | df = pd.DataFrame([ 88 | { 89 | 'area_1': '100-200', 90 | 'area_2': '150~200', 91 | }, 92 | { 93 | 'area_1': '300-400', 94 | 'area_2': '200~250', 95 | }, 96 | { 97 | 'area_1': '200-300', 98 | 'area_2': '450~500', 99 | }, 100 | { 101 | 'area_1': '100-200', 102 | 'area_2': '250~300', 103 | }, 104 | ]) 105 | 106 | res = df.applymap(bin_to_number()) 107 | assert res.loc[1, 'area_2'] == 225 108 | 109 | def test_generate_target(): 110 | t = generate_target(len(feature), rate = 0.3, weight = feature) 111 | rate = t.sum() / len(t) 112 | assert rate == 0.3 113 | 114 | @pytest.mark.skip(reason = "result depends on global random state") 115 | def test_generate_str(): 116 | s = generate_str(size = 8) 117 | assert s == 'EPL5MTQK' 118 | 119 | def test_get_dummies_binary(): 120 | ab = np.array(list('ABCDEFG')) 121 | df = pd.DataFrame({ 122 | 'binary': ab[np.random.choice(2, 500)], 123 | 'multiple': ab[np.random.choice(5, 500)], 124 | }) 125 | data = get_dummies(df, binary_drop = True) 126 | 127 | assert 'binary_A' not in data.columns 128 | -------------------------------------------------------------------------------- /toad/utils/mixin.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | from copy import deepcopy 4 | from .decorator import save_to_json, load_from_json 5 | 6 | 7 | DEFAULT_NAME = '_feature_default_name_' 8 | 9 | 10 | class RulesMixin: 11 | _rules = {} 12 | 13 | def _parse_rule(self, rule): 14 | return rule 15 | 16 | def _format_rule(self, rule): 17 | return rule 18 | 19 | def default_rule(self): 20 | if len(self._rules) == 1: 21 | # return the only rule as default 22 | return next(iter(self._rules.values())) 23 | 24 | if self._default_name not in self._rules: 25 | raise Exception('cannot get default rule') 26 | 27 | return self._rules[self._default_name] 28 | 29 | @property 30 | def _default_name(self): 31 | return DEFAULT_NAME 32 | 33 | @property 34 | def rules(self): 35 | return self._rules 36 | 37 | @rules.setter 38 | def rules(self, value): 39 | self._rules = value 40 | 41 | 42 | @load_from_json(is_class = True, require_first = True) 43 | def load(self, rules, update = False, **kwargs): 44 | """load rules from dict or json file 45 | 46 | Args: 47 | rules (dict): dictionary of rules 48 | 
from_json (str|IOBase): json file of rules 49 | update (bool): whether to update existing rules instead of replacing them 50 | """ 51 | rules = deepcopy(rules) 52 | 53 | if not isinstance(rules, dict): 54 | rules = { 55 | self._default_name: rules, 56 | } 57 | 58 | for key in rules: 59 | rules[key] = self._parse_rule(rules[key], **kwargs) 60 | 61 | if update: 62 | self._rules.update(rules) 63 | else: 64 | self._rules = rules 65 | 66 | if hasattr(self, 'after_load'): 67 | self.after_load(rules) 68 | 69 | return self 70 | 71 | @save_to_json(is_class = True) 72 | def export(self, **kwargs): 73 | """export rules to dict or a json file 74 | 75 | Args: 76 | to_json (str|IOBase): json file to save rules 77 | 78 | Returns: 79 | dict: dictionary of rules 80 | """ 81 | res = {} 82 | for key in self._rules: 83 | res[key] = self._format_rule(self._rules[key], **kwargs) 84 | 85 | if hasattr(self, 'after_export'): 86 | res = self.after_export(res, **kwargs) 87 | 88 | return res 89 | 90 | def update(self, *args, **kwargs): 91 | """update rules 92 | 93 | Args: 94 | rules (dict): dictionary of rules 95 | from_json (str|IOBase): json file of rules 96 | """ 97 | return self.load(*args, update = True, **kwargs) 98 | 99 | 100 | def __len__(self): 101 | return len(self._rules.keys()) 102 | 103 | def __contains__(self, key): 104 | return key in self._rules 105 | 106 | def __getitem__(self, key): 107 | return self._rules[key] 108 | 109 | def __setitem__(self, key, value): 110 | self._rules[key] = value 111 | 112 | def __iter__(self): 113 | return iter(self._rules) 114 | 115 | 116 | 117 | 118 | RE_NUM = r'-?\d+(\.\d+)?' 119 | RE_SEP = r'[~-]' 120 | RE_BEGIN = r'(-inf|{num})'.format(num = RE_NUM) 121 | RE_END = r'(inf|{num})'.format(num = RE_NUM) 122 | RE_RANGE = r'\[{begin}\s*{sep}\s*{end}\)'.format( 123 | begin = RE_BEGIN, 124 | end = RE_END, 125 | sep = RE_SEP, 126 | ) 127 | 128 | 129 | 130 | 131 | 132 | class BinsMixin: 133 | EMPTY_BIN = -1 134 | ELSE_GROUP = 'else' 135 | NUMBER_EXP = re.compile(RE_RANGE) 136 | 137 | @classmethod 138 | def parse_bins(self, bins): 139 | """parse labeled bins to array 140 | """ 141 | if self._is_numeric(bins): 142 | return self._numeric_parser(bins) 143 | 144 | l = list() 145 | 146 | for item in bins: 147 | if item == self.ELSE_GROUP: 148 | l.append(item) 149 | else: 150 | l.append(item.split(',')) 151 | 152 | return np.array(l, dtype = object) 153 | 154 | 155 | @classmethod 156 | def format_bins(self, bins, index = False, ellipsis = None): 157 | """format bins to labels 158 | 159 | Args: 160 | bins (ndarray): bins to format 161 | index (bool): whether to add an index prefix 162 | ellipsis (int): max label length before truncating with an ellipsis, `None` for skipping truncation 163 | 164 | Returns: 165 | ndarray: array of labels 166 | """ 167 | l = list() 168 | 169 | if np.issubdtype(bins.dtype, np.number): 170 | has_empty = len(bins) > 0 and np.isnan(bins[-1]) 171 | 172 | if has_empty: 173 | bins = bins[:-1] 174 | 175 | sp_l = [-np.inf] + bins.tolist() + [np.inf] 176 | for i in range(len(sp_l) - 1): 177 | l.append('['+str(sp_l[i])+' ~ '+str(sp_l[i+1])+')') 178 | 179 | if has_empty: 180 | l.append('nan') 181 | else: 182 | for keys in bins: 183 | if isinstance(keys, str) and keys == self.ELSE_GROUP: 184 | l.append(keys) 185 | else: 186 | label = ','.join(keys) 187 | if ellipsis is not None: 188 | label = label[:ellipsis] + '..' 
if len(label) > ellipsis else label 189 | l.append(label) 190 | 191 | if index: 192 | l = ["{:02}.{}".format(ix, lab) for ix, lab in enumerate(l)] 193 | 194 | return np.array(l) 195 | 196 | 197 | @classmethod 198 | def _is_numeric(self, bins): 199 | m = self.NUMBER_EXP.match(bins[0]) 200 | 201 | return m is not None 202 | 203 | @classmethod 204 | def _numeric_parser(self, bins): 205 | l = list() 206 | 207 | for item in bins: 208 | 209 | if item == 'nan': 210 | l.append(np.nan) 211 | continue 212 | 213 | m = self.NUMBER_EXP.match(item) 214 | split = m.group(3) 215 | 216 | if split == 'inf': 217 | # split = np.inf 218 | continue 219 | 220 | split = float(split) 221 | 222 | l.append(split) 223 | 224 | return np.array(l) 225 | -------------------------------------------------------------------------------- /toad/utils/mixin_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from .mixin import RulesMixin, BinsMixin 4 | 5 | np.random.seed(1) 6 | 7 | class RulesObject(RulesMixin): 8 | def _parse_rule(self, rule): 9 | return { 10 | 'rule': rule 11 | } 12 | 13 | 14 | def _format_rule(self, rule): 15 | return 'rule -> %s' % rule['rule'] 16 | 17 | 18 | rules = {'A': 'rule_A'} 19 | 20 | def test_rule_parse(): 21 | r = RulesObject().load(rules) 22 | assert r.rules['A']['rule'] == 'rule_A' 23 | 24 | def test_rule_format(): 25 | r = RulesObject().load(rules) 26 | assert r.export()['A'] == 'rule -> rule_A' 27 | 28 | def test_save_update(): 29 | r = RulesObject().load(rules) 30 | r.update({'A': 'update_A'}) 31 | assert r.rules['A']['rule'] == 'update_A' 32 | 33 | def test_format_bins(): 34 | obj = BinsMixin() 35 | formated = obj.format_bins(np.array([2,4,6])) 36 | expect = ['[-inf ~ 2)', '[2 ~ 4)', '[4 ~ 6)', '[6 ~ inf)'] 37 | assert all([a == b for a, b in zip(formated, expect)]) 38 | 39 | def test_format_bins_with_index(): 40 | obj = BinsMixin() 41 | formated = obj.format_bins(np.array([2,4,6]), index = True) 42 | assert '01.[2 ~ 4)' in formated 43 | 44 | def test_format_bins_with_ellipsis(): 45 | obj = BinsMixin() 46 | formated = obj.format_bins(np.array([['A', 'B', 'C'], ['D', 'E']], dtype = object), ellipsis = 3) 47 | assert formated[0] == 'A,B..' 
and formated[1] == 'D,E' 48 | -------------------------------------------------------------------------------- /toad/utils/pickletracer.py: -------------------------------------------------------------------------------- 1 | import cloudpickle 2 | from pickle import Unpickler 3 | from cloudpickle import CloudPickler 4 | 5 | _global_tracer = None 6 | 7 | def get_current_tracer(): 8 | global _global_tracer 9 | # if _global_tracer is None: 10 | # raise ValueError("tracer is not initialized") 11 | return _global_tracer 12 | 13 | 14 | class Unpickler(Unpickler): 15 | """trace object dependencies during unpickling""" 16 | def find_class(self, module, name): 17 | tracer = get_current_tracer() 18 | tracer.add(module) 19 | return super().find_class(module, name) 20 | 21 | 22 | class Pickler(CloudPickler): 23 | """trace object dependencies during pickling""" 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | 27 | import types 28 | self._reduce_module = CloudPickler.dispatch_table[types.ModuleType] 29 | self.dispatch_table[types.ModuleType] = self.reduce_module 30 | 31 | 32 | def reduce_module(self, obj): 33 | tracer = get_current_tracer() 34 | tracer.add(obj.__name__) 35 | return self._reduce_module(obj) 36 | 37 | 38 | def __setattr__(self, name, value): 39 | if name == 'persistent_id': 40 | # fix torch module: skip persistent_id for torch Modules 41 | def wrapper_func(obj): 42 | from torch.nn import Module 43 | if isinstance(obj, Module): 44 | return None 45 | 46 | return value(obj) 47 | 48 | return super().__setattr__(name, wrapper_func) 49 | 50 | return super().__setattr__(name, value) 51 | 52 | 53 | class Tracer: 54 | def __init__(self): 55 | import re 56 | 57 | self._modules = set() 58 | self._ignore_modules = {"builtins"} 59 | self._temp_dispatch_table = {} 60 | 61 | # match python site packages path 62 | self._regex = re.compile(r".*python[\d\.]+\/site-packages/[\w-]+") 63 | 64 | def add(self, module): 65 | root = module.split(".")[0] 66 | 67 | if root in self._ignore_modules: 68 | return 69 | 70 | self._modules.add(root) 71 | 72 | def trace(self, obj): 73 | """trace `obj` by pickling and unpickling it 74 | """ 75 | import io 76 | dummy = io.BytesIO() 77 | 78 | with self: 79 | Pickler(dummy).dump(obj) 80 | dummy.seek(0) 81 | Unpickler(dummy).load() 82 | 83 | return self.get_deps() 84 | 85 | 86 | def get_deps(self): 87 | import sys 88 | 89 | deps = { 90 | "pip": [], 91 | "files": [], 92 | } 93 | 94 | for name in self._modules: 95 | if name not in sys.modules: 96 | # TODO: should raise error 97 | continue 98 | 99 | module = sys.modules[name] 100 | # package module 101 | if self._regex.match(module.__spec__.origin): 102 | # TODO: split pip and conda pkg 103 | deps["pip"].append(module) 104 | continue 105 | 106 | # local file module 107 | deps["files"].append(module) 108 | 109 | return deps 110 | 111 | 112 | def __enter__(self): 113 | global _global_tracer 114 | if _global_tracer is not None: 115 | raise ValueError("a tracer already exists") 116 | 117 | # save the CloudPickler global dispatch table 118 | self._temp_dispatch_table = CloudPickler.dispatch_table.copy() 119 | # setup the global tracer 120 | _global_tracer = self 121 | return self 122 | 123 | def __exit__(self, exc_type, exc_val, exc_tb): 124 | global _global_tracer 125 | 126 | # restore the CloudPickler dispatch table 127 | CloudPickler.dispatch_table = self._temp_dispatch_table 128 | # clean the global tracer 129 | _global_tracer = None 130 | 131 | 132 | 133 | 134 | def dump(obj, file, *args, **kwargs): 135 | return Pickler(file).dump(obj) 136 
| 137 | 138 | def load(file, *args, **kwargs): 139 | return Unpickler(file).load() 140 | 141 | -------------------------------------------------------------------------------- /toad/utils/pickletracer_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | from .pickletracer import Tracer, get_current_tracer 4 | 5 | 6 | def test_tracer_with_clause(): 7 | assert get_current_tracer() is None 8 | with Tracer() as t: 9 | assert get_current_tracer() == t 10 | 11 | assert get_current_tracer() is None 12 | 13 | 14 | @pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") 15 | def test_trace_pyfunc(): 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.linear_model import LinearRegression 19 | X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) 20 | # y = 1 * x_0 + 2 * x_1 + 3 21 | y = np.dot(X, np.array([1, 2])) + 3 22 | reg = LinearRegression().fit(X, y) 23 | reg.score(X, y) 24 | 25 | def func(data): 26 | # data = dfunc(data) 27 | df = pd.DataFrame(data) 28 | return df 29 | 30 | class Model: 31 | def __init__(self, model, pref): 32 | self.model = model 33 | self.pref = pref 34 | 35 | def predict(self, data): 36 | data = self.pref(data) 37 | return self.model.predict(data) 38 | 39 | 40 | m = Model(reg, func) 41 | 42 | deps = Tracer().trace(m) 43 | 44 | assert set([m.__name__ for m in deps['pip']]) == set(['numpy', 'pandas', 'cloudpickle', 'sklearn']) 45 | 46 | 47 | def test_default_cloudpickle(): 48 | import pandas as pd 49 | 50 | def func(data): 51 | # data = dfunc(data) 52 | df = pd.DataFrame(data) 53 | return df 54 | 55 | deps = Tracer().trace(func) 56 | 57 | import io 58 | import cloudpickle 59 | 60 | dummy = io.BytesIO() 61 | # this should be correct after trace object 62 | # test for restore cloudpickle global dispatch table 63 | cloudpickle.dump(func, dummy) 64 | -------------------------------------------------------------------------------- /toad/utils/progress/__init__.py: -------------------------------------------------------------------------------- 1 | from .progress import Progress 2 | -------------------------------------------------------------------------------- /toad/utils/progress/pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .progress import Progress 4 | 5 | 6 | class ProgressAccessor: 7 | def __init__(self, obj): 8 | self.obj = obj 9 | 10 | def apply(self, func, *args, **kwargs): 11 | if isinstance(self.obj, pd.Series): 12 | l = len(self.obj) 13 | else: 14 | # dataframe 15 | axis = kwargs.get("axis", 0) 16 | if axis == 'index': 17 | axis = 0 18 | elif axis == 'columns': 19 | axis = 1 20 | 21 | l = self.obj.size // self.obj.shape[axis] 22 | 23 | p = iter(Progress(range(l))) 24 | 25 | def wrapper(*args, **kwargs): 26 | next(p) 27 | return func(*args, **kwargs) 28 | 29 | res = self.obj.apply(wrapper, *args, **kwargs) 30 | p.end() 31 | return res 32 | 33 | 34 | class pandas_enable: 35 | def __init__(self): 36 | pd.api.extensions.register_dataframe_accessor("progress")(ProgressAccessor) 37 | pd.api.extensions.register_series_accessor("progress")(ProgressAccessor) 38 | 39 | def __enter__(self): 40 | return self 41 | 42 | def __exit__(self, exce_type, exce_value, exce_trace): 43 | pandas_disable() 44 | 45 | 46 | def pandas_disable(): 47 | if hasattr(pd.DataFrame, 'progress'): 48 | delattr(pd.DataFrame, 'progress') 49 | 50 | if hasattr(pd.Series, 'progress'): 51 | delattr(pd.Series, 'progress') 52 | 
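A minimal usage sketch of the `.progress` accessor registered above (illustrative data only; assumes `toad` is installed so `toad.utils.progress.pandas` is importable):

import numpy as np
import pandas as pd

from toad.utils.progress.pandas import pandas_enable

df = pd.DataFrame({"A": np.random.rand(10000)})

# the accessor only exists inside the `with` block and is removed again on exit
with pandas_enable():
    cols = df.progress.apply(lambda col: col * 2)              # column-wise (axis = 0)
    rows = df.progress.apply(lambda row: row.sum(), axis = 1)  # row-wise
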
-------------------------------------------------------------------------------- /toad/utils/progress/pandas_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from .pandas import pandas_enable, pandas_disable 4 | 5 | 6 | 7 | def test_pandas_with(): 8 | assert hasattr(pd.DataFrame, 'progress') == False 9 | assert hasattr(pd.Series, 'progress') == False 10 | with pandas_enable(): 11 | assert hasattr(pd.DataFrame, 'progress') == True 12 | assert hasattr(pd.Series, 'progress') == True 13 | assert hasattr(pd.DataFrame, 'progress') == False 14 | assert hasattr(pd.Series, 'progress') == False 15 | 16 | def test_pandas_disable(): 17 | assert hasattr(pd.DataFrame, 'progress') == False 18 | assert hasattr(pd.Series, 'progress') == False 19 | pandas_enable() 20 | assert hasattr(pd.DataFrame, 'progress') == True 21 | assert hasattr(pd.Series, 'progress') == True 22 | pandas_disable() 23 | assert hasattr(pd.DataFrame, 'progress') == False 24 | assert hasattr(pd.Series, 'progress') == False 25 | 26 | def test_dataframe_apply(): 27 | df = pd.DataFrame({ 28 | "A": np.random.rand(1000), 29 | "B": np.random.randint(10, size = (1000,)) 30 | }) 31 | 32 | with pandas_enable(): 33 | res = df.progress.apply(lambda x: x + 1) 34 | 35 | def test_dataframe_apply_axis(): 36 | df = pd.DataFrame({ 37 | "A": np.random.rand(1000), 38 | "B": np.random.randint(10, size = (1000,)) 39 | }) 40 | 41 | with pandas_enable(): 42 | res = df.progress.apply(lambda x: x + 1, axis = 1) 43 | 44 | 45 | def test_series_apply(): 46 | series = pd.Series(np.random.rand(2000)) 47 | 48 | with pandas_enable(): 49 | res = series.progress.apply(lambda x: x + 1) 50 | 51 | -------------------------------------------------------------------------------- /toad/utils/progress/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from time import time 3 | 4 | class Progress: 5 | """a text progress bar for iterables 6 | """ 7 | def __init__(self, iterable, size = None, interval = 0.1): 8 | """ 9 | Args: 10 | iterable: the iterable to wrap 11 | size (int): max size of iterable 12 | interval (float): minimum seconds between bar updates, default is `0.1` 13 | 14 | Attrs: 15 | BAR_LENGTH (int): bar length, default is `32` 16 | SYMBOL_DONE (str): symbol indicating completion 17 | SYMBOL_REST (str): symbol indicating the remaining part 18 | prefix (str): string template before progress bar 19 | suffix (str): string template after progress bar 20 | template (str): string template for rendering, `{prefix} {bar} {suffix}` 21 | """ 22 | self.iterable = iterable 23 | self.interval = interval 24 | 25 | self.batch = 1 26 | self.size = size 27 | if hasattr(iterable, '__len__'): 28 | self.size = len(iterable) 29 | 30 | # is pytorch dataloader 31 | if hasattr(iterable, 'batch_size'): 32 | self.batch = getattr(iterable, 'batch_size') 33 | self.size = len(iterable.dataset) 34 | 35 | 36 | self.reset() 37 | 38 | 39 | self.BAR_LENGTH = 32 40 | 41 | self.SYMBOL_DONE = '█' 42 | self.SYMBOL_REST = '.' 
43 | self.prefix = "" 44 | self.suffix = "" 45 | 46 | if self.size is None: 47 | self.template = "{prefix} {done} iters {time:.2f}s {tps}it/s {suffix}" 48 | else: 49 | self.template = "{prefix} {percent:3.0f}%|{bar}| [{done}/{size}] {time:.2f}s {suffix}" 50 | 51 | 52 | def __len__(self): 53 | return self.size 54 | 55 | 56 | def __iter__(self): 57 | self.reset() 58 | self.iterator = iter(self.iterable) 59 | return self 60 | 61 | 62 | def __next__(self): 63 | try: 64 | return self.next() 65 | except StopIteration as e: 66 | self.end() 67 | raise e 68 | 69 | 70 | def reset(self): 71 | # reset index 72 | self.idx = 0 73 | 74 | # reset time 75 | self.time = None 76 | self.start_time = time() 77 | self._last_time = self.start_time 78 | self.iterator = iter(self.iterable) 79 | 80 | 81 | def next(self): 82 | item = next(self.iterator) 83 | self.update() 84 | return item 85 | 86 | 87 | def update(self, idx = None, force = False): 88 | # update idx 89 | if idx is None: 90 | idx = self.idx + 1 91 | 92 | self.idx = idx 93 | 94 | curr_time = time() 95 | self.time = curr_time - self.start_time 96 | 97 | # skip update if delta is too small 98 | if not force and curr_time - self._last_time < self.interval: 99 | return 100 | 101 | self._last_time = curr_time 102 | 103 | # update bar 104 | self.flush() 105 | 106 | 107 | def end(self): 108 | """progress end 109 | """ 110 | self.update(idx = self.idx, force = True) 111 | self.print('\n') 112 | 113 | 114 | def flush(self): 115 | if self.size is None: 116 | done = self.idx * self.batch 117 | percent = 0 118 | bar = None 119 | else: 120 | done = min(self.idx * self.batch, self.size) 121 | percent = done / self.size 122 | 123 | bar = (self.SYMBOL_DONE * int(percent * self.BAR_LENGTH)).ljust(self.BAR_LENGTH, self.SYMBOL_REST) 124 | 125 | self.print('\r' + self.template.format( 126 | percent = percent * 100, 127 | bar = bar, 128 | done = done, 129 | size = self.size, 130 | time = self.time, 131 | tps = done / max(self.time, 1), 132 | prefix = self.prefix, 133 | suffix = self.suffix, 134 | )) 135 | 136 | 137 | def print(self, text): 138 | sys.stdout.write(text) 139 | sys.stdout.flush() 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /toad/utils/progress/progress_test.py: -------------------------------------------------------------------------------- 1 | from time import sleep, time 2 | from .progress import Progress 3 | 4 | 5 | class TestIterator: 6 | def __init__(self, size): 7 | self._size = size 8 | 9 | def __iter__(self): 10 | for i in range(self._size): 11 | yield i 12 | 13 | 14 | def test_progress(): 15 | p = Progress(range(100)) 16 | for i in p: 17 | sleep(0.01) 18 | assert p.idx == 100 19 | 20 | def test_progress_size(): 21 | p = Progress(range(9527)) 22 | assert p.size == 9527 23 | 24 | def test_iterator(): 25 | ti = TestIterator(100) 26 | p = Progress(ti) 27 | for i in p: 28 | sleep(0.01) 29 | assert p.idx == 100 30 | 31 | 32 | def test_multi_loop(): 33 | p = Progress(range(100)) 34 | for i in p: 35 | sleep(0.01) 36 | assert p.idx == 100 37 | 38 | for i in p: 39 | sleep(0.01) 40 | assert p.idx == 100 41 | 42 | def test_speed(): 43 | p = Progress(range(1000)) 44 | for i in p: 45 | sleep(0.001) 46 | assert p.idx == 1000 47 | -------------------------------------------------------------------------------- /toad/version.py: -------------------------------------------------------------------------------- 1 | __version_info__ = (0, 1, 5, 'final', 0) 2 | 3 | def get_version(version): 4 | 
main = '.'.join(str(x) for x in version[:3]) 5 | 6 | if version[3] == 'final': 7 | return main 8 | 9 | symbol = { 10 | 'alpha': 'a', 11 | 'beta': 'b', 12 | 'rc': 'rc', 13 | } 14 | 15 | return main + symbol[version[3]] + str(version[4]) 16 | 17 | __version__ = get_version(__version_info__) 18 | --------------------------------------------------------------------------------
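As a worked example of the version scheme above, a short sketch of what `get_version` returns for a few tuples (plain assertions; assumes `toad` is importable so `toad.version` resolves):

from toad.version import get_version

# 'final' releases drop the suffix entirely
assert get_version((0, 1, 5, 'final', 0)) == '0.1.5'
# pre-releases append the abbreviated stage symbol and serial number
assert get_version((0, 1, 5, 'beta', 2)) == '0.1.5b2'
assert get_version((1, 2, 0, 'rc', 1)) == '1.2.0rc1'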