├── .github
│   └── workflows
│       ├── codeql-analysis.yml
│       ├── linux.yml
│       ├── macos.yml
│       ├── pypi-test.yml
│       ├── release.yml
│       └── windows.yml
├── .gitignore
├── .readthedocs.yml
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── CONTRIBUTORS
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── requirements.txt
│   └── source
│       ├── complete tutorial.ipynb
│       ├── conf.py
│       ├── images
│       │   ├── scorecard.png
│       │   └── stepwise.png
│       ├── index.md
│       ├── modules.md
│       ├── reference.md
│       ├── toad.detector.md
│       ├── toad.md
│       ├── toad.merge.md
│       ├── toad.metrics.md
│       ├── toad.nn.functional.md
│       ├── toad.nn.md
│       ├── toad.nn.module.md
│       ├── toad.nn.trainer.md
│       ├── toad.plot.md
│       ├── toad.preprocessing.md
│       ├── toad.preprocessing.partition.md
│       ├── toad.preprocessing.process.md
│       ├── toad.scorecard.md
│       ├── toad.selection.md
│       ├── toad.stats.md
│       ├── toad.transform.md
│       ├── toad.utils.decorator.md
│       ├── toad.utils.func.md
│       ├── toad.utils.md
│       ├── toad.utils.mixin.md
│       ├── tutorial.ipynb
│       └── tutorial_chinese.ipynb
├── images
│   └── toad_logo.png
├── pyproject.toml
├── requirements-dist.txt
├── requirements-nn.txt
├── requirements-test.txt
├── requirements-tools.txt
├── requirements.txt
├── scripts
│   └── build_wheels.sh
├── setup.cfg
├── setup.py
├── tests
│   └── test_data.csv
└── toad
    ├── __init__.py
    ├── c_utils.pxd
    ├── c_utils.pyx
    ├── cli.py
    ├── cli_test.py
    ├── commands
    │   ├── __init__.py
    │   ├── detect
    │   │   └── __init__.py
    │   ├── evaluate
    │   │   ├── __init__.py
    │   │   └── evaluate.py
    │   └── tree
    │       ├── __init__.py
    │       └── tree.py
    ├── detector.py
    ├── impute.py
    ├── impute_test.py
    ├── merge.pyx
    ├── merge_test.py
    ├── metrics.py
    ├── metrics_test.py
    ├── nn
    │   ├── __init__.py
    │   ├── functional.py
    │   ├── functional_test.py
    │   ├── loss.py
    │   ├── loss_test.py
    │   ├── module.py
    │   ├── module_test.py
    │   ├── trainer
    │   │   ├── __init__.py
    │   │   ├── callback.py
    │   │   ├── callback_test.py
    │   │   ├── earlystop.py
    │   │   ├── earlystop_test.py
    │   │   ├── event.py
    │   │   ├── event_test.py
    │   │   ├── history.py
    │   │   ├── history_test.py
    │   │   ├── metrics.py
    │   │   ├── trainer.py
    │   │   └── trainer_test.py
    │   └── zoo
    │       ├── __init__.py
    │       ├── autoencoder.py
    │       └── autoencoder_test.py
    ├── plot.py
    ├── plot_test.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── partition.py
    │   ├── partition_test.py
    │   ├── process.py
    │   └── process_test.py
    ├── scorecard.py
    ├── scorecard_test.py
    ├── selection.py
    ├── selection_test.py
    ├── stats.py
    ├── stats_test.py
    ├── tadpole
    │   ├── __init__.py
    │   ├── base.py
    │   ├── fonts
    │   │   └── NotoSansCJKsc-Regular.otf
    │   ├── func.py
    │   └── utils.py
    ├── transform.py
    ├── transform_test.py
    ├── utils
    │   ├── __init__.py
    │   ├── decorator.py
    │   ├── decorator_test.py
    │   ├── func.py
    │   ├── func_test.py
    │   ├── mixin.py
    │   ├── mixin_test.py
    │   ├── pickletracer.py
    │   ├── pickletracer_test.py
    │   └── progress
    │       ├── __init__.py
    │       ├── pandas.py
    │       ├── pandas_test.py
    │       ├── progress.py
    │       └── progress_test.py
    └── version.py
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 | push:
5 | branches: [master, dev]
6 | pull_request:
7 | # The branches below must be a subset of the branches above
8 | branches: [master]
9 | schedule:
10 | - cron: '0 3 * * 4'
11 |
12 | jobs:
13 | analyse:
14 | name: Analyse
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - name: Checkout repository
19 | uses: actions/checkout@v2
20 | with:
21 | # We must fetch at least the immediate parents so that if this is
22 | # a pull request then we can checkout the head.
23 | fetch-depth: 2
24 |
25 | # If this run was triggered by a pull request event, then checkout
26 | # the head of the pull request instead of the merge commit.
27 | - run: git checkout HEAD^2
28 | if: ${{ github.event_name == 'pull_request' }}
29 |
30 | # Initializes the CodeQL tools for scanning.
31 | - name: Initialize CodeQL
32 | uses: github/codeql-action/init@v1
33 | # Override language selection by uncommenting this and choosing your languages
34 | # with:
35 | # languages: go, javascript, csharp, python, cpp, java
36 |
37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
38 | # If this step fails, then you should remove it and run the build manually (see below)
39 | - name: Autobuild
40 | uses: github/codeql-action/autobuild@v1
41 |
42 | # ℹ️ Command-line programs to run using the OS shell.
43 | # 📚 https://git.io/JvXDl
44 |
45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
46 | # and modify them (or add more) to build your code if your project
47 | # uses a compiled language
48 |
49 | #- run: |
50 | # make bootstrap
51 | # make release
52 |
53 | - name: Perform CodeQL Analysis
54 | uses: github/codeql-action/analyze@v1
55 |
--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
1 | name: Test on Linux
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | strategy:
8 | matrix:
9 | python-version: ['3.9', '3.10', '3.11', '3.12']
10 | experimental: [false]
11 | include:
12 | - python-version: '3.13'
13 | experimental: true
14 | fail-fast: false
15 | runs-on: ubuntu-latest
16 | continue-on-error: ${{ matrix.experimental }}
17 | name: Test py ${{ matrix.python-version }}
18 | steps:
19 | - uses: actions/checkout@master
20 | - name: Setup Python
21 | uses: actions/setup-python@v5
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 | - run: make build_deps
25 | - run: pip install -r requirements-nn.txt
26 | - run: pip install .[all]
27 | - run: make test
28 | release:
29 | needs: [test]
30 | # release when using `tags` or `release` branch
31 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }}
32 | runs-on: ubuntu-latest
33 | steps:
34 | - uses: actions/checkout@master
35 | - name: Setup Python
36 | uses: actions/setup-python@v5
37 | with:
38 | python-version: '3.10'
39 | architecture: x64
40 | - run: make dist
41 | - uses: RalfG/python-wheels-manylinux-build@v0.7.1
42 | with:
43 | build-requirements: 'cython numpy'
44 | - run: rm dist/*-linux_x86_64.whl
45 | - uses: pypa/gh-action-pypi-publish@release/v1
46 | name: publish pypi
47 | with:
48 | user: __token__
49 | password: ${{ secrets.PYPI }}
50 | skip-existing: true
51 | verbose: true
52 |
--------------------------------------------------------------------------------
/.github/workflows/macos.yml:
--------------------------------------------------------------------------------
1 | name: Test on MacOS
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | strategy:
8 | matrix:
9 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
10 | macos-version: ['macos-13', 'macos-latest']
11 | include:
12 | - experimental: false
13 | - macos-version: 'macos-latest'
14 | experimental: true
15 | - python-version: '3.9'
16 | experimental: true
17 | - python-version: '3.13'
18 | experimental: true
19 | fail-fast: false
20 | runs-on: ${{ matrix.macos-version }}
21 | continue-on-error: ${{ matrix.experimental }}
22 | name: Test py ${{ matrix.python-version }} ${{ matrix.macos-version }}
23 | steps:
24 | - uses: actions/checkout@master
25 | - name: Setup Python
26 | uses: actions/setup-python@v5
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | - run: make build_deps
30 | - run: pip install -r requirements-nn.txt
31 | - run: pip install .[all]
32 | - run: make test
33 | - run: make dist_wheel
34 | - uses: actions/upload-artifact@v4
35 | with:
36 | name: wheel-${{ matrix.python-version }}-${{ matrix.macos-version }}
37 | path: dist/*.whl
38 | release:
39 | needs: [test]
40 | # release when using `tags` or `release` branch
41 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }}
42 | runs-on: ubuntu-latest
43 | steps:
44 | - uses: actions/download-artifact@v4
45 | with:
46 | pattern: wheel-*
47 | path: dist/
48 | merge-multiple: true
49 | - uses: pypa/gh-action-pypi-publish@release/v1
50 | name: publish pypi
51 | with:
52 | user: __token__
53 | password: ${{ secrets.PYPI }}
54 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-test.yml:
--------------------------------------------------------------------------------
1 | name: Pypi test
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'pypi/**'
7 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - "*"
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout
13 | uses: actions/checkout@master
14 |
15 | - name: Release
16 | uses: docker://antonyurchenko/git-release:latest
17 | env:
18 | GITHUB_TOKEN: ${{ secrets.TOKEN }}
19 |
--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
1 | name: Test on Windows
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | strategy:
8 | matrix:
9 | python-version: ['3.9', '3.10', '3.11', '3.12']
10 | experimental: [false]
11 | include:
12 | - python-version: '3.13'
13 | experimental: true
14 | fail-fast: false
15 | runs-on: windows-latest
16 | continue-on-error: ${{ matrix.experimental }}
17 | name: Test py ${{ matrix.python-version }}
18 | steps:
19 | - uses: actions/checkout@master
20 | - name: Setup Python
21 | uses: actions/setup-python@v5
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 | - run: make build_deps
25 | - run: pip install -r requirements-nn.txt
26 | - run: pip install .[all]
27 | - run: make test
28 | - run: make dist_wheel
29 | - uses: actions/upload-artifact@v4
30 | with:
31 | name: wheel-${{ matrix.python-version }}
32 | path: dist/*.whl
33 | release:
34 | needs: [test]
35 | # release when using `tags` or `release` branch
36 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }}
37 | runs-on: ubuntu-latest
38 | steps:
39 | - uses: actions/download-artifact@v4
40 | with:
41 | pattern: wheel-*
42 | path: dist/
43 | merge-multiple: true
44 | - uses: pypa/gh-action-pypi-publish@release/v1
45 | name: publish pypi
46 | with:
47 | user: __token__
48 | password: ${{ secrets.PYPI }}
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | build/
3 | *.egg-info/
4 | dist/
5 | .tox/
6 | .vscode/
7 | .DS_Store
8 | .python-version
9 | *.csv
10 | *.xlsx
11 | *.c
12 | *.so
13 | *.pyc
14 | .idea/
15 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.11"
7 |
8 | sphinx:
9 | configuration: docs/source/conf.py
10 |
11 | formats: all
12 |
13 | python:
14 | install:
15 | - requirements: requirements.txt
16 | - requirements: requirements-nn.txt
17 | - requirements: docs/requirements.txt
18 | - method: setuptools
19 | path: .
20 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | matrix:
2 | include:
3 | - name: "Python 3.6 on Linux"
4 | os: linux
5 | language: python
6 | python: "3.6"
7 | sudo: required
8 | services:
9 | - docker
10 | env:
11 | - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64
12 | - PLAT=manylinux1_x86_64
13 | before_install:
14 | - sudo apt-get install -y graphviz
15 | dist: trusty
16 | before_deploy:
17 | - make dist_manylinux
18 |
19 | - name: "Python 3.7 on macOS"
20 | os: osx
21 | osx_image: xcode11.3
22 | language: shell
23 | env:
24 | - SUDO=sudo
25 | - HOMEBREW_NO_INSTALL_CLEANUP=TRUE
26 | before_install:
27 | - brew update
28 | # - brew install graphviz
29 | before_deploy:
30 | - make dist_wheel
31 |
32 | - name: "Python 3.7 on Windows"
33 | os: windows
34 | language: shell
35 | python: "3.7"
36 | env:
37 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH
38 | before_install:
39 | - choco install python --version=3.7.2
40 | - choco install graphviz
41 | - choco install make
42 | before_deploy:
43 | - make dist_wheel
44 |
45 | - name: "Python 3.6 on Windows"
46 | os: windows
47 | language: shell
48 | python: "3.6"
49 | env:
50 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH
51 | before_install:
52 | - choco install python --version=3.6.8
53 | - choco install graphviz
54 | - choco install make
55 | - pip install -U patsy
56 | before_deploy:
57 | - make dist_wheel
58 |
59 |
60 | install:
61 | - make install
62 | script:
63 | - make test
64 |
65 | deploy:
66 | - skip_cleanup: true
67 | provider: script
68 | script: make upload
69 | on:
70 | tags: true
71 |
72 | - skip_cleanup: true
73 | provider: script
74 | script: make upload
75 | on:
76 | branch: release
77 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7 |
8 | ## [Unreleased]
9 |
10 | ## [0.1.5] - 2025-02-18
11 |
12 | ### Added
13 | - Added `ax` support for `plot`
14 | - Added `Apple M1` support
15 |
16 | ## [0.1.4] - 2024-11-03
17 |
18 | ### Added
19 | - Added wheel package support for `py3.12`
20 | - Added `figsize` param in the `toad.plot.bin_plot` function
21 |
22 | ### Changed
23 | - Updated `pandas` version to `>=1.5`
24 | - Python `3.7` and `3.8` are no longer supported
25 |
26 | ## [0.1.3] - 2023-12-10
27 |
28 | ### Added
29 | - Added `performance` in `toad.utils` for testing code performance
30 | - Added `pickletracer` in `toad.utils` for inferring requirements from pickle objects
31 |
32 | ### Fixed
33 | - Fixed `ValueError` in the `select` and `drop_corr` methods when using `pandas >= 2.0.x`
34 |
35 | ## [0.1.2] - 2023-04-09
36 |
37 | ### Added
38 | - Added `ks_plot` for KS plots, [#102](https://github.com/amphibian-dev/toad/issues/102) thanks @kevin-meng
39 | - Added `xgb_loss` decorator to convert a normal loss function into an xgb-supported loss function
40 | - Added `binary_focal_loss` function in `nn.functional`
41 | - Added `event` module in `nn.trainer`, and changed `trainer` to an event-based mode
42 | - Added wheel package support for `py3.9`, `py3.10` and `py3.11`
43 |
44 | ### Changed
45 | - Now you can pass arguments to `DecisionTreeClassifier` in `merge` or `Combiner` when using `method = 'dt'`
46 |
47 | ### Fixed
48 | - Fixed `groupby` being rewritten in `preprocessing`
49 | - Fixed expired deprecations of numpy types in `1.24.0`
50 |
51 | ## [0.1.1] - 2022-08-14
52 |
53 | ### Added
54 | - Added `Progress` for `pandas.apply` by using `pandas_enable` and `pandas_disable`
55 | - Added `feature_bin_stats` for feature bins, [#91](https://github.com/amphibian-dev/toad/issues/91) thanks @kevin-meng
56 |
57 | ### Changed
58 | - `countBlank` can use a customized missing value, [#101](https://github.com/amphibian-dev/toad/issues/101) thanks @kevin-meng
59 | - Removed the reference to `merge` in the `__init__` file
60 |
61 |
62 |
63 | ## [0.1.0] - 2021-10-08
64 |
65 | ### Added
66 |
67 | - Added `backward_rounds` for `nn.Trainer.train`
68 | - Added `evalute` func in `nn.Module`
69 | - Added `get_reason` func in `ScoreCard`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo
70 | - Added dict type input support for `ScoreCard.predict` and `Combiner.transform`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo
71 | - Added iterator support for `Progress`
72 |
73 | ### Changed
74 |
75 | - Changed `callback` and `earlystopping` to Python decorators
76 |
77 |
78 | ## [0.0.65] - 2021-06-30
79 |
80 | ### Breaking Changes
81 |
82 | - Added a new `lift` value and renamed the old `lift` value to `cum_lift` in `KS_bucket`
83 | - Moved `nn.autoencoder` to `nn.zoo.autoencoder`
84 |
85 | ### Added
86 |
87 | - Added `label_smoothing` and `focal_loss` functions in the `nn` module
88 | - Added some features in `nn.trainer`
89 | - Added default `early_stopping` for `nn.Trainer`
90 |
91 | ### Changed
92 |
93 | - Updated `numpy` version to `>=1.20`
94 | - Python `3.6` is no longer supported
95 |
96 | ### Fixed
97 |
98 | - Fixed combiner error after `ScoreCard` reload. [#67](https://github.com/amphibian-dev/toad/issues/67)
99 |
100 |
101 | ## [0.0.64] - 2021-03-22
102 |
103 | ### Added
104 |
105 | - Added `callback` param in `fit` method for `nn`
106 | - Added `Trainer` and `EarlyStopping` in `nn.trainer` module
107 |
108 | ### Changed
109 |
110 | - Use mean of loss in `nn.Module.fit` instead of the latest loss value
111 | - Set default rotation for x tick labels
112 |
113 | ### Fixed
114 |
115 | - Fixed the dependency version of `numpy`
116 | - Fixed `DistModule` module
117 | - Fixed `ScoreCard` representation error
118 |
119 | ## [0.0.62] - 2021-02-19
120 |
121 | ### Added
122 |
123 | - Added `save` and `load` methods for the nn module
124 | - Added `lift` value in `KS_bucket` function
125 | - Added checking duplicate keys in `Transformer`
126 |
127 | ### Changed
128 |
129 | - `quality` method supports `indicators`
130 |
131 | ### Fixed
132 |
133 | - Fixed tadpole warning of legend. [#52](https://github.com/amphibian-dev/toad/issues/52)
134 | - Fixed tadpole `title` and `x/y label` display for `UTF8`
135 | - Fixed default rule in RuleMixin.
136 | - Fixed loss function of VAE model.
137 | - Fixed `decimal` argument in `ScoreCard.export` function
138 |
139 | ### Enhancements
140 |
141 | - Reduce memory usage when using `select` function
142 |
143 | ## [0.0.61] - 2020-06-24
144 |
145 | ### Added
146 |
147 | - Support for calculating IV for each group in a feature. [#25](https://github.com/amphibian-dev/toad/issues/25)
148 | - Added `cpu_cores` for the `quality` function
149 | - Added `predict_proba` for `ScoreCard`
150 | - Impute module
151 | - NN module
152 |
153 | ### Changed
154 |
155 | - The y axis of `badrate_plot` now starts at `0`. [#23](https://github.com/amphibian-dev/toad/issues/23)
156 | - `KS` is now implemented using `ks2samp`
157 |
158 | ### Fixed
159 |
160 | - Fixed `Preprocess` bugs
161 |
162 | ### Docs
163 |
164 | - Add references for `Chi-Merge`, `Stepwise Regression`, `Scorecard Transformation`
165 |
166 | ## [0.0.60] - 2020-04-20
167 |
168 | ### Added
169 |
170 | - Preprocess module.
171 | - Annotation format for bin plot.
172 | - `KS_bucket` supports split points as buckets. [#22](https://github.com/amphibian-dev/toad/issues/22)
173 |
174 | ### Changed
175 |
176 | - `format_bins` supports ellipsis.
177 | - Reversed cumulative columns in `KS_bucket`
178 | - Used the correct score order for AUC and ROC plots. [#21](https://github.com/amphibian-dev/toad/issues/21)
179 |
180 | ### Fixed
181 |
182 | - Fixed number type of x axis of badrate plot. [#20](https://github.com/amphibian-dev/toad/issues/20)
183 | - Fixed negative ks value in `KS_bucket`.
184 |
185 | ## [0.0.59] - 2020-02-07
186 |
187 | ### Added
188 |
189 | - `Combiner` supports separating empty values.
190 | - Confusion matrix function in metrics.
191 | - Support for Python 3.8.
192 |
193 | ### Changed
194 |
195 | - Transform supports `y` as a string type.
196 | - VIF is now independent of statsmodels.
197 |
198 |
199 | [Unreleased]: https://github.com/amphibian-dev/toad/compare/0.1.5...HEAD
[0.1.5]: https://github.com/amphibian-dev/toad/compare/0.1.4...0.1.5
[0.1.4]: https://github.com/amphibian-dev/toad/compare/0.1.3...0.1.4
[0.1.3]: https://github.com/amphibian-dev/toad/compare/0.1.2...0.1.3
[0.1.2]: https://github.com/amphibian-dev/toad/compare/0.1.1...0.1.2
[0.1.1]: https://github.com/amphibian-dev/toad/compare/0.1.0...0.1.1
200 | [0.1.0]: https://github.com/amphibian-dev/toad/compare/0.0.65...0.1.0
201 | [0.0.65]: https://github.com/amphibian-dev/toad/compare/0.0.64...0.0.65
202 | [0.0.64]: https://github.com/amphibian-dev/toad/compare/0.0.62...0.0.64
203 | [0.0.62]: https://github.com/amphibian-dev/toad/compare/0.0.61...0.0.62
204 | [0.0.61]: https://github.com/amphibian-dev/toad/compare/0.0.60...0.0.61
205 | [0.0.60]: https://github.com/amphibian-dev/toad/compare/0.0.59...0.0.60
206 | [0.0.59]: https://github.com/amphibian-dev/toad/compare/0.0.58...0.0.59
207 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Welcome to Toad contributing guide
2 |
3 | We're so glad you're thinking about contributing to the toad project. If you're unsure about anything, just ask @Secbone, or submit an issue or pull request anyway. The worst that can happen is you'll be politely asked to change something. We love all friendly contributions.
4 |
5 | 我们非常开心你乐意为 toad 项目贡献代码。如果你有任何疑问,可以联系 @Secbone 或者提交 issue 和 pull request 都可以。最糟不过是被礼貌地要求你修改一些东西。我们非常愿意看到所有善意的问题。
6 |
7 | ## Getting Started · 开始吧
8 |
9 | ### Setup Environment · 设置环境
10 |
11 | Setting up the environment is very simple, you just need to run the following command
12 |
13 | 设置环境非常简单,你只需要执行以下代码
14 |
15 | ```bash
16 | make install
17 | ```
18 |
19 | All done! Now you can enjoy your coding~
20 |
21 | 完成!开始享受你的编码吧~
22 |
23 | ### About Cython · 关于 Cython
24 |
25 | The `toad.merge` module is compiled with `cython`, so if you want to change something in `toad.merge`, you need to run
26 |
27 | `toad.merge` 模块是使用 `cython` 编译的,所以如果你想要对 `toad.merge` 模块进行改动时,你需要运行
28 |
29 | ```bash
30 | make build
31 | ```
32 | after you update the code.
33 |
34 | 之后来使你的代码生效。
35 |
36 | ### Testing · 测试
37 |
38 | You can run
39 |
40 | 你可以执行
41 |
42 | ```bash
43 | make test
44 | ```
45 |
46 | for testing the whole package. We recommend that you do this before every commit to avoid new code impacting old functionality.
47 |
48 | 来测试整个包的代码。我们建议你在每次提交前这么做,以防止新代码对老的功能产生影响。
49 |
50 | You can also run
51 |
52 | 你也可以运行
53 |
54 | ```bash
55 | make test toad/xxxx_test.py
56 | ```
57 |
58 | to test only a single module.
59 |
60 | 来只测试某一个模块。
61 |
62 | ### Pull Request
63 |
64 | When you're finished with the changes, create a pull request and wait for it to be merged.
65 |
66 | 当你完成所有的改动后,就可以创建一个 pull request 并且等它被合并啦~
67 |
--------------------------------------------------------------------------------
/CONTRIBUTORS:
--------------------------------------------------------------------------------
1 | Lei Cui
2 | Secbone
3 | Shaoqian Dong
4 | Xiyu Zhou
5 | Yanping He
6 | Yutong Jiang
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 ESC Team
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | include requirements-*.txt
4 | include setup.py
5 | include toad/*.pyd
6 | include toad/*.pyx
7 | include toad/tadpole/fonts/*
8 |
9 | include CONTRIBUTORS
10 | include LICENSE
11 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build test
2 |
3 | SHELL = /bin/bash
4 |
5 | PYTHON = python3
6 | PIP = pip3
7 | SUDO ?=
8 |
9 | SPHINXOPTS =
10 | SPHINXBUILD = sphinx-build
11 | SPHINXPROJ = toad
12 | DOCSDIR = docs
13 | SOURCEDIR := $(DOCSDIR)/source
14 | BUILDDIR := $(DOCSDIR)/build
15 |
16 |
17 | ifeq ('$(shell type -P python3)','')
18 | PYTHON = python
19 | endif
20 |
21 | ifeq ('$(shell type -P pip3)','')
22 | PIP = pip
23 | endif
24 |
25 |
26 | install: build
27 | $(SUDO) $(PIP) install -e .
28 |
29 | uninstall:
30 | cat files.txt | xargs rm -rf
31 |
32 | test_deps:
33 | $(SUDO) $(PIP) install -r requirements-test.txt
34 |
35 | test: test_deps
36 | $(eval TARGET := $(filter-out $@, $(MAKECMDGOALS)))
37 | @if [ -z "$(TARGET)" ]; then \
38 | $(PYTHON) -m pytest -x toad; \
39 | else \
40 | $(PYTHON) -m pytest -s $(TARGET); \
41 | fi
42 |
43 | build_deps:
44 | $(SUDO) $(PIP) install -r requirements.txt
45 |
46 | build: build_deps
47 | $(PYTHON) setup.py build_ext --inplace
48 |
49 | dist_deps:
50 | $(SUDO) $(PIP) install -U -r requirements-dist.txt
51 |
52 | dist: build dist_deps
53 | $(SUDO) $(PYTHON) setup.py sdist
54 |
55 | dist_wheel: build dist_deps
56 | $(SUDO) $(PYTHON) setup.py bdist_wheel --universal
57 |
58 | upload:
59 | twine check dist/*
60 | @twine upload dist/* -u $(TWINE_USER) -p $(TWINE_PASS)
61 |
62 | clean:
63 | @rm -rf build/ dist/ *.egg-info/ **/__pycache__/
64 | @rm -rf toad/*.c toad/*.so
65 |
66 | docs: build
67 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
68 |
69 | %:
70 | @:
71 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 |
5 | # TOAD
6 |
7 |
8 | [![PyPi version][pypi-image]][pypi-url]
9 | [![Python version][python-image]][docs-url]
10 | [![Build Status][actions-image]][actions-url]
11 | [![Downloads Status][downloads-image]][docs-url]
12 |
13 |
14 | Toad is dedicated to facilitating the model development process, especially for scorecards. It provides intuitive functions covering the entire workflow, from EDA, feature engineering, and feature selection to result validation and scorecard transformation. Its key functionality streamlines the most critical and time-consuming steps, such as feature selection and fine binning.
15 |
16 | Toad 是专为工业界模型开发设计的Python工具包,特别针对评分卡的开发。Toad 的功能覆盖了建模全流程,从 EDA、特征工程、特征筛选 到 模型验证和评分卡转化。Toad 的主要功能极大简化了建模中最重要最费时的流程,即特征筛选和分箱。
17 |
18 | ## Install and Upgrade · 安装与升级
19 |
20 | Pip
21 |
22 | ```bash
23 | pip install toad # to install
24 | pip install -U toad # to upgrade
25 | ```
26 |
27 | Conda
28 |
29 | ```bash
30 | conda install toad --channel conda-forge # to install
31 | conda update toad --channel conda-forge # to upgrade
32 | ```
33 |
34 | Source code
35 |
36 | ```bash
37 | python setup.py install
38 | ```
39 |
40 | ## Key features · 主要功能
41 |
42 | The following showcases some of the most popular features of toad. For more detailed demonstrations and user guidance, please refer to the tutorials.
43 |
44 | 以下部分简单介绍了toad最受欢迎的一些功能,具体的使用方法和使用教程,请详见文档部分。
45 |
46 | - Simple IV calculation for all features · 一键算IV:
47 |
48 | ```python
49 | toad.quality(data, 'target', indicators = ['iv'])
50 | ```
51 |
52 | - Preliminary selection based on criteria · 根据特定条件的初步变量筛选;
53 | - and stepwise feature selection (with optimised algorithm) · 优化过的逐步回归:
54 |
55 | ```python
56 | selected_data = toad.selection.select(data, target = 'target', empty = 0.5, iv = 0.02, corr = 0.7, return_drop=True, exclude=['ID','month'])
57 |
58 | final_data = toad.selection.stepwise(data_woe,target = 'target', estimator='ols', direction = 'both', criterion = 'aic', exclude = to_drop)
59 | ```
60 |
61 | - Reliable fine binning with visualisation · 分箱及可视化:
62 |
63 | ```python
64 | # Chi-squared fine binning
65 | c = toad.transform.Combiner()
66 | c.fit(data_selected.drop(to_drop, axis=1), y = 'target', method = 'chi', min_samples = 0.05)
67 | print(c.export())
68 |
69 | # Visualisation to check binning results
70 | from toad.plot import bin_plot
71 | col = 'feature_name'
72 | bin_plot(c.transform(data_selected[[col,'target']], labels=True), x=col, target='target')
72 | ```
73 |
74 | - Intuitive model results presentation · 模型结果展示:
75 |
76 | ```python
77 | toad.metrics.KS_bucket(pred_proba, final_data['target'], bucket=10, method = 'quantile')
78 | ```
79 |
80 | - One-click scorecard transformation · 评分卡转化:
81 |
82 | ```python
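# NOTE: `c` is the Combiner fitted above; `transer` is assumed to be a fitted
# toad.transform.WOETransformer for the same features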
83 | card = toad.ScoreCard(
84 | combiner = c,
85 | transer = transer,
86 | class_weight = 'balanced',
87 | C=0.1,
88 | base_score = 600,
89 | base_odds = 35,
90 | pdo = 60,
91 | rate = 2
92 | )
93 |
94 | card.fit(final_data[col], final_data['target'])
95 | print(card.export())
96 | ```
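
  Once fitted, the card can score new samples directly. A minimal sketch, assuming `test_data` shares the feature columns used above (`ScoreCard.predict` and `predict_proba` are the methods noted in the changelog):

  ```python
  scores = card.predict(test_data[col])        # point scores
  probas = card.predict_proba(test_data[col])  # predicted probabilities
  ```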
97 |
98 | ## Documents · 文档
99 |
100 | - [Tutorial](https://toad.readthedocs.io/en/latest/tutorial.html)
101 |
102 | - [中文指引](https://toad.readthedocs.io/en/latest/tutorial_chinese.html)
103 |
104 | - [docs][docs-url]
105 |
106 | - [Contributing](CONTRIBUTING.md)
107 |
108 | ## Community · 社区
109 | We welcome public feedback and new PRs. We hold a WeChat group for questions and suggestions.
110 |
111 | 欢迎各位提PR,同时我们有toad使用交流的微信群,欢迎询问加群。
112 |
113 | ## Contributors
114 |
115 | [![Contributors][contributor-image]][contributor-url]
116 |
117 | ------------
118 |
119 | ## Dedicated by **The ESC Team**
120 |
121 | [pypi-image]: https://img.shields.io/pypi/v/toad?style=flat-square
122 | [pypi-url]: https://pypi.org/project/toad/
123 | [python-image]: https://img.shields.io/pypi/pyversions/toad?style=flat-square
124 | [actions-image]: https://img.shields.io/github/actions/workflow/status/amphibian-dev/toad/release.yml?style=flat-square
125 | [actions-url]: https://github.com/amphibian-dev/toad/actions
126 | [downloads-image]: https://img.shields.io/pypi/dm/toad?style=flat-square
127 | [docs-url]: https://toad.readthedocs.io/
128 | [contributor-image]: https://contrib.rocks/image?repo=amphibian-dev/toad
129 | [contributor-url]: https://github.com/amphibian-dev/toad/graphs/contributors
130 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | docutils==0.16
2 | recommonmark
3 | sphinx-readable-theme
4 | ipykernel
5 | nbsphinx
6 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 |
13 | import os
14 | import sys
15 | import inspect
16 |
17 | sys.path.insert(0, os.path.abspath('../..'))
18 |
19 |
20 | # -- Project information -----------------------------------------------------
21 |
22 | project = 'toad'
23 | copyright = '2020, ESC Team'
24 | author = 'ESC Team'
25 |
26 |
27 | import toad
28 | version = toad.VERSION
29 | # The full version, including alpha/beta/rc tags
30 | release = version
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 | import recommonmark
35 | import sphinx_readable_theme
36 | from recommonmark.transform import AutoStructify
37 |
38 | # Add any Sphinx extension module names here, as strings. They can be
39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
40 | # ones.
41 | extensions = [
42 | 'sphinx.ext.doctest',
43 | 'sphinx.ext.intersphinx',
44 | 'sphinx.ext.todo',
45 | 'sphinx.ext.autodoc',
46 | "sphinx.ext.autosummary",
47 | 'sphinx.ext.linkcode',
48 | 'sphinx.ext.napoleon',
49 | 'nbsphinx',
50 | 'recommonmark',
51 | 'sphinx_readable_theme',
52 | ]
53 |
54 |
55 |
56 | autodoc_member_order = 'bysource'
57 |
58 | # Add any paths that contain templates here, relative to this directory.
59 | templates_path = ['_templates']
60 |
61 | # List of patterns, relative to source directory, that match files and
62 | # directories to ignore when looking for source files.
63 | # This pattern also affects html_static_path and html_extra_path.
64 | exclude_patterns = [
65 | 'toad/commands',
66 | '_build',
67 | '**.ipynb_checkpoints',
68 | ]
69 |
70 | master_doc = 'index'
71 |
72 |
73 | def linkcode_resolve(domain, info):
74 | """linkcode extension config function
75 | """
76 | if domain != "py":
77 | return None
78 |
79 | modname = info["module"]
80 | fullname = info["fullname"]
81 |
82 | submod = sys.modules.get(modname)
83 | if submod is None:
84 | return None
85 |
86 | obj = submod
87 | for part in fullname.split("."):
88 | try:
89 | obj = getattr(obj, part)
90 | except AttributeError:
91 | return None
92 |
93 | try:
94 | # inspect.unwrap() was added in Python version 3.4
95 | if sys.version_info >= (3, 5):
96 | fn = inspect.getsourcefile(inspect.unwrap(obj))
97 | else:
98 | fn = inspect.getsourcefile(obj)
99 | except TypeError:
100 | fn = None
101 | if not fn:
102 | return None
103 |
104 | try:
105 | source, lineno = inspect.getsourcelines(obj)
106 | except OSError:
107 | lineno = None
108 |
109 | if lineno:
110 | linespec = "#L{:d}-L{:d}".format(lineno, lineno + len(source) - 1)
111 | else:
112 | linespec = ""
113 |
114 | fn = os.path.relpath(fn, start = os.path.dirname(toad.__file__))
115 |
116 | return "http://github.com/amphibian-dev/toad/blob/master/toad/{}{}".format(
117 | fn, linespec
118 | )
119 |
120 |
121 | # -- Options for HTML output -------------------------------------------------
122 |
123 | # The theme to use for HTML and HTML Help pages. See the documentation for
124 | # a list of builtin themes.
125 | #
126 | html_theme_path = [sphinx_readable_theme.get_html_theme_path()]
127 | html_theme = 'readable'
128 |
129 | # Add any paths that contain custom static files (such as style sheets) here,
130 | # relative to this directory. They are copied after the builtin static files,
131 | # so a file named "default.css" will overwrite the builtin "default.css".
132 | html_static_path = ['_static']
133 |
134 |
135 |
136 | def setup(app):
137 | app.add_config_value(
138 | 'recommonmark_config',
139 | {
140 | 'enable_eval_rst': True,
141 | 'enable_auto_toc_tree': True,
142 | 'auto_toc_tree_section': 'Contents',
143 | },
144 | True,
145 | )
146 |
147 | app.add_transform(AutoStructify)
148 |
--------------------------------------------------------------------------------
/docs/source/images/scorecard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/scorecard.png
--------------------------------------------------------------------------------
/docs/source/images/stepwise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/stepwise.png
--------------------------------------------------------------------------------
/docs/source/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to toad's documentation!
2 |
3 |
4 | ## Installation
5 |
6 | via pip
7 |
8 | ```bash
9 | pip install toad
10 | ```
11 |
12 | via anaconda
13 | ```bash
14 | conda install toad --channel conda-forge
15 | ```
16 |
17 | via source code
18 | ```bash
19 | python setup.py install
20 | ```
21 |
22 | ## Tutorial
23 |
24 | A [basic tutorial](tutorial) is provided.
25 |
26 | [中文指引](tutorial_chinese)
27 |
28 | ## Contents
29 |
30 | ```eval_rst
31 | .. toctree::
32 | :maxdepth: 1
33 |
34 | toad
35 | ```
36 |
37 |
38 | ## Indices and tables
39 |
40 |
41 | ```eval_rst
42 | * :ref:`genindex`
43 | * :ref:`modindex`
44 | * :ref:`search`
45 | ```
46 |
47 |
48 | ## Links
49 |
50 | [FiboRule](http://open.fibo.cn/)
51 |
--------------------------------------------------------------------------------
/docs/source/modules.md:
--------------------------------------------------------------------------------
1 | ## toad
2 |
3 |
4 | ```eval_rst
5 | .. toctree::
6 | :maxdepth: 4
7 |
8 | toad
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/reference.md:
--------------------------------------------------------------------------------
1 | # ChiMerge
2 |
3 | [https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf](https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf)
4 |
5 | The ChiMerge algorithm uses the Chi-squared statistic to discretize numeric attributes. In toad, we first transform Char/Object attributes to numeric values with the WOE function. The algorithm is clearly described in the paper (i.e. the ChiMerge Algorithm part), and a usage sketch follows.
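
In toad, chi-squared binning is exposed through `Combiner` (usage as in the README). A minimal sketch, with `data` and `'target'` as assumed placeholders:

```python
import toad

c = toad.transform.Combiner()
c.fit(data, y = 'target', method = 'chi', min_samples = 0.05)
print(c.export())  # inspect the learned split points per feature
```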
6 |
7 | # Stepwise Regression
8 |
9 | [https://link.springer.com/article/10.1007%2FBF02576123](https://link.springer.com/article/10.1007%2FBF02576123) [1]
10 |
11 | [https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub) [2]
12 |
13 | [http://www.jstor.org/stable/1434071](http://www.jstor.org/stable/1434071) [3]
14 |
15 | Stepwise Regression (Forward/Backward/Stepwise, i.e. [2] 3.6. Stepwise Linear Regression) is used to remove low-information-gain attributes and simplify the final model.
16 |
17 | The Stepwise Regression process [2]:
18 |
19 | ```eval_rst
20 | .. image:: images/stepwise.png
21 | :width: 80%
22 | :align: center
23 | ```
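
A usage sketch of toad's stepwise selection, with the signature shown in the README (`data_woe` and `to_drop` are assumed placeholders):

```python
final_data = toad.selection.stepwise(
    data_woe,               # WOE-transformed features plus target
    target = 'target',
    estimator = 'ols',      # base estimator
    direction = 'both',     # forward / backward / stepwise
    criterion = 'aic',      # selection criterion
    exclude = to_drop,      # columns to keep out of the search
)
```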
24 |
25 | # Scorecard Transformation
26 |
27 | John Wiley & Sons, Inc., *Credit Risk Scorecards Developing and Implementing Intelligent Credit Scoring* (Final Scorecard Production Part)
28 |
29 |
30 |
31 | Formula:
32 |
33 | Score = Offset + Factor ∗ ln(odds)  (odds = good:bad)
34 |
35 | Score + pdo = Offset + Factor ∗ ln(2 ∗ odds)  (pdo = points to double the odds)
36 |
37 | ==>
38 |
39 | pdo = Factor ∗ ln(2),
40 |
41 | Factor = pdo / ln(2);
42 |
43 | Offset = Score - Factor ∗ ln(odds)
44 |
45 | For example, if a scorecard were being scaled where the user wanted odds of 50:1 at 600 points and wanted the odds to double every 20 points (i.e., pdo = 20), the factor and offset would be:
50 |
51 | Factor = 20 / ln(2) = 28.8539
52 |
53 | Offset = 600 - 28.8539 * ln(50) = 487.123
54 |
55 | ==>
56 |
57 | The score corresponding to each set of odds:
58 |
59 | Score = 487.123 + 28.8539 * ln(odds)
60 |
61 | The scorecard is developed with WOE as input, so the formula can be modified as:
62 |
63 | ```eval_rst
64 | .. image:: images/scorecard.png
65 | :width: 80%
66 | :align: center
67 | ```
68 |
69 | WOE = weight of evidence for each grouped attribute
70 |
71 | β = regression coefficient for each characteristic
72 |
73 | a = intercept term from logistic regression
74 |
75 | n = number of characteristics
76 |
77 | k = number of groups (of attributes) in each characteristic
78 |
--------------------------------------------------------------------------------
/docs/source/toad.detector.md:
--------------------------------------------------------------------------------
1 | ## toad.detector module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.detector
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.md:
--------------------------------------------------------------------------------
1 | ## toad package
2 |
3 |
4 |
5 | ## Submodules
6 |
7 |
8 | ```eval_rst
9 | .. toctree::
10 |
11 | toad.detector
12 | toad.merge
13 | toad.metrics
14 | toad.plot
15 | toad.scorecard
16 | toad.selection
17 | toad.stats
18 | toad.transform
19 | toad.preprocessing
20 | toad.nn
21 | toad.utils
22 | ```
23 |
24 | ## Module contents
25 |
26 | ```eval_rst
27 | .. automodule:: toad
28 | :members:
29 | :special-members: __init__
30 | :show-inheritance:
31 | ```
32 |
--------------------------------------------------------------------------------
/docs/source/toad.merge.md:
--------------------------------------------------------------------------------
1 | ## toad.merge module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.merge
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.metrics.md:
--------------------------------------------------------------------------------
1 | ## toad.metrics module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.metrics
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.nn.functional.md:
--------------------------------------------------------------------------------
1 | ## toad.nn.functional module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.nn.functional
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.nn.md:
--------------------------------------------------------------------------------
1 | ## toad.nn module
2 |
3 |
4 | ```eval_rst
5 | .. toctree::
6 | toad.nn.module
7 | toad.nn.functional
8 | toad.nn.trainer
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.nn.module.md:
--------------------------------------------------------------------------------
1 | ## toad.nn.module module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.nn.module
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.nn.trainer.md:
--------------------------------------------------------------------------------
1 | ## toad.nn.trainer module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.nn.trainer
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.plot.md:
--------------------------------------------------------------------------------
1 | ## toad.plot module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.plot
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.preprocessing.md:
--------------------------------------------------------------------------------
1 | ## toad.preprocessing module
2 |
3 |
4 | ```eval_rst
5 | .. toctree::
6 | toad.preprocessing.process
7 | toad.preprocessing.partition
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.preprocessing.partition.md:
--------------------------------------------------------------------------------
1 | ## toad.preprocessing.partition module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.preprocessing.partition
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.preprocessing.process.md:
--------------------------------------------------------------------------------
1 | ## toad.preprocessing.process module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.preprocessing.process
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.scorecard.md:
--------------------------------------------------------------------------------
1 | ## toad.scorecard module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.scorecard
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.selection.md:
--------------------------------------------------------------------------------
1 | ## toad.selection module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.selection
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.stats.md:
--------------------------------------------------------------------------------
1 | ## toad.stats module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.stats
6 | :members:
7 | :special-members: __init__
8 | :show-inheritance:
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.transform.md:
--------------------------------------------------------------------------------
1 | ## toad.transform module
2 |
3 |
4 | ```eval_rst
5 | .. automodule:: toad.transform
6 | :members:
7 | :special-members: __init__
8 | :inherited-members:
9 | :show-inheritance:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/source/toad.utils.decorator.md:
--------------------------------------------------------------------------------
1 | ## toad.utils.decorator module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.utils.decorator
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.utils.func.md:
--------------------------------------------------------------------------------
1 | ## toad.utils.func module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.utils.func
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/toad.utils.md:
--------------------------------------------------------------------------------
1 | ## toad.utils module
2 |
3 |
4 | ```eval_rst
5 | .. toctree::
6 | toad.utils.func
7 | toad.utils.decorator
8 | toad.utils.mixin
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/source/toad.utils.mixin.md:
--------------------------------------------------------------------------------
1 | ## toad.utils.mixin module
2 |
3 | ```eval_rst
4 | .. automodule:: toad.utils.mixin
5 | :members:
6 | :special-members: __init__
7 | :show-inheritance:
8 | ```
9 |
--------------------------------------------------------------------------------
/images/toad_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/images/toad_logo.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "toad"
3 | description = "Toad is dedicated to facilitating model development process, especially for a scorecard."
4 | requires-python = ">=3.9"
5 | license = {file = "LICENSE"}
6 | authors = [{name = "Secbone", email = "secbone@gmail.com"}]
7 | classifiers = [
8 | 'Operating System :: POSIX',
9 | 'Operating System :: Microsoft :: Windows',
10 | 'Operating System :: MacOS :: MacOS X',
11 | 'Programming Language :: Python :: 3.9',
12 | 'Programming Language :: Python :: 3.10',
13 | 'Programming Language :: Python :: 3.11',
14 | 'Programming Language :: Python :: 3.12',
15 | ]
16 | dynamic = [
17 | "version",
18 | "readme",
19 | "dependencies",
20 | "optional-dependencies",
21 | "entry-points",
22 | ]
23 |
24 | [tool.setuptools.dynamic]
25 | readme = {file = ["README.md"], content-type = "text/markdown"}
26 | dependencies = {file = ["requirements.txt"]}
27 | optional-dependencies = {nn = {file = ["requirements-nn.txt"]}, tools = {file = ["requirements-tools.txt"]}, all = {file = ["requirements-nn.txt", "requirements-tools.txt"]} }
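# the extras above install as e.g. `pip install "toad[nn]"` or `pip install "toad[all]"`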
28 |
29 | [build-system]
30 | requires = [
31 | "setuptools",
32 | "Cython >= 0.29.15",
33 | "numpy >= 1.20",
34 | "wheel",
35 | "twine",
36 | ]
37 | build-backend = "setuptools.build_meta"
38 |
39 | # entry points are provided dynamically via setup.py:
40 | # toad = "toad.cli:main"
41 |
42 | [tool.setuptools.packages.find]
43 | exclude = ["tests"]
44 |
45 | [project.urls]
46 | Homepage = "https://github.com/amphibian-dev/toad"
47 | Documentation = "https://toad.readthedocs.io/en/stable/"
48 | Repository = "https://github.com/amphibian-dev/toad.git"
49 | Issues = "https://github.com/amphibian-dev/toad/issues"
50 | Changelog = "https://github.com/amphibian-dev/toad/blob/master/CHANGELOG.md"
51 |
--------------------------------------------------------------------------------
/requirements-dist.txt:
--------------------------------------------------------------------------------
1 | wheel
2 | twine
3 |
--------------------------------------------------------------------------------
/requirements-nn.txt:
--------------------------------------------------------------------------------
1 | torch >= 1.8.1
2 | torchvision >= 0.9.1
3 | numpy < 2.0 ; sys_platform == "darwin"
4 |
--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-timeout
3 |
--------------------------------------------------------------------------------
/requirements-tools.txt:
--------------------------------------------------------------------------------
1 | cloudpickle
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython <= 0.29.15 ; python_version < "3.10"
2 | Cython >= 0.29.15 ; python_version >= "3.10"
3 | numpy <= 1.24 ; python_version < "3.10"
4 | numpy >= 1.24 ; python_version >= "3.10"
5 | pandas >= 1.5
6 | scipy
7 | joblib >= 0.12
8 | scikit-learn >= 0.21
9 | seaborn >= 0.10.0
10 | setuptools
11 |
--------------------------------------------------------------------------------
/scripts/build_wheels.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -x
3 |
4 |
5 | # Compile wheels
6 | for PYBIN in /opt/python/cp3[5678]*/bin; do
7 | "${PYBIN}/pip" install -r /io/dev-requirements.txt
8 | "${PYBIN}/pip" wheel --no-deps /io/ -w /dist/
9 | done
10 |
11 | # Bundle external shared libraries into the wheels
12 | for whl in /dist/toad*.whl; do
13 | auditwheel repair "$whl" --plat $PLAT -w /io/dist/
14 | done
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [build_ext]
2 | inplace = 1
3 |
4 | [bdist_wheel]
5 | universal=1
6 |
7 | [aliases]
8 | test=pytest
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from setuptools import setup, find_packages, Extension
4 |
5 |
6 | NAME = 'toad'
7 |
8 |
9 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__))
10 | VERSION_FILE = os.path.join(CURRENT_PATH, NAME, 'version.py')
11 |
12 | def get_version():
13 | ns = {}
14 | with open(VERSION_FILE) as f:
15 | exec(f.read(), ns)
16 | return ns['__version__']
17 |
18 |
19 | def get_ext_modules():
20 | from Cython.Build import cythonize
21 |
22 | extensions = [
23 | Extension('toad.c_utils', sources = ['toad/c_utils.pyx'], include_dirs = [np.get_include()]),
24 | Extension('toad.merge', sources = ['toad/merge.pyx'], include_dirs = [np.get_include()]),
25 | ]
26 |
27 | return cythonize(extensions)
28 |
29 |
30 | def get_requirements(stage = None):
31 | file_name = 'requirements'
32 |
33 | if stage is not None:
34 | file_name = f"{file_name}-{stage}"
35 |
36 | requirements = []
37 | with open(f"{file_name}.txt", 'r') as f:
38 | for line in f:
39 | line = line.strip()
40 | if not line or line.startswith('-'):
41 | continue
42 |
43 | requirements.append(line)
44 |
45 | return requirements
46 |
47 |
48 | setup(
49 | name = NAME,
50 | version = get_version(),
51 | description = 'Toad is dedicated to facilitating model development process, especially for a scorecard.',
52 | long_description = open('README.md', encoding = 'utf-8').read(),
53 | long_description_content_type = 'text/markdown',
54 | url = 'https://github.com/amphibian-dev/toad',
55 | author = 'ESC Team',
56 | author_email = 'secbone@gmail.com',
57 | packages = find_packages(exclude = ['tests']),
58 | include_dirs = [np.get_include()],
59 | ext_modules = get_ext_modules(),
60 | include_package_data = True,
61 | python_requires = '>=3.9',
62 | setup_requires = ['numpy'],
63 | tests_require = get_requirements('test'),
64 | license = 'MIT',
65 | classifiers = [
66 | 'Operating System :: POSIX',
67 | 'Operating System :: Microsoft :: Windows',
68 | 'Operating System :: MacOS :: MacOS X',
69 | 'Programming Language :: Python :: 3.9',
70 | 'Programming Language :: Python :: 3.10',
71 | 'Programming Language :: Python :: 3.11',
72 | 'Programming Language :: Python :: 3.12',
74 | ],
75 | entry_points = {
76 | 'console_scripts': [
77 | 'toad = toad.cli:main',
78 | ],
79 | },
80 | )
81 |
--------------------------------------------------------------------------------
/toad/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .merge import merge, DTMerge, ChiMerge, StepMerge, QuantileMerge, KMeansMerge
3 | except ImportError:
4 | import warnings
5 | warnings.warn(
6 | """`merge` module need to be builded""",
7 | ImportWarning,
8 | )
9 |
10 | from .detector import detect
11 | from .metrics import KS, KS_bucket, F1
12 | from .stats import quality, IV, VIF, WOE, entropy, entropy_cond, gini, gini_cond
13 | from .transform import Combiner, WOETransformer
14 | from .selection import select
15 | from .scorecard import ScoreCard
16 | from .utils import Progress, performance
17 | from .version import __version__
18 |
19 | VERSION = __version__
20 |
--------------------------------------------------------------------------------
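
The package root above re-exports the main workflow pieces. A minimal, tutorial-style sketch of how they fit together (the file path, target column name and all parameter values are illustrative assumptions):

    import pandas as pd
    import toad

    df = pd.read_csv('tests/test_data.csv')       # assumes a binary 'target' column

    report = toad.detect(df)                      # per-column EDA report
    feature_quality = toad.quality(df, target='target')   # IV / gini / entropy
    selected = toad.select(df, target='target', empty=0.9, iv=0.02, corr=0.7)

    c = toad.Combiner()
    c.fit(selected, y='target', method='chi', min_samples=0.05)
    woe = toad.WOETransformer().fit_transform(
        c.transform(selected), selected['target'], exclude='target')
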
/toad/c_utils.pxd:
--------------------------------------------------------------------------------
1 | ctypedef fused number:
2 | int
3 | double
4 | long
5 |
6 |
7 | cdef number c_min(number[:] arr)
8 |
9 | cdef number c_sum(number[:,:] arr)
10 |
11 | cdef number[:] c_sum_axis_0(number[:,:] arr)
12 |
13 | cdef number[:] c_sum_axis_1(number[:,:] arr)
14 |
--------------------------------------------------------------------------------
/toad/c_utils.pyx:
--------------------------------------------------------------------------------
1 | # cython: language_level = 3, infer_types = True, boundscheck = False
2 |
3 | import numpy as np
4 | cimport numpy as np
5 | cimport cython
6 |
7 |
8 |
9 | cdef number c_min(number[:] arr):
10 | cdef number res = np.inf
11 |
12 | for i in range(arr.shape[0]):
13 | if res > arr[i]:
14 | res = arr[i]
15 | return res
16 |
17 |
18 | cdef number c_sum(number[:,:] arr):
19 | cdef number res = 0
20 |
21 | cdef Py_ssize_t i,j
22 | for i in range(arr.shape[0]):
23 | for j in range(arr.shape[1]):
24 | res += arr[i, j]
25 |
26 | return res
27 |
28 |
29 | cdef number[:] c_sum_axis_0(number[:,:] arr):
30 | cdef number[:] res = np.zeros(arr.shape[1], dtype=float)
31 |
32 | for i in range(arr.shape[0]):
33 | for j in range(arr.shape[1]):
34 | res[j] += arr[i, j]
35 |
36 | return res
37 |
38 |
39 | cdef number[:] c_sum_axis_1(number[:,:] arr):
40 | cdef number[:] res = np.zeros(arr.shape[0], dtype=float)
41 |
42 | for i in range(arr.shape[0]):
43 | for j in range(arr.shape[1]):
44 | res[i] += arr[i, j]
45 |
46 | return res
47 |
--------------------------------------------------------------------------------
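
The fused-type helpers above are hand-rolled reductions used by the Cython hot paths. For reference, a quick sketch of their NumPy equivalents (note that c_min takes a 1-d memoryview):

    import numpy as np

    arr = np.arange(6, dtype=float).reshape(2, 3)   # [[0,1,2],[3,4,5]]

    assert arr.ravel().min() == 0.0                         # c_min
    assert arr.sum() == 15.0                                # c_sum
    assert (arr.sum(axis=0) == [3., 5., 7.]).all()          # c_sum_axis_0
    assert (arr.sum(axis=1) == [3., 12.]).all()             # c_sum_axis_1
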
/toad/cli.py:
--------------------------------------------------------------------------------
1 | """
2 | toad command line application
3 | """
4 | import argparse
5 | from .commands import get_plugins
6 |
7 |
8 | def add_sub(parsers, config):
9 |     """add a sub parser from a config dict
10 | """
11 | info = config.get('info', {})
12 | args = config.get('args', [])
13 | defaults = config.get('defaults', None)
14 |
15 | sub_parser = parsers.add_parser(**info)
16 |
17 | for detail in args:
18 | flag = detail.pop('flag')
19 | sub_parser.add_argument(*flag, **detail)
20 |
21 | if defaults:
22 | sub_parser.set_defaults(**defaults)
23 |
24 |
25 | def get_parser():
26 | """get parser
27 | """
28 | parser = argparse.ArgumentParser(
29 | prog = 'toad',
30 |         description = 'toad command line tools',
31 | )
32 |
33 | subparsers = parser.add_subparsers()
34 |
35 | plugins = get_plugins()
36 | for plug in plugins:
37 | add_sub(subparsers, plug.ARGS)
38 |
39 | return parser
40 |
41 |
42 | def main():
43 | """
44 | """
45 | parser = get_parser()
46 |
47 | args = parser.parse_args()
48 | if hasattr(args, 'func'):
49 | args.func(args)
50 |
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
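
Each command package exposes an ARGS dict in the shape `add_sub` expects: an `info` block for `add_parser`, an `args` list for `add_argument`, and a `defaults` block that binds the handler to `args.func`. A self-contained sketch of that plumbing with a hypothetical `echo` command:

    import argparse

    # hypothetical plugin config, same shape as the ARGS dicts in toad.commands
    ECHO_ARGS = {
        'info': {'name': 'echo', 'description': 'print a message'},
        'defaults': {'func': lambda args: print(args.message)},
        'args': [
            {'flag': ('-m', '--message'), 'type': str, 'default': 'hi'},
        ],
    }

    parser = argparse.ArgumentParser(prog='demo')
    sub = parser.add_subparsers()

    p = sub.add_parser(**ECHO_ARGS['info'])
    for detail in ECHO_ARGS['args']:
        detail = dict(detail)               # copy: add_sub pops 'flag' in place
        p.add_argument(*detail.pop('flag'), **detail)
    p.set_defaults(**ECHO_ARGS['defaults'])

    args = parser.parse_args(['echo', '-m', 'hello'])
    args.func(args)                         # prints: hello
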
/toad/cli_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | import pyximport
6 |
7 | pyximport.install(setup_args={"include_dirs": np.get_include()})
8 |
9 | from toad.cli import get_parser
10 |
11 | def disable_stdout(fn):
12 |
13 | def wrapper(*args):
14 | import os
15 | import sys
16 |
17 | with open(os.devnull, 'w') as f:
18 | so = sys.stdout
19 | sys.stdout = f
20 |
21 | fn(*args)
22 |
23 | sys.stdout = so
24 |
25 | return wrapper
26 |
27 |
28 | parser = get_parser()
29 |
30 |
31 |
32 | @disable_stdout
33 | def test_detect():
34 | args = parser.parse_args(['detect', '-i', 'tests/test_data.csv'])
35 | rep = args.func(args)
36 | assert rep.loc['E', 'unique'] == 20
37 |
38 | @pytest.mark.skip("tree command will generate a pic in travis-ci log")
39 | @disable_stdout
40 | def test_tree():
41 | args = parser.parse_args(['tree', '-i', 'tests/test_data.csv'])
42 | args.func(args)
43 | pass
44 |
--------------------------------------------------------------------------------
/toad/commands/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pkgutil
3 | from importlib import import_module
4 |
5 | COMMAND_DIR = os.path.dirname(os.path.abspath(__file__))
6 |
7 | def get_plugins():
8 | plugins = []
9 |
10 | for _, name, ispkg in pkgutil.iter_modules([COMMAND_DIR]):
11 | if ispkg:
12 | module = import_module('toad.commands.{}'.format(name))
13 | plugins.append(module)
14 |
15 | return plugins
16 |
--------------------------------------------------------------------------------
/toad/commands/detect/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import pandas as pd
4 |
5 | def func(args):
6 | """detect csv data
7 |
8 | Examples:
9 |
10 | toad detect -i xxx.csv -o report.csv
11 | """
12 | from toad.detector import detect
13 |
14 | sys.stdout.write('reading data....\n')
15 | with args.input as input:
16 | data = pd.read_csv(input)
17 |
18 | sys.stdout.write('detecting...\n')
19 | report = detect(data)
20 |
21 | if args.output:
22 | sys.stdout.write('saving report...\n')
23 | report.to_csv(args.output)
24 | sys.stdout.write('report saved!\n')
25 | else:
26 | sys.stdout.write(str(report))
27 | sys.stdout.write('\n')
28 |
29 | return report
30 |
31 | ARGS = {
32 | 'info': {
33 | 'name': 'detect',
34 | 'description': 'detect data from a csv file',
35 | },
36 | 'defaults': {
37 | 'func': func,
38 | },
39 | 'args': [
40 | {
41 | 'flag': ('-i', '--input'),
42 | 'type': argparse.FileType(),
43 | 'help': 'the csv file which will be detected',
44 | 'required': True,
45 | },
46 | {
47 | 'flag': ('-o', '--output'),
48 | 'type': argparse.FileType('w'),
49 |             'help': 'path where the csv report will be saved',
50 | },
51 | ]
52 | }
53 |
--------------------------------------------------------------------------------
/toad/commands/evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import pandas as pd
4 |
5 | def func(args):
6 |     """evaluate csv data
7 |
8 | Examples:
9 |
10 | toad evaluate -i xxx.csv
11 | """
12 | from .evaluate import evaluate
13 |
14 |
15 | sys.stdout.write('reading data....\n')
16 |
17 | test_data = pd.read_csv(args.input)
18 | if args.base is not None:
19 | self_data = pd.read_csv(args.base)
20 | else:
21 | self_data = None
22 |
23 | arguments = {
24 | 'excel_name': args.name,
25 | 'num': args.top,
26 | 'iv_threshold_value': args.iv,
27 | 'unique_num': args.unique,
28 | 'self_data': self_data,
29 | 'overdue_days': args.overdue,
30 | }
31 |
32 | evaluate(test_data, **arguments)
33 |
34 |
35 | ARGS = {
36 | 'info': {
37 | 'name': 'evaluate',
38 |         'description': 'evaluate third-party data',
39 | },
40 | 'defaults': {
41 | 'func': func,
42 | },
43 | 'args': [
44 | {
45 | 'flag': ('-i', '--input'),
46 | 'type': argparse.FileType('r', encoding='utf-8'),
47 |             'help': 'the csv file to be evaluated',
48 | 'required': True,
49 | },
50 | {
51 | 'flag': ('--base',),
52 | 'type': argparse.FileType('r', encoding='utf-8'),
53 |             'help': 'baseline csv data file used to measure lift',
54 | 'default': None,
55 | },
56 | {
57 | 'flag': ('--overdue',),
58 |             'help': 'enable overdue-days analysis',
59 | 'action': 'store_true',
60 | },
61 | {
62 | 'flag': ('--top',),
63 | 'type': int,
64 |             'help': 'analyse the top n variables ranked by IV',
65 | 'default': 10,
66 | },
67 | {
68 | 'flag': ('--iv',),
69 | 'type': float,
70 |             'help': 'analyse variables whose IV is above this threshold',
71 | 'default': 0.02,
72 | },
73 | {
74 | 'flag': ('--unique',),
75 | 'type': int,
76 |             'help': 'bin continuous variables into n groups for analysis',
77 | 'default': 10,
78 | },
79 | {
80 | 'flag': ('--name',),
81 | 'type': str,
82 |             'help': 'file name of the generated report',
83 | 'default': 'report.xlsx',
84 | },
85 | ]
86 | }
87 |
--------------------------------------------------------------------------------
/toad/commands/tree/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import pandas as pd
4 |
5 | def func(args):
6 |     """analyse bad rate from csv data
7 |
8 | Examples:
9 |
10 | toad tree -i xxx.csv
11 | """
12 | import toad
13 | from .tree import split_data, dtree
14 | args = vars(args)
15 |
16 | # remove func attribute
17 | args.pop('func')
18 |
19 | input = args.pop('input')
20 | target = args.pop('target')
21 | include = args.pop('include')
22 | exclude = args.pop('exclude')
23 |
24 | sys.stdout.write('reading data....\n')
25 | data = pd.read_csv(input)
26 |
27 | X, *tars = split_data(data, target = target)
28 |
29 | if include is not None:
30 | X = X[include]
31 |
32 | if exclude is not None:
33 | X = X.drop(columns = exclude)
34 |
35 | X = toad.utils.get_dummies(X)
36 |
37 |
38 | for t in tars:
39 | sys.stdout.write('analyse '+ t.name +' ...\n')
40 | dtree(X, t, **args)
41 |
42 |
43 | ARGS = {
44 | 'info': {
45 | 'name': 'tree',
46 | 'description': 'analyse bad rate from a csv file',
47 | },
48 | 'defaults': {
49 | 'func': func,
50 | },
51 | 'args': [
52 | {
53 | 'flag': ('-i', '--input'),
54 | 'type': argparse.FileType('r', encoding='utf-8'),
55 | 'help': 'the csv file which will be analysed',
56 | 'required': True,
57 | },
58 | {
59 | 'flag': ('-t', '--target'),
60 | 'nargs': '+',
61 |             'help': 'the target(s) that will be analysed',
62 | 'default': 'target',
63 | },
64 | {
65 | 'flag': ('-c', '--criterion'),
66 | 'type': str,
67 | 'help': 'criterion to measure the quality of a split. Support "gini" (default), "entropy"',
68 | 'default': 'gini',
69 | },
70 | {
71 | 'flag': ('-d', '--depth'),
72 | 'type': int,
73 | 'help': 'the maximum depth of the tree',
74 | 'default': None,
75 | },
76 | {
77 | 'flag': ('-s', '--sample'),
78 | 'type': float,
79 | 'help': 'minimum number of sample in each node',
80 | 'default': 0.01,
81 | },
82 | {
83 | 'flag': ('-r', '--ratio'),
84 | 'type': float,
85 | 'help': 'threshold of ratio that will be highlighted',
86 | 'default': 0.15,
87 | },
88 | {
89 | 'flag': ('--exclude',),
90 | 'nargs': '+',
91 |             'help': 'feature names that will not be used in the analysis',
92 | 'default': None,
93 | },
94 | {
95 | 'flag': ('--include',),
96 | 'nargs': '+',
97 |             'help': 'feature names that will be used in the analysis',
98 | 'default': None,
99 | },
100 | ]
101 | }
102 |
--------------------------------------------------------------------------------
/toad/commands/tree/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Windows:
3 | conda install python-graphviz
4 | Mac:
5 | brew install graphviz
6 | pip install graphviz
7 | """
8 |
9 | import numpy as np
10 | import pandas as pd
11 |
12 | import graphviz
13 |
14 | import sklearn
15 | from sklearn.tree import DecisionTreeClassifier
16 |
17 |
18 | def tree_to_dot(tree, features, high_light = 0.15):
19 | from io import StringIO
20 | from sklearn.tree import _tree
21 |
22 | out = StringIO()
23 | tree_ = tree.tree_
24 |
25 | features = np.array([
26 | features[i] if i != _tree.TREE_UNDEFINED else "undefined!"
27 | for i in tree_.feature
28 | ])
29 |
30 | out.write('digraph Tree {\n')
31 | out.write('edge [fontname="FangSong"];\n')
32 | out.write('node [shape=box];\n')
33 |
34 | def recurse(node, parent = None, label = None):
35 | sample = tree_.n_node_samples[node]
36 | bad_rate = tree_.value[node][0,1] / sample
37 |
38 | out.write('{} [label="'.format(node))
39 |
40 | out.write('bad rate: {:.2%}\n'.format(bad_rate))
41 | out.write('sample: {:.2%}\n'.format(sample / tree_.n_node_samples[0]))
42 |
43 | # end of label
44 | out.write('"')
45 |
46 | if bad_rate > high_light:
47 | out.write(', color="red"')
48 |
49 | # end of node
50 | out.write('];\n')
51 |
52 | if tree_.feature[node] != _tree.TREE_UNDEFINED:
53 | name = features[node]
54 | threshold = tree_.threshold[node]
55 | recurse(tree_.children_left[node], node, '{} <= {:.2f}'.format(name, threshold))
56 | recurse(tree_.children_right[node], node, '{} > {:.2f}'.format(name, threshold))
57 |
58 | if parent is not None:
59 | out.write('{} -> {} [label="{}"];\n'.format(parent, node, label))
60 |
61 | recurse(0, None)
62 |
63 | out.write('}')
64 | s = out.getvalue()
65 | out.close()
66 | return s
67 |
68 |
69 | def dot_to_img(dot, file = 'report.png'):
70 | import os
71 |
72 | name, ext = os.path.splitext(file)
73 |
74 | graph = graphviz.Source(dot)
75 | graph.format = ext[1:]
76 | graph.view(name, cleanup = True)
77 |
78 |
79 | def split_data(frame, target = 'target'):
80 | X = frame.drop(columns = target)
81 |
82 | res = (X,)
83 | if isinstance(target, str):
84 | target = [target]
85 |
86 | for col in target:
87 | res += (frame[col],)
88 |
89 | return res
90 |
91 |
92 | def dtree(frame, target, criterion = 'gini', depth = None, sample = 0.01, ratio = 0.15):
93 | tree = DecisionTreeClassifier(
94 | criterion = criterion,
95 | min_samples_leaf = sample,
96 | max_depth = depth,
97 | )
98 |
99 | tree.fit(frame.fillna(-1), target)
100 |
101 | dot_string = tree_to_dot(tree, frame.columns.values, high_light = ratio)
102 |
103 | dot_to_img(dot_string, file = target.name + '.png')
104 |
--------------------------------------------------------------------------------
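
`split_data` above returns the feature frame plus one Series per target column, so `dtree` can be run once per target. A minimal sketch of the fitting step on toy data (graphviz rendering omitted; all values are illustrative):

    import numpy as np
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.default_rng(0)
    frame = pd.DataFrame({
        'A': rng.random(100),
        'B': rng.random(100),
        'target': rng.integers(0, 2, 100),
    })

    X = frame.drop(columns='target')
    y = frame['target']

    # same estimator settings dtree() uses; NaNs are filled with -1 before fitting
    tree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=0.01)
    tree.fit(X.fillna(-1), y)
    print(tree.tree_.node_count)
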
/toad/detector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """Command line tools for detecting csv data
4 |
5 | Team: ESC
6 |
7 | Examples:
8 |
9 | python detector.py -i xxx.csv -o report.csv
10 |
11 | """
12 |
13 | import pandas as pd
14 |
15 | def getTopValues(series, top = 5, reverse = False):
16 | """Get top/bottom n values
17 |
18 | Args:
19 | series (Series): data series
20 | top (number): number of top/bottom n values
21 | reverse (bool): it will return bottom n values if True is given
22 |
23 | Returns:
24 | Series: Series of top/bottom n values and percentage. ['value:percent', None]
25 | """
26 | itype = 'top'
27 | counts = series.value_counts()
28 | counts = list(zip(counts.index, counts, counts.divide(series.size)))
29 |
30 | if reverse:
31 | counts.reverse()
32 | itype = 'bottom'
33 |
34 | template = "{0[0]}:{0[2]:.2%}"
35 | indexs = [itype + str(i + 1) for i in range(top)]
36 | values = [template.format(counts[i]) if i < len(counts) else None for i in range(top)]
37 |
38 | return pd.Series(values, index = indexs)
39 |
40 |
41 | def getDescribe(series, percentiles = [.25, .5, .75]):
42 | """Get describe of series
43 |
44 | Args:
45 | series (Series): data series
46 | percentiles: the percentiles to include in the output
47 |
48 | Returns:
49 | Series: the describe of data include mean, std, min, max and percentiles
50 | """
51 | d = series.describe(percentiles)
52 | return d.drop('count')
53 |
54 |
55 | def countBlank(series, blanks = []):
56 | """Count number and percentage of blank values in series
57 |
58 | Args:
59 | series (Series): data series
60 | blanks (list): list of blank values
61 |
62 | Returns:
63 | number: number of blanks
64 | str: the percentage of blank values
65 | """
66 | if len(blanks)>0:
67 |         isnull = series.isin(blanks) | series.isnull()
68 | else:
69 | isnull = series.isnull()
70 | n = isnull.sum()
71 | ratio = isnull.mean()
72 |
73 | return (n, "{0:.2%}".format(ratio))
74 |
75 |
76 | def isNumeric(series):
77 | """Check if the series's type is numeric
78 |
79 | Args:
80 | series (Series): data series
81 |
82 | Returns:
83 | bool
84 | """
85 | return series.dtype.kind in 'ifc'
86 |
87 |
88 | def detect(dataframe):
89 | """ Detect data
90 |
91 | Args:
92 | dataframe (DataFrame): data that will be detected
93 |
94 | Returns:
95 | DataFrame: report of detecting
96 | """
97 |
98 | rows = []
99 | for name, series in dataframe.items():
100 | numeric_index = ['mean', 'std', 'min', '1%', '10%', '50%', '75%', '90%', '99%', 'max']
101 | discrete_index = ['top1', 'top2', 'top3', 'top4', 'top5', 'bottom5', 'bottom4', 'bottom3', 'bottom2', 'bottom1']
102 |
103 | details_index = [numeric_index[i] + '_or_' + discrete_index[i] for i in range(len(numeric_index))]
104 | details = []
105 |
106 | if isNumeric(series):
107 | desc = getDescribe(
108 | series,
109 | percentiles = [.01, .1, .5, .75, .9, .99]
110 | )
111 | details = desc.tolist()
112 | else:
113 | top5 = getTopValues(series)
114 | bottom5 = getTopValues(series, reverse = True)
115 | details = top5.tolist() + bottom5[::-1].tolist()
116 |
117 | # print(details_index)
118 | nblank, pblank = countBlank(series)
119 |
120 | row = pd.Series(
121 | index = ['type', 'size', 'missing', 'unique'] + details_index,
122 | data = [series.dtype, series.size, pblank, series.nunique()] + details
123 | )
124 |
125 | row.name = name
126 | rows.append(row)
127 |
128 | return pd.DataFrame(rows)
129 |
--------------------------------------------------------------------------------
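
`detect` builds one report row per column: dtype, size, missing rate and unique count, followed by percentiles for numeric columns or top/bottom values for categorical ones. A quick sketch on a toy frame (values are illustrative):

    import pandas as pd
    from toad.detector import detect

    df = pd.DataFrame({
        'amount': [10.0, 20.0, None, 40.0],     # numeric -> percentile details
        'grade': ['A', 'B', 'A', None],         # object -> top/bottom details
    })

    report = detect(df)
    print(report[['type', 'size', 'missing', 'unique']])
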
/toad/impute.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from pandas.api.types import is_numeric_dtype
4 | from sklearn.experimental import enable_iterative_imputer
5 | from sklearn.impute import IterativeImputer
6 | from sklearn.ensemble import RandomForestRegressor
7 | from sklearn.preprocessing import LabelEncoder
8 |
9 |
10 |
11 | def impute(df):
12 | imputer = Imputer(
13 | estimator = RandomForestRegressor(),
14 | random_state = 1,
15 | )
16 |
17 | return imputer.fit_transform(df)
18 |
19 |
20 | class Imputer(IterativeImputer):
21 | def __init__(self, missing_values = np.nan, **kwargs):
22 | super().__init__(missing_values = np.nan, **kwargs)
23 |
24 | if not isinstance(missing_values, list):
25 | missing_values = [missing_values]
26 |
27 | self.missing_values_list = missing_values
28 | self.encoder_dict = dict()
29 |
30 | def _impute_one_feature(self, X_filled, mask_missing_values, feat_idx,
31 | neighbor_feat_idx, **kwargs):
32 |
33 | return super()._impute_one_feature(X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, **kwargs)
34 |
35 | def fit_transform(self, X, **kwargs):
36 | X, mask = self._replace_empty(X)
37 | X = self._fit_encode(X, mask)
38 |
39 | res = super().fit_transform(X, **kwargs)
40 | res = pd.DataFrame(res, columns = X.columns)
41 | return self._decode(res)
42 |
43 |
44 | def transform(self, X, **kwargs):
45 | X, mask = self._replace_empty(X)
46 | X = self._encode(X, mask)
47 |
48 | res = super().transform(X, **kwargs)
49 | res = pd.DataFrame(res, columns = X.columns)
50 | return self._decode(res)
51 |
52 |
53 | def _replace_empty(self, X):
54 | mask = X.isin(self.missing_values_list)
55 | X = X.where(~mask, np.nan)
56 | return X, mask
57 |
58 | def _fit_encode(self, X, mask):
59 | """fit encoder for object data
60 |
61 | Args:
62 | X (DataFrame)
63 | mask (Mask): empty mask for X
64 | """
65 | category_data = X.select_dtypes(exclude = np.number).columns
66 |
67 | for col in category_data:
68 | unique, X[col].loc[~mask[col]] = np.unique(X[col][~mask[col]], return_inverse = True)
69 |
70 | self.encoder_dict[col] = unique
71 |
72 | return X
73 |
74 | def _encode(self, X, mask):
75 | """encode object data to number
76 |
77 | Args:
78 | X (DataFrame)
79 | mask (Mask): empty mask for X
80 | """
81 | for col, unique in self.encoder_dict.items():
82 | table = dict(zip(unique, np.arange(len(unique))))
83 | X[col].loc[~mask[col]] = np.array([table[v] for v in X[col][~mask[col]]])
84 |
85 | return X
86 |
87 | def _decode(self, X):
88 | """decode object data from number to origin data
89 |
90 | Args:
91 | X (DataFrame)
92 | mask (Mask): empty mask for X
93 | """
94 | for col, unique in self.encoder_dict.items():
95 | ix = X[col].values.astype(int)
96 | X[col] = unique[ix]
97 |
98 | return X
99 |
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
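
The Imputer above wraps sklearn's IterativeImputer: custom missing markers are first replaced with NaN, object columns are label-encoded before imputation and decoded back afterwards. A minimal sketch mirroring impute_test.py (toy values; the default estimator is a RandomForestRegressor):

    import numpy as np
    import pandas as pd
    from toad.impute import impute

    df = pd.DataFrame({
        'A': [1.0, np.nan, 3.0, 4.0],        # numeric with a gap
        'B': ['x', 'y', np.nan, 'x'],        # categorical with a gap
    })

    filled = impute(df)
    assert filled.isna().sum().sum() == 0    # every gap filled
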
/toad/impute_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from .impute import impute
5 |
6 |
7 | ab = np.array(list('ABCDEFG'))
8 | int_feat = np.random.randint(10, size = 500)
9 | float_feat = np.random.rand(500)
10 | str_feat = ab[np.random.choice(7, 500)]
11 | uni_feat = np.ones(500)
12 | # empty_feat = np.full(500, np.nan)
13 |
14 | target = np.random.randint(2, size = 500)
15 |
16 | df = pd.DataFrame({
17 | 'A': int_feat,
18 | 'B': str_feat,
19 | 'C': uni_feat,
20 | 'D': float_feat,
21 | # 'E': empty_feat,
22 | })
23 |
24 | mask = np.random.choice([True, False], size = 500 * 4, p = [0.95, 0.05]).reshape(500, 4)
25 | df = df.where(mask, np.nan)
26 |
27 |
28 | def test_impute_with_number():
29 | res = impute(df.drop(columns = 'B'))
30 |
31 | assert res.isna().sum().sum() == 0
32 |
33 |
34 | def test_impute_with_str():
35 | res = impute(df)
36 |
37 | assert res.isna().sum().sum() == 0
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/toad/merge_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | import pyximport
6 |
7 | pyximport.install(setup_args={"include_dirs": np.get_include()})
8 |
9 | from .merge import merge, ChiMerge, DTMerge, QuantileMerge, StepMerge, KMeansMerge
10 |
11 | np.random.seed(1)
12 | feature = np.random.rand(500)
13 | target = np.random.randint(2, size = 500)
14 | A = np.random.randint(100, size = 500)
15 | B = np.random.randint(3, size = 500)
16 |
17 | df = pd.DataFrame({
18 | 'feature': feature,
19 | 'target': target,
20 | 'A': A,
21 | })
22 |
23 |
24 |
25 | def test_chimerge():
26 | splits = ChiMerge(feature, target, n_bins = 10)
27 | assert len(splits) == 9
28 |
29 | def test_chimerge_bins_not_enough():
30 | splits = ChiMerge(B, target, n_bins = 10)
31 | assert len(splits) == 2
32 |
33 | def test_chimerge_bins_with_min_samples():
34 | splits = ChiMerge(feature, target, min_samples = 0.02)
35 | assert len(splits) == 10
36 |
37 | def test_dtmerge():
38 | splits = DTMerge(feature, target, n_bins = 10)
39 | assert len(splits) == 9
40 |
41 | def test_quantilemerge():
42 | splits = QuantileMerge(feature, n_bins = 10)
43 | assert len(splits) == 9
44 |
45 | def test_quantilemerge_not_enough():
46 | splits = QuantileMerge(B, n_bins = 10)
47 | assert len(splits) == 2
48 |
49 | def test_stepmerge():
50 | splits = StepMerge(feature, n_bins = 10)
51 | assert len(splits) == 9
52 |
53 | def test_kmeansmerge():
54 | splits = KMeansMerge(feature, n_bins = 10)
55 | assert len(splits) == 9
56 |
57 | def test_merge():
58 | res = merge(feature, target = target, method = 'chi', n_bins = 10)
59 | assert len(np.unique(res)) == 10
60 |
61 | def test_merge_frame():
62 | res = merge(df, target = 'target', method = 'chi', n_bins = 10)
63 | assert len(np.unique(res['A'])) == 10
64 |
--------------------------------------------------------------------------------
/toad/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from scipy.stats import ks_2samp
4 |
5 | from sklearn.metrics import f1_score, roc_auc_score, roc_curve
6 |
7 | from .utils import (
8 | feature_splits,
9 | iter_df,
10 | unpack_tuple,
11 | bin_by_splits,
12 | )
13 |
14 |
15 | def KS(score, target):
16 | """calculate ks value
17 |
18 | Args:
19 | score (array-like): list of score or probability that the model predict
20 | target (array-like): list of real target
21 |
22 | Returns:
23 | float: the max KS value
24 | """
25 | mask = target == 1
26 | res = ks_2samp(score[mask], score[~mask])
27 | return res[0]
28 |
29 |
30 | def KS_bucket(score, target, bucket = 10, method = 'quantile', return_splits = False, **kwargs):
31 | """calculate ks value by bucket
32 |
33 | Args:
34 | score (array-like): list of score or probability that the model predict
35 | target (array-like): list of real target
36 |         bucket (int): number of groups to bin into
37 | method (str): method to bin score. `quantile` (default), `step`
38 | return_splits (bool): if need to return splits of bucket
39 |
40 | Returns:
41 | DataFrame
42 | """
43 | df = pd.DataFrame({
44 | 'score': score,
45 | 'bad': target,
46 | })
47 |
48 | df['good'] = 1 - df['bad']
49 |
50 | bad_total = df['bad'].sum()
51 | good_total = df['good'].sum()
52 | all_total = bad_total + good_total
53 |
54 | splits = None
55 | df['bucket'] = 0
56 |
57 | if bucket is False:
58 | df['bucket'] = score
59 | elif isinstance(bucket, (list, np.ndarray, pd.Series)):
60 | # list of split pointers
61 | if len(bucket) < len(score):
62 | bucket = bin_by_splits(score, bucket)
63 |
64 | df['bucket'] = bucket
65 | elif isinstance(bucket, int):
66 | from .merge import merge
67 | df['bucket'], splits = merge(score, n_bins = bucket, method = method, return_splits = True, **kwargs)
68 |
69 | grouped = df.groupby('bucket', as_index = False)
70 |
71 | agg1 = pd.DataFrame()
72 | agg1['min'] = grouped.min()['score']
73 | agg1['max'] = grouped.max()['score']
74 | agg1['bads'] = grouped.sum()['bad']
75 | agg1['goods'] = grouped.sum()['good']
76 | agg1['total'] = agg1['bads'] + agg1['goods']
77 |
78 | agg2 = (agg1.sort_values(by = 'min')).reset_index(drop = True)
79 |
80 | agg2['bad_rate'] = agg2['bads'] / agg2['total']
81 | agg2['good_rate'] = agg2['goods'] / agg2['total']
82 |
83 | agg2['odds'] = agg2['bads'] / agg2['goods']
84 |
85 | agg2['bad_prop'] = agg2['bads'] / bad_total
86 | agg2['good_prop'] = agg2['goods'] / good_total
87 | agg2['total_prop'] = agg2['total'] / all_total
88 |
89 |
90 | cum_bads = agg2['bads'].cumsum()
91 | cum_goods = agg2['goods'].cumsum()
92 | cum_total = agg2['total'].cumsum()
93 |
94 | cum_bads_rev = agg2.loc[::-1, 'bads'].cumsum()[::-1]
95 | cum_goods_rev = agg2.loc[::-1, 'goods'].cumsum()[::-1]
96 | cum_total_rev = agg2.loc[::-1, 'total'].cumsum()[::-1]
97 |
98 | agg2['cum_bad_rate'] = cum_bads / cum_total
99 | agg2['cum_bad_rate_rev'] = cum_bads_rev / cum_total_rev
100 |
101 | agg2['cum_bads_prop'] = cum_bads / bad_total
102 | agg2['cum_bads_prop_rev'] = cum_bads_rev / bad_total
103 | agg2['cum_goods_prop'] = cum_goods / good_total
104 | agg2['cum_goods_prop_rev'] = cum_goods_rev / good_total
105 | agg2['cum_total_prop'] = cum_total / all_total
106 | agg2['cum_total_prop_rev'] = cum_total_rev / all_total
107 |
108 |
109 | agg2['ks'] = agg2['cum_bads_prop'] - agg2['cum_goods_prop']
110 |
111 | reverse_suffix = ''
112 | # fix negative ks value
113 | if agg2['ks'].sum() < 0:
114 | agg2['ks'] = -agg2['ks']
115 | reverse_suffix = '_rev'
116 |
117 | agg2['lift'] = agg2['bad_prop'] / agg2['total_prop']
118 | agg2['cum_lift'] = agg2['cum_bads_prop' + reverse_suffix] / agg2['cum_total_prop' + reverse_suffix]
119 |
120 | if return_splits and splits is not None:
121 | return agg2, splits
122 |
123 | return agg2
124 |
125 | def KS_by_col(df, by='feature', score='score', target='target'):
126 | """
127 | """
128 |
129 | pass
130 |
131 |
132 | def SSE(y_pred, y):
133 | """sum of squares due to error
134 | """
135 | return np.sum((y_pred - y) ** 2)
136 |
137 |
138 | def MSE(y_pred, y):
139 | """mean of squares due to error
140 | """
141 | return np.mean((y_pred - y) ** 2)
142 |
143 |
144 | def AIC(y_pred, y, k, llf = None):
145 | """Akaike Information Criterion
146 |
147 | Args:
148 | y_pred (array-like)
149 | y (array-like)
150 |         k (int): number of features
151 | llf (float): result of log-likelihood function
152 | """
153 | if llf is None:
154 | llf = np.log(SSE(y_pred, y))
155 |
156 | return 2 * k - 2 * llf
157 |
158 |
159 | def BIC(y_pred, y, k, llf = None):
160 | """Bayesian Information Criterion
161 |
162 | Args:
163 | y_pred (array-like)
164 | y (array-like)
165 |         k (int): number of features
166 | llf (float): result of log-likelihood function
167 | """
168 | n = len(y)
169 | if llf is None:
170 | llf = np.log(SSE(y_pred, y))
171 |
172 | return np.log(n) * k - 2 * llf
173 |
174 |
175 | def F1(score, target, split = 'best', return_split = False):
176 | """calculate f1 value
177 |
178 | Args:
179 | score (array-like)
180 | target (array-like)
181 |
182 | Returns:
183 | float: best f1 score
184 |         float: best split point (if return_split is True)
185 | """
186 | dataframe = pd.DataFrame({
187 | 'score': score,
188 | 'target': target,
189 | })
190 |
191 | if split == 'best':
192 | # find best split for score
193 | splits = feature_splits(dataframe['score'], dataframe['target'])
194 | else:
195 | splits = [split]
196 |
197 | best = 0
198 | sp = None
199 | for df, pointer in iter_df(dataframe, 'score', 'target', splits):
200 | v = f1_score(df['target'], df['score'])
201 |
202 | if v > best:
203 | best = v
204 | sp = pointer
205 |
206 | if return_split:
207 | return best, sp
208 |
209 | return best
210 |
211 |
212 | def AUC(score, target, return_curve = False):
213 | """AUC Score
214 |
215 | Args:
216 | score (array-like): list of score or probability that the model predict
217 | target (array-like): list of real target
218 | return_curve (bool): if need return curve data for ROC plot
219 |
220 | Returns:
221 | float: auc score
222 | """
223 | # fix score order
224 | if np.nanmax(score) > 1:
225 | score = -score
226 |
227 | auc = roc_auc_score(target, score)
228 |
229 | if not return_curve:
230 | return auc
231 |
232 | return (auc,) + roc_curve(target, score)
233 |
234 |
235 | def _PSI(test, base):
236 | test_prop = pd.Series(test).value_counts(normalize = True, dropna = False)
237 | base_prop = pd.Series(base).value_counts(normalize = True, dropna = False)
238 |
239 | psi = np.sum((test_prop - base_prop) * np.log(test_prop / base_prop))
240 |
241 | frame = pd.DataFrame({
242 | 'test': test_prop,
243 | 'base': base_prop,
244 | })
245 | frame.index.name = 'value'
246 |
247 | return psi, frame.reset_index()
248 |
249 |
250 |
251 | def PSI(test, base, combiner = None, return_frame = False):
252 | """calculate PSI
253 |
254 | Args:
255 | test (array-like): data to test PSI
256 | base (array-like): base data for calculate PSI
257 | combiner (Combiner|list|dict): combiner to combine data
258 | return_frame (bool): if need to return frame of proportion
259 |
260 | Returns:
261 | float|Series
262 | """
263 |
264 | if combiner is not None:
265 | if isinstance(combiner, (dict, list)):
266 | from .transform import Combiner
267 | combiner = Combiner().load(combiner)
268 |
269 | test = combiner.transform(test, labels = True)
270 | base = combiner.transform(base, labels = True)
271 |
272 | psi = list()
273 | frame = list()
274 |
275 | if isinstance(test, pd.DataFrame):
276 | for col in test:
277 | p, f = _PSI(test[col], base[col])
278 | psi.append(p)
279 | frame.append(f)
280 |
281 | psi = pd.Series(psi, index = test.columns)
282 |
283 | frame = pd.concat(
284 | frame,
285 | keys = test.columns,
286 | names = ['columns', 'id'],
287 | ).reset_index()
288 | frame = frame.drop(columns = 'id')
289 | else:
290 | psi, frame = _PSI(test, base)
291 |
292 |
293 | res = (psi,)
294 |
295 | if return_frame:
296 | res += (frame,)
297 |
298 | return unpack_tuple(res)
299 |
300 |
301 | def matrix(y_pred, y, splits = None):
302 | """confusion matrix of target
303 |
304 | Args:
305 | y_pred (array-like)
306 | y (array-like)
307 | splits (float|list): split points of y_pred
308 |
309 | Returns:
310 |         DataFrame: confusion matrix with true labels in rows and predicted labels in columns
311 |
312 | """
313 | if splits is not None:
314 | y_pred = bin_by_splits(y_pred, splits)
315 |
316 | labels = np.unique(y)
317 | from sklearn.metrics import confusion_matrix
318 | m = confusion_matrix(y, y_pred, labels = labels)
319 |
320 | return pd.DataFrame(
321 | m,
322 | index = pd.Index(labels, name = 'Actual'),
323 | columns = pd.Index(labels, name = 'Predicted'),
324 | )
325 |
--------------------------------------------------------------------------------
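
The PSI computed above is the usual population stability index, PSI = sum((p_test - p_base) * ln(p_test / p_base)) over bins; a common rule of thumb reads < 0.1 as stable and > 0.25 as a major shift. A tiny worked example with illustrative bin proportions:

    import numpy as np

    test = np.array([0.5, 0.3, 0.2])    # proportion per bin (test data)
    base = np.array([0.4, 0.4, 0.2])    # proportion per bin (base data)

    psi = np.sum((test - base) * np.log(test / base))
    print(round(psi, 4))                # 0.0511 -> stable by the rule of thumb
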
/toad/metrics_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .metrics import KS, KS_bucket, F1, PSI, AUC, matrix
6 |
7 | np.random.seed(1)
8 |
9 | feature = np.random.rand(500)
10 | target = np.random.randint(2, size = 500)
11 | base_feature = np.random.rand(500)
12 |
13 | test_df = pd.DataFrame({
14 | 'A': np.random.rand(500),
15 | 'B': np.random.rand(500),
16 | })
17 | base_df = pd.DataFrame({
18 | 'A': np.random.rand(500),
19 | 'B': np.random.rand(500),
20 | })
21 |
22 | FUZZ_THRESHOLD = 1e-10
23 |
24 | def test_KS():
25 | result = KS(feature, target)
26 | assert result == 0.05536775661256989
27 |
28 | def test_KS_bucket():
29 | result = KS_bucket(feature, target)
30 | assert result.loc[4, 'ks'] == -0.028036335090276976
31 |
32 | def test_KS_bucket_use_step():
33 | result = KS_bucket(feature, target, method = 'step', clip_q = 0.01)
34 | assert result.loc[4, 'ks'] == -0.0422147102645028
35 |
36 | def test_KS_bucket_for_all_score():
37 | result = KS_bucket(feature, target, bucket = False)
38 | assert len(result) == 500
39 |
40 | def test_KS_bucket_return_splits():
41 | result, splits = KS_bucket(feature, target, return_splits = True)
42 | assert len(splits) == 9
43 |
44 | def test_KS_bucket_use_split_pointers():
45 | result = KS_bucket(feature, target, bucket = [0.2, 0.6])
46 | assert len(result) == 3
47 |
48 | def test_KS_bucket_with_lift():
49 | result = KS_bucket(feature, target)
50 | assert result.loc[3, 'lift'] == 1.0038610038610036
51 |
52 | def test_KS_bucket_with_cum_lift():
53 | result = KS_bucket(feature, target)
54 | assert result.loc[3, 'cum_lift'] == 1.003861003861004
55 |
56 |
57 | def test_F1():
58 | result, split = F1(feature, target, return_split = True)
59 | assert result == pytest.approx(0.6844207723035951, FUZZ_THRESHOLD)
60 |
61 | def test_F1_split():
62 | result = F1(feature, target, split = 0.5)
63 | assert result == pytest.approx(0.51417004048583, FUZZ_THRESHOLD)
64 |
65 | def test_AUC():
66 | result = AUC(feature, target)
67 | assert result == 0.5038690142424582
68 |
69 | def test_AUC_with_curve():
70 | auc, fpr, tpr, thresholds = AUC(feature, target, return_curve = True)
71 | assert thresholds[200] == 0.15773006987053328
72 |
73 | def test_PSI():
74 | result = PSI(feature, base_feature, combiner = [0.3, 0.5, 0.7])
75 | assert result == 0.018630024627491467
76 |
77 | def test_PSI_frame():
78 | result = PSI(
79 | test_df,
80 | base_df,
81 | combiner = {
82 | 'A': [0.3, 0.5, 0.7],
83 | 'B': [0.4, 0.8],
84 | },
85 | )
86 |
87 | assert result['B'] == pytest.approx(0.014528279995858708, FUZZ_THRESHOLD)
88 |
89 | def test_PSI_return_frame():
90 | result, frame = PSI(
91 | test_df,
92 | base_df,
93 | combiner = {
94 | 'A': [0.3, 0.5, 0.7],
95 | 'B': [0.4, 0.8],
96 | },
97 | return_frame = True,
98 | )
99 |
100 | assert frame.loc[4, 'test'] == 0.38
101 |
102 | def test_matrix():
103 | df = matrix(feature, target, splits = 0.5)
104 | assert df.iloc[0,1] == 133
105 |
--------------------------------------------------------------------------------
/toad/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from .trainer import *
3 |
--------------------------------------------------------------------------------
/toad/nn/functional.py:
--------------------------------------------------------------------------------
1 | from toad.utils.decorator import support_numpy
2 |
3 | def flooding(loss, b):
4 | """flooding loss
5 | """
6 | return (loss - b).abs() + b
7 |
8 |
9 | @support_numpy
10 | def focal_loss(input, target, alpha = 1., gamma = 2., reduction = 'mean'):
11 | """focal loss
12 |
13 | Args:
14 | input (Tensor): N x C, C is the number of classes
15 | target (Tensor): N, each value is the index of classes
16 |         alpha (float): balancing factor of focal loss, in the range [0, 1]
17 |         gamma (float): focusing parameter of focal loss
18 |         reduction (str): `mean`, `sum` or `none`, how to reduce the loss over classes
19 | """
20 | import numpy as np
21 | import torch
22 | import torch.nn.functional as F
23 |
24 | prob = F.sigmoid(input)
25 | weight = torch.pow(1. - prob, gamma)
26 | focal = -alpha * weight * torch.log(prob)
27 | loss = F.nll_loss(focal, target, reduction = reduction)
28 |
29 | return loss
30 |
31 |
32 | @support_numpy
33 | def binary_focal_loss(input, target, **kwargs):
34 | """binary focal loss
35 | """
36 | # convert 1d tensor to 2d
37 | if input.ndim == 1:
38 | import torch
39 | input = input.view(-1, 1)
40 | input = torch.hstack([1 - input, input])
41 |
42 | return focal_loss(input, target, **kwargs)
43 |
44 |
45 | def focal_loss_for_numpy(input, target, alpha = 1., gamma = 2., reduction = 'mean'):
46 | """focal loss for numpy array
47 | """
48 | import numpy as np
49 |
50 | prob = 1 / (1 + np.exp(-input))
51 | weight = np.power(1. - prob, gamma)
52 | focal = -alpha * weight * np.log(prob)
53 | loss = -focal[np.arange(len(focal)), target]
54 |
55 | if reduction == 'mean':
56 | loss = loss.mean()
57 | elif reduction == 'sum':
58 | loss = loss.sum()
59 | elif reduction == 'none':
60 | pass
61 |
62 | return loss
63 |
64 |
65 | def label_smoothing(labels, smoothing = 0.1):
66 | """label smoothing
67 | """
68 | assert len(labels.shape) == 2, "labels must be 2 dim where shape should be (N, C)"
69 |
70 | return (1. - smoothing) * labels + smoothing / labels.shape[1]
71 |
--------------------------------------------------------------------------------
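
Both focal loss implementations above compute FL(p) = -alpha * (1 - p)^gamma * log(p) with p = sigmoid(input), then select the target-class entry, which flips the sign once more (so the reduced loss comes out negative, as the tests below assert). A small NumPy sketch of the same arithmetic on illustrative logits:

    import numpy as np

    logits = np.array([[2.0, -1.0],
                       [0.5,  1.5]])
    target = np.array([0, 1])
    alpha, gamma = 1.0, 2.0

    prob = 1 / (1 + np.exp(-logits))                   # sigmoid
    focal = -alpha * (1 - prob) ** gamma * np.log(prob)
    loss = -focal[np.arange(len(focal)), target]       # pick the target class
    print(loss.mean())                                 # small negative value
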
/toad/nn/functional_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 | from torch.utils.data import Dataset, DataLoader
7 |
8 | from .functional import focal_loss, binary_focal_loss
9 |
10 | DATASET_SIZE = 20000
11 | NUM_CLASSES = 4
12 |
13 |
14 | @pytest.fixture(autouse=True)
15 | def seed():
16 | torch.manual_seed(0)
17 | yield
18 |
19 |
20 | def test_focal_loss(seed):
21 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float)
22 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long)
23 | loss = focal_loss(y_pred, y)
24 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6)
25 |
26 |
27 | def test_loss_with_grad(seed):
28 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float, requires_grad=True)
29 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long)
30 | loss = focal_loss(y_pred, y)
31 | loss.backward()
32 | assert y_pred.grad is not None
33 |
34 |
35 | def test_binary_focal_loss(seed):
36 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float)
37 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long)
38 | loss = binary_focal_loss(y_pred, y)
39 | assert loss.item() == pytest.approx(-0.07776755839586258, 1e-6)
40 |
41 |
42 | def test_numpy_support_focal_loss(seed):
43 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float).numpy()
44 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long).numpy()
45 | loss = focal_loss(y_pred, y)
46 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6)
47 |
48 |
49 | def test_binary_focal_loss_for_xgb(seed):
50 | from toad.utils.decorator import xgb_loss
51 |
52 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float).numpy()
53 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long).numpy()
54 | loss_func = xgb_loss(gamma=5.0, alpha=0.5)(binary_focal_loss)
55 | grad, hess = loss_func(y_pred, y)
56 |
57 | assert grad == pytest.approx(-0.00023283064365386963)
58 | assert hess == pytest.approx(465.66128730773926)
59 |
60 |
61 | # TODO
62 | # focal loss sum/none
63 | # label_smoothing
64 |
--------------------------------------------------------------------------------
/toad/nn/loss.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | import torch
4 | from torch.nn import Module
5 |
6 | from .functional import focal_loss
7 |
8 |
9 | class FocalLoss(Module):
10 | def __init__(self, alpha = 1., gamma = 2., reduction = 'mean'):
11 | super(FocalLoss, self).__init__()
12 |
13 | self.alpha = alpha
14 | self.gamma = gamma
15 | self.reduction = reduction
16 |
17 | def forward(self, input, target):
18 | return focal_loss(
19 | input,
20 | target,
21 | alpha = self.alpha,
22 | gamma = self.gamma,
23 | reduction = self.reduction,
24 | )
25 |
26 |
27 | class DictLoss(Module):
28 | def __init__(self, torch_loss, weights: Dict[str, float] = None):
29 | super(DictLoss, self).__init__()
30 | self.torch_loss = torch_loss
31 | self.weights = weights or {}
32 |
33 | def forward(self, input: Dict[str, torch.Tensor], target: Dict[str, torch.Tensor]):
34 | loss = 0
35 | weight_sum = 0
36 | for key, _target in target.items():
37 | if key not in input:
38 | continue
39 | weight = self.weights.get(key, 1)
40 | mask = torch.bitwise_not(torch.isnan(_target))
41 | _target = _target.to(input[key].device)
42 | loss += weight * self.torch_loss(input[key][mask], _target[mask])
43 | weight_sum += weight
44 |
45 | return loss / weight_sum
46 |
47 |
48 | class ListLoss(Module):
49 | def __init__(self, torch_loss, weights: List[float] = None):
50 | super(ListLoss, self).__init__()
51 | self.torch_loss = torch_loss
52 | self.weights = weights
53 |
54 | def forward(self, input: List[torch.Tensor], target: List[torch.Tensor]):
55 | loss = 0
56 | weight_sum = 0
57 | for i, (_input, _target) in enumerate(zip(input, target)):
58 | if self.weights:
59 | weight = self.weights[i]
60 | else:
61 | weight = 1
62 | _target = _target.to(_input.device)
63 | mask = torch.bitwise_not(torch.isnan(_target))
64 | loss += weight * self.torch_loss(_input[mask], _target[mask])
65 | weight_sum += weight
66 |
67 | return loss / weight_sum
68 |
--------------------------------------------------------------------------------
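
DictLoss above applies a base torch loss per target key, skips keys the model did not output, masks NaN entries in each target, and returns the weighted average. A minimal sketch (tensor values and weights are illustrative):

    import torch
    import torch.nn.functional as F
    from toad.nn.loss import DictLoss

    loss_fn = DictLoss(F.mse_loss, weights={'y1': 2.0, 'y2': 1.0})

    pred = {'y1': torch.tensor([1.0, 2.0]), 'y2': torch.tensor([0.0, 1.0])}
    target = {'y1': torch.tensor([1.0, float('nan')]),   # NaN is masked out
              'y2': torch.tensor([0.5, 1.0])}

    print(loss_fn(pred, target))    # (2*0 + 1*0.125) / 3 = tensor(0.0417)
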
/toad/nn/loss_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | from torch.utils.data import Dataset, DataLoader
5 |
6 | from .module import Module
7 | from .loss import DictLoss, ListLoss
8 |
9 | DATASET_SIZE = 20000
10 | NUM_FEATS = 784
11 | NUM_CLASSES = 2
12 |
13 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype=torch.float)
14 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long)
15 |
16 |
17 | class DictDataset(Dataset):
18 | def __init__(self, x, y):
19 | super().__init__()
20 | self.x = x
21 | self.y = y
22 |
23 | def __len__(self):
24 | return self.x.shape[0]
25 |
26 | def __getitem__(self, item):
27 | return self.x[item], {'y': self.y[item]}
28 |
29 |
30 | class ListDataset(Dataset):
31 | def __init__(self, x, y):
32 | super().__init__()
33 | self.x = x
34 | self.y = y
35 |
36 | def __len__(self):
37 | return self.x.shape[0]
38 |
39 | def __getitem__(self, item):
40 | return self.x[item], [self.y[item]]
41 |
42 |
43 | class TestDictModel(Module):
44 | def __init__(self, in_feats, out_feats):
45 | super().__init__()
46 |
47 | self.linear = nn.Linear(in_feats, out_feats)
48 |
49 | def forward(self, x):
50 | x = self.linear(x)
51 | return {'y': F.relu(x)}
52 |
53 | def fit_step(self, batch, loss=None):
54 | x, y = batch
55 | y_hat = self(x)
56 | return loss(y_hat, y)
57 |
58 |
59 | class TestListModel(Module):
60 | def __init__(self, in_feats, out_feats):
61 | super().__init__()
62 |
63 | self.linear = nn.Linear(in_feats, out_feats)
64 |
65 | def forward(self, x):
66 | x = self.linear(x)
67 | return [F.relu(x)]
68 |
69 | def fit_step(self, batch, loss=None):
70 | x, y = batch
71 | y_hat = self(x)
72 | return loss(y_hat, y)
73 |
74 |
75 | def test_dict_loss():
76 | model = TestDictModel(NUM_FEATS, NUM_CLASSES)
77 | loader = DataLoader(
78 | DictDataset(X, y),
79 | batch_size=128,
80 | shuffle=True,
81 | )
82 | model.fit(loader, epoch=1, loss=DictLoss(F.cross_entropy))
83 |
84 |
85 | def test_list_loss():
86 | model = TestListModel(NUM_FEATS, NUM_CLASSES)
87 | loader = DataLoader(
88 | ListDataset(X, y),
89 | batch_size=128,
90 | shuffle=True,
91 | )
92 | model.fit(loader, epoch=1, loss=ListLoss(F.cross_entropy))
93 |
--------------------------------------------------------------------------------
/toad/nn/module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch import nn, optim
4 | from torch.nn.parallel import DistributedDataParallel
5 |
6 | from .trainer.history import get_current_history
7 | from ..utils.progress import Progress
8 |
9 |
10 |
11 | class Module(nn.Module):
12 | """base module for every model
13 |
14 | Examples:
15 | >>> from toad.nn import Module
16 | ... from torch import nn
17 | ...
18 | ... class Net(Module):
19 | ... def __init__(self, inputs, hidden, outputs):
20 | ... super().__init__()
21 | ... self.model = nn.Sequential(
22 | ... nn.Linear(inputs, hidden),
23 | ... nn.ReLU(),
24 | ... nn.Linear(hidden, outputs),
25 | ... nn.Sigmoid(),
26 | ... )
27 | ...
28 | ... def forward(self, x):
29 | ... return self.model(x)
30 | ...
31 | ... def fit_step(self, batch):
32 | ... x, y = batch
33 | ... y_hat = self(x)
34 | ...
35 | ... # log into history
36 | ... self.log('y', y)
37 | ... self.log('y_hat', y_hat)
38 | ...
39 | ... return nn.functional.mse_loss(y_hat, y)
40 | ...
41 | ... model = Net(10, 4, 1)
42 | ...
43 | ... model.fit(train_loader)
44 |
45 | """
46 | def __init__(self):
47 | """define model struct
48 | """
49 | super().__init__()
50 |
51 |
52 | @property
53 | def device(self):
54 | """device of model
55 | """
56 | return next(self.parameters()).device
57 |
58 |
59 | def fit(self, loader, trainer = None, optimizer = None, loss = None, early_stopping = None, **kwargs):
60 | """train model
61 |
62 | Args:
63 | loader (DataLoader): loader for training model
64 | trainer (Trainer): trainer for training model
65 |             optimizer (torch.optim.Optimizer): the default optimizer is `Adam(lr = 1e-3)`
66 | loss (Callable): could be called as 'loss(y_hat, y)'
67 |             early_stopping (earlystopping): the default value is `loss_stopping`,
68 | you can set it to `False` to disable early stopping
69 | epoch (int): number of epoch for training loop
70 | callback (callable): callable function will be called every epoch
71 | """
72 | if trainer is None:
73 | from .trainer import Trainer
74 | trainer = Trainer(self, loader, optimizer = optimizer, loss = loss, early_stopping = early_stopping)
75 |
76 | trainer.train(**kwargs)
77 |
78 |
79 | def evaluate(self, loader, trainer = None):
80 | """evaluate model
81 |
82 | Args:
83 | loader (DataLoader): loader for evaluate model
84 | trainer (Trainer): trainer for evaluate model
85 | """
86 | if trainer is None:
87 | from .trainer import Trainer
88 | trainer = Trainer(self)
89 |
90 | return trainer.evaluate(loader)
91 |
92 |
93 |
94 | def fit_step(self, batch, loss = None, *args, **kwargs):
95 | """step for fitting
96 |
97 | Args:
98 | batch (Any): batch data from dataloader
99 | loss (Callable): could be called as 'loss(y_hat, y)'
100 |
101 | Returns:
102 | Tensor: loss of this step
103 | """
104 | x, y = batch
105 | y_hat = self.__call__(x)
106 | if loss is None:
107 | loss = nn.functional.mse_loss
108 | return loss(y_hat, y)
109 |
110 |
111 | def save(self, path):
112 | """save model
113 | """
114 | torch.save(self.state_dict(), path)
115 |
116 |
117 | def load(self, path):
118 | """load model
119 | """
120 | state = torch.load(path)
121 | self.load_state_dict(state)
122 |
123 |
124 | def log(self, key, value):
125 | """log values to history
126 |
127 | Args:
128 | key (str): name of message
129 | value (Tensor): tensor of values
130 | """
131 | history = get_current_history()
132 | if history is None:
133 | return
134 |
135 | return history.log(key, value)
136 |
137 |
138 | def distributed(self, backend = None, **kwargs):
139 | """get distributed model
140 | """
141 | if not torch.distributed.is_initialized():
142 | if backend is None:
143 | # choose a backend
144 | backend = 'nccl' if torch.distributed.is_nccl_available() else 'gloo'
145 |
146 | torch.distributed.init_process_group(backend, **kwargs)
147 |
148 | return DistModule(self)
149 |
150 |
151 |
152 | class DistModule(DistributedDataParallel):
153 | """distributed module class
154 | """
155 | def fit(self, *args, **kwargs):
156 | return self.module.fit(*args, **kwargs)
157 |
158 | def save(self, *args, **kwargs):
159 | return self.module.save(*args, **kwargs)
160 |
161 | def load(self, *args, **kwargs):
162 | return self.module.load(*args, **kwargs)
163 |
164 | def log(self, *args, **kwargs):
165 | return self.module.log(*args, **kwargs)
166 |
167 |
168 |
--------------------------------------------------------------------------------
/toad/nn/module_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | from torch.utils.data import TensorDataset, DataLoader
5 |
6 | from .module import Module
7 |
8 | DATASET_SIZE = 20000
9 | NUM_FEATS = 784
10 | NUM_CLASSES = 2
11 |
12 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float)
13 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long)
14 |
15 | loader = DataLoader(
16 | TensorDataset(X, y),
17 | batch_size = 128,
18 | shuffle = True,
19 | )
20 |
21 | class TestModel(Module):
22 | def __init__(self, in_feats, out_feats):
23 | super().__init__()
24 |
25 | self.linear = nn.Linear(in_feats, out_feats)
26 |
27 | def forward(self, x):
28 | x = self.linear(x)
29 | return F.relu(x)
30 |
31 | def fit_step(self, batch):
32 | x, y = batch
33 | y_hat = self(x)
34 | return F.cross_entropy(y_hat, y)
35 |
36 | def test_model():
37 | model = TestModel(NUM_FEATS, NUM_CLASSES)
38 | model.fit(loader, epoch = 1)
39 |
40 |
41 | def test_fit_callback():
42 | h_list = []
43 |
44 | def func(history, epoch):
45 | h_list.append(history)
46 |
47 | model = TestModel(NUM_FEATS, NUM_CLASSES)
48 | model.fit(loader, epoch = 2, callback = func)
49 | assert len(h_list) == 2
50 |
51 |
52 | class TestModel2(TestModel):
53 | def fit_step(self, batch, loss=None):
54 | x, y = batch
55 | y_hat = self(x)
56 | return loss(y_hat, y)
57 |
58 |
59 | def test_model_loss():
60 | model = TestModel2(NUM_FEATS, NUM_CLASSES)
61 | model.fit(loader, epoch=1, loss=F.cross_entropy)
62 |
--------------------------------------------------------------------------------
/toad/nn/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .history import History, get_current_history
2 | from .callback import callback
3 | from .earlystop import earlystopping
4 | from .trainer import Trainer
5 |
6 | __all__ = [
7 | 'History',
8 | 'get_current_history',
9 | 'callback',
10 | 'earlystopping',
11 | 'Trainer',
12 | ]
13 |
--------------------------------------------------------------------------------
/toad/nn/trainer/callback.py:
--------------------------------------------------------------------------------
1 | from ...utils.decorator import Decorator
2 |
3 | class callback(Decorator):
4 | """callback for trainer
5 |
6 | Examples:
7 | >>> @callback
8 | ... def savemodel(model):
9 | ... model.save("path_to_file")
10 | ...
11 | ... trainer.train(model, callback = savemodel)
12 |
13 | """
14 | def __init__(self, *args, **kwargs):
15 | if hasattr(self, 'wrapped'):
16 | # use `wrapped` func as core func
17 | super().__init__(getattr(self, 'wrapped'))
18 | # setup configuration
19 | self.setup(*args, **kwargs)
20 | return
21 |
22 | # init normal decorator
23 | super().__init__(*args, **kwargs)
24 |
25 |
26 | def setup_func(self, func):
27 | import inspect
28 | self._params = inspect.signature(func).parameters
29 |
30 | return func
31 |
32 |
33 | def wrapper(self, **kwargs):
34 | params = {k: v for k ,v in kwargs.items() if k in self._params.keys()}
35 |
36 | return self.call(**params)
37 |
38 |
39 |
40 | class checkpoint(callback):
41 | """
42 | Args:
43 |         dirpath (string): directory for saving checkpoints
44 |         every (int): save a checkpoint every n epochs
45 |         filename (string): checkpoint file name format
46 | """
47 | dirpath = "model_checkpoints"
48 | every = 1
49 | filename = "{name}-{epoch}.pt"
50 |
51 |
52 | def wrapper(self, **kwargs):
53 | model = kwargs.get("model")
54 | epoch = kwargs.get("epoch")
55 |
56 | name = type(model).__name__
57 |
58 | from pathlib import Path
59 | dirpath = Path(self.dirpath)
60 | dirpath.mkdir(parents = True, exist_ok = True)
61 |
62 | filename = self.filename.format(
63 | name = name,
64 | epoch = epoch,
65 | )
66 |
67 | path = dirpath / filename
68 |
69 | if epoch % self.every == 0:
70 | super().wrapper(
71 | path = path,
72 | **kwargs
73 | )
74 |
75 |
76 | class savemodel(checkpoint):
77 | """
78 | Args:
79 |         dirpath (string): directory for saving checkpoints
80 |         every (int): save every n epochs
81 |         filename (string): checkpoint file name format, default is `{name}-{epoch}.pt`
82 | """
83 | def wrapped(self, model, path):
84 | import torch
85 | torch.save(model.state_dict(), path)
86 |
--------------------------------------------------------------------------------
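
`checkpoint` fires every `self.every` epochs and renders the file name from the model class and epoch before delegating to the wrapped function; `savemodel` plugs in a `state_dict` dump. A minimal sketch following the usage in callback_test.py (directory and interval are illustrative):

    import torch
    from toad.nn.trainer.callback import savemodel

    model = torch.nn.Linear(4, 1)

    # save into ./ckpts as 'Linear-{epoch}.pt', every 2 epochs
    hook = savemodel(dirpath='ckpts', every=2)

    for epoch in range(1, 5):
        hook(model=model, epoch=epoch)   # writes on epochs 2 and 4
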
/toad/nn/trainer/callback_test.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from .callback import callback, savemodel
4 | from ..module import Module
5 |
6 | class TestModel(Module):
7 | def __init__(self, in_feats, out_feats):
8 | super().__init__()
9 |
10 | self.linear = nn.Linear(in_feats, out_feats)
11 |
12 |
13 | def test_callback():
14 | @callback
15 | def hook(history, trainer):
16 | return history['a']
17 |
18 | res = hook(epoch = 1, trainer = None, history = {"a": 3})
19 |
20 | assert res == 3
21 |
22 | def test_checkpoint():
23 | model = TestModel(10, 2)
24 | hook = savemodel(dirpath = '/dev', filename = "null")
25 | hook(model = model, epoch = 1)
26 |
--------------------------------------------------------------------------------
/toad/nn/trainer/earlystop.py:
--------------------------------------------------------------------------------
1 | from .callback import callback
2 | from ...utils.decorator import Decorator
3 |
4 |
5 | class earlystopping(callback):
6 | """
7 | Examples:
8 | >>> @earlystopping(delta = 1e-3, patience = 5)
9 | ... def auc(history):
10 | ... return AUC(history['y_hat'], history['y'])
11 | """
12 | delta = -1e-3
13 | patience = 10
14 | skip = 0
15 |
16 | def setup(self, delta = -1e-3, patience = 10, skip = 0):
17 | """
18 | Args:
19 |             delta (float): minimum improvement of the score to reset patience; negative means lower scores are better (e.g. loss), positive means higher is better (e.g. AUC)
20 |             patience (int): number of rounds without improvement before training stops
21 | skip (int): n rounds from starting training to warm up
22 | """
23 | self.direction = 1.0 if delta > 0 else -1.0
24 | self.delta = delta * self.direction
25 | self.patience = patience
26 | self.skip = skip
27 |
28 | self.reset()
29 |
30 |
31 | def get_best_state(self):
32 | """get best state of model
33 | """
34 | return self.best_state
35 |
36 |
37 | def reset(self):
38 | """
39 | """
40 | self.best_score = float('inf') * (-self.direction)
41 | self.best_state = None
42 | self._times = 0
43 |
44 |
45 | def wrapper(self, model, trainer = None, epoch = 0, **kwargs):
46 | # set skip round
47 | if epoch < self.skip:
48 | return False
49 |
50 | score = super().wrapper(model = model, epoch = epoch, **kwargs)
51 | diff = (score - self.best_score) * self.direction
52 |
53 | if diff > self.delta:
54 | self.best_state = model.state_dict()
55 | self.best_score = score
56 | self._times = 0
57 | return False
58 |
59 | self._times += 1
60 | if self._times >= self.patience:
61 | model.load_state_dict(self.best_state)
62 |
63 | if trainer:
64 | trainer.terminate()
65 |
66 | return True
67 |
68 |
69 | class loss_stopping(earlystopping):
70 |     """early stopping scored by mean training loss
71 | """
72 | def wrapped(self, history):
73 | return history['loss'].mean()
74 |
--------------------------------------------------------------------------------
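
Note that the sign of `delta` picks the optimisation direction in the class above: a negative delta treats lower scores as better (the loss_stopping default), a positive delta treats higher scores as better. A small sketch following the style of earlystop_test.py (scores are illustrative):

    import torch
    from toad.nn.trainer.earlystop import earlystopping

    model = torch.nn.Linear(4, 1)

    @earlystopping(delta=1e-3, patience=2)   # higher is better, e.g. AUC
    def scoring(history):
        return history['auc']

    for epoch, auc in enumerate([0.70, 0.72, 0.72, 0.72]):
        if scoring(model=model, history={'auc': auc}, epoch=epoch):
            print('stopped at epoch', epoch)   # restores best state, epoch 3
            break
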
/toad/nn/trainer/earlystop_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .earlystop import earlystopping
3 |
4 |
5 |
6 | def test_earlystopping():
7 | model = torch.nn.Linear(10, 10)
8 |
9 | @earlystopping(delta = -1, patience = 3)
10 | def scoring(history):
11 | return history['loss']
12 |
13 | rounds = []
14 | for i in range(10):
15 | if scoring(model = model, history = {"loss": 1}):
16 | break
17 |
18 | rounds.append(i)
19 |
20 | assert len(rounds) == 3
21 |
22 |
23 | def test_best_state():
24 | model = torch.nn.Linear(10, 1)
25 |
26 | @earlystopping(delta = -1, patience = 1)
27 | def scoring(history):
28 | return history['loss']
29 |
30 | with torch.no_grad():
31 | model.weight.fill_(1.)
32 |
33 | # save init weight
34 | scoring(model = model, history = {"loss": 10})
35 | assert scoring.best_state["weight"].sum().item() == 10
36 |
37 | # change weight
38 | with torch.no_grad():
39 | model.weight.fill_(0.)
40 |
41 | # save best weight
42 | scoring(model = model, history = {"loss": 5})
43 | assert scoring.best_state["weight"].sum().item() == 0
44 |
45 |
--------------------------------------------------------------------------------
/toad/nn/trainer/event.py:
--------------------------------------------------------------------------------
1 | from .callback import callback as Callback
2 |
3 |
4 | class Event:
5 | def __init__(self):
6 | self._events = {}
7 |
8 | def register(self, event, handler, every = 1):
9 | """register events handler
10 | """
11 | if not isinstance(handler, Callback):
12 | handler = Callback(handler)
13 |
14 | if event not in self._events:
15 | self._events[event] = []
16 |
17 | handler._event_count = 0
18 | handler._event_every = every
19 |
20 | self._events[event].append(handler)
21 |
22 |
23 | def on(self, event, **kwargs):
24 | def wrapper(handler):
25 | self.register(event, handler, **kwargs)
26 | return handler
27 |
28 | return wrapper
29 |
30 |
31 | def emit(self, event, *args, **kwargs):
32 | """emit event
33 | """
34 | if event not in self._events:
35 | return
36 |
37 | # trigger handler
38 | for handler in self._events[event]:
39 | # increase count
40 | handler._event_count += 1
41 |
42 | # trigger event
43 | if handler._event_count % handler._event_every == 0:
44 | handler(*args, **kwargs)
45 |
46 |
47 | def mute(self, event):
48 | """remove events handlers
49 | """
50 | if event in self._events:
51 | handlers = self._events.pop(event)
52 |
--------------------------------------------------------------------------------
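
`mute` simply drops every handler registered for an event. A small sketch of register/emit/mute working together (event name and handler are invented):

    from toad.nn.trainer.event import Event

    e = Event()
    fired = []

    @e.on('epoch:end')
    def bump():
        fired.append(1)

    e.emit('epoch:end')   # handler fires
    e.mute('epoch:end')   # all 'epoch:end' handlers removed
    e.emit('epoch:end')   # silently ignored now

    assert len(fired) == 1
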
/toad/nn/trainer/event_test.py:
--------------------------------------------------------------------------------
1 | from .event import Event
2 |
3 |
4 | def test_event_trigger():
5 | e = Event()
6 |
7 | counts = 0
8 |
9 | @e.on("test:trigger")
10 | def func():
11 | nonlocal counts
12 | counts += 1
13 |
14 | e.emit("test:trigger")
15 |
16 | assert counts == 1
17 |
18 |
19 | def test_event_trigger_every():
20 | e = Event()
21 |
22 | counts = 0
23 |
24 | @e.on("test:trigger", every = 2)
25 | def func():
26 | nonlocal counts
27 | counts += 1
28 |
29 | for i in range(10):
30 | e.emit("test:trigger")
31 |
32 | assert counts == 5
33 |
--------------------------------------------------------------------------------
/toad/nn/trainer/history.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | _history_stack = [None]
6 |
7 |
8 | def get_current_history():
9 | global _history_stack
10 |
11 | return _history_stack[-1]
12 |
13 |
14 |
15 | class History:
16 | """model history
17 | """
18 | def __init__(self):
19 | self._store = {}
20 |
21 |
22 | def __getitem__(self, key):
23 | return self._store[key]
24 |
25 |
26 | def __setitem__(self, key, value):
27 | return self.log(key, value)
28 |
29 |
30 | def _push(self, key, value):
31 | """push value into history
32 |
33 | Args:
34 | key (str): key of history
35 | value (np.ndarray): an array of values
36 | """
37 | if key not in self._store:
38 | self._store[key] = value
39 | return
40 |
41 | self._store[key] = np.concatenate([
42 | self._store[key],
43 | value,
44 | ])
45 |
46 |
47 | def log(self, key, value):
48 | """log message to history
49 |
50 | Args:
51 | key (str): name of message
52 | value (Tensor): tensor of values
53 | """
54 | if isinstance(value, torch.Tensor):
55 | value = value.detach().cpu().numpy()
56 |
57 |         # wrap python scalars first, so the `ndim` check below won't fail on them
58 |         if np.isscalar(value):
59 |             value = np.array([value])
60 |
61 |         if not isinstance(value, np.ndarray):
62 |             raise TypeError("value should be `torch.Tensor`, `np.ndarray` or scalar")
63 |
64 |         if value.ndim == 0:  # flatten 0-dim (scalar) tensors
65 |             value = value.reshape(-1)
66 |
67 | self._push(key, value)
68 |
69 |
70 | def start(self):
71 | global _history_stack
72 | _history_stack.append(self)
73 |
74 | return self
75 |
76 |
77 | def end(self):
78 | global _history_stack
79 | return _history_stack.pop()
80 |
81 |
82 | def __enter__(self):
83 | return self.start()
84 |
85 |
86 | def __exit__(self, exc_type, exc_val, exc_tb):
87 | return self.end()
88 |
--------------------------------------------------------------------------------
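
Because `start`/`end` push and pop on the module-level `_history_stack`, `get_current_history` is scope-aware and `with` blocks can nest. A sketch (the tensor values are made up):

    import torch
    from toad.nn.trainer.history import History, get_current_history

    outer, inner = History(), History()

    with outer:
        get_current_history().log('loss', torch.tensor(0.5))
        with inner:
            # the innermost history is "current" here
            get_current_history().log('loss', torch.tensor(0.1))
        # leaving the inner block pops it off the stack
        get_current_history().log('loss', torch.tensor(0.4))

    assert len(outer['loss']) == 2 and len(inner['loss']) == 1
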
/toad/nn/trainer/history_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from .history import History, get_current_history
4 |
5 | def test_history_log():
6 | history = History()
7 |
8 | for i in range(10):
9 | history.log('tensor', torch.rand(3, 5))
10 |
11 | assert history['tensor'].shape == (30, 5)
12 |
13 |
14 | def test_current_history():
15 | history = History()
16 |
17 | with history:
18 | h = get_current_history()
19 | h.log('tensor', torch.rand(3, 5))
20 |
21 | assert history['tensor'].shape == (3, 5)
22 |
--------------------------------------------------------------------------------
/toad/nn/trainer/metrics.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/nn/trainer/metrics.py
--------------------------------------------------------------------------------
/toad/nn/trainer/trainer_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import torch
4 | from torch import nn
5 | import torch.nn.functional as F
6 | from torch.utils.data import TensorDataset, DataLoader
7 |
8 | from .history import History
9 | from ..module import Module
10 | from .trainer import Trainer
11 | from .callback import callback
12 | from .earlystop import earlystopping
13 |
14 |
15 | DATASET_SIZE = 20000
16 | NUM_FEATS = 784
17 | NUM_CLASSES = 2
18 |
19 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float)
20 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long)
21 |
22 | loader = DataLoader(
23 | TensorDataset(X, y),
24 | batch_size = 128,
25 | shuffle = True,
26 | )
27 |
28 | class TestModel(Module):
29 | def __init__(self, in_feats, out_feats):
30 | super().__init__()
31 |
32 | self.linear = nn.Linear(in_feats, out_feats)
33 |
34 | def forward(self, x):
35 | x = self.linear(x)
36 | return F.relu(x)
37 |
38 | def fit_step(self, batch):
39 | x, y = batch
40 | y_hat = self(x)
41 | return F.cross_entropy(y_hat, y)
42 |
43 |
44 | def test_trainer():
45 | model = TestModel(NUM_FEATS, NUM_CLASSES)
46 | trainer = Trainer(model, loader)
47 | trainer.train(epoch = 2)
48 | assert len(trainer.history) == 2
49 |
50 |
51 | def test_trainer_early_stopping():
52 | model = TestModel(NUM_FEATS, NUM_CLASSES)
53 |
54 | @earlystopping(delta = -1.0, patience = 3)
55 | def scoring(history):
56 | return history['loss'].mean()
57 |
58 | trainer = Trainer(model, loader, early_stopping = scoring)
59 | trainer.train(epoch = 200)
60 | assert len(trainer.history) == 4
61 |
62 |
63 | def test_trainer_fit_step():
64 | model = TestModel(NUM_FEATS, NUM_CLASSES)
65 | trainer = Trainer(model, loader)
66 | step_count = 0
67 |
68 | @trainer.fit_step
69 | def step(model, batch):
70 | x, y = batch
71 | y_hat = model(x)
72 | nonlocal step_count
73 | step_count += 1
74 | return F.cross_entropy(y_hat, y)
75 |
76 | trainer.train(epoch = 2)
77 | assert step_count > 1
78 |
79 |
80 | def test_multi_callbacks():
81 | log = {}
82 |
83 | @callback
84 | def log_epoch(epoch):
85 | log['epoch'] = epoch
86 |
87 | @callback
88 | def log_loss(history):
89 | log['loss'] = history['loss']
90 |
91 | model = TestModel(NUM_FEATS, NUM_CLASSES)
92 | trainer = Trainer(model)
93 | trainer.train(loader, epoch = 2, callback = [log_epoch, log_loss])
94 |
95 | assert log['epoch'] == 1
96 | assert len(log['loss']) == 157
97 |
98 |
99 | def test_trainer_evaluate():
100 | model = TestModel(NUM_FEATS, NUM_CLASSES)
101 | trainer = Trainer(model, loader)
102 |
103 | @trainer.fit_step
104 | def step(model, batch):
105 | x, y = batch
106 | y_hat = model(x)
107 | return F.cross_entropy(y_hat, y)
108 |
109 | history = trainer.evaluate(loader)
110 |
111 | assert len(history["loss"]) == 157
112 |
113 |
114 |
115 | class TestModel2(TestModel):
116 | def fit_step(self, batch, loss=None):
117 | x, y = batch
118 | y_hat = self(x)
119 | return loss(y_hat, y)
120 |
121 |
122 | def test_trainer_loss():
123 | model = TestModel2(NUM_FEATS, NUM_CLASSES)
124 | trainer = Trainer(model, loader, loss = F.cross_entropy)
125 | trainer.train(epoch = 2)
126 | assert len(trainer.history) == 2
127 |
128 |
129 | # def test_trainer_distributed():
130 | # model = TestModel(NUM_FEATS, NUM_CLASSES)
131 | # trainer = Trainer(model, loader)
132 | # trainer.distributed(workers = 2)
133 | # trainer.train(epoch = 5)
134 |
135 |
136 |
137 | ### distributed model test
138 | # from toad.nn.trainer.trainer import Trainer
139 | # from torchvision.transforms import ToTensor
140 | # import torch
141 | # from torch import nn
142 | # from torchvision import datasets
143 | # from toad.nn import Module
144 | # from torch.utils.data import DataLoader
145 | # import ray
146 |
147 | # class NeuralNetwork(Module):
148 | # def __init__(self):
149 | # super(NeuralNetwork, self).__init__()
150 | # self.flatten = nn.Flatten()
151 | # self.linear_relu_stack = nn.Sequential(
152 | # nn.Linear(28 * 28, 512),
153 | # nn.ReLU(),
154 | # nn.Linear(512, 512),
155 | # nn.ReLU(),
156 | # nn.Linear(512, 10),
157 | # nn.ReLU(),
158 | # )
159 | # def forward(self, x):
160 | # x = self.flatten(x)
161 | # logits = self.linear_relu_stack(x)
162 | # return logits
163 | # def fit_step(self, batch):
164 | # X, y = batch
165 | # pred =self(X)
166 | # loss_fn=nn.CrossEntropyLoss()
167 | # return loss_fn(pred, y)
168 |
169 |
170 | # @pytest.mark.skip("distributed trainer skip")
171 | # def test_distribute_example():
172 | # training_data = datasets.FashionMNIST(
173 | # root="~/data",
174 | # train=True,
175 | # download=True,
176 | # transform=ToTensor(),
177 | # )
178 | # # Download test data from open datasets.
179 | # test_data = datasets.FashionMNIST(
180 | # root="~/data",
181 | # train=False,
182 | # download=True,
183 | # transform=ToTensor(),
184 | # )
185 | # worker_batch_size = 64 // 4
186 | # # Create data loaders.
187 | # train_dataloader = DataLoader(training_data, batch_size=16)
188 | # test_dataloader = DataLoader(test_data, batch_size=16)
189 | # model=NeuralNetwork()
190 | # optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
191 | # trainer=Trainer(model,train_dataloader,optimizer)
192 | # trainer.distributed(address="ray://172.20.144.21:10001",num_works=4,use_gpu=False)
193 | # trainer.train(epoch=1)
194 | # trainer.evaluate(test_dataloader)
195 |
--------------------------------------------------------------------------------
/toad/nn/zoo/__init__.py:
--------------------------------------------------------------------------------
1 | from .autoencoder import BaseAutoEncoder, VAE
2 |
--------------------------------------------------------------------------------
/toad/nn/zoo/autoencoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.functional import relu
4 |
5 | from ..module import Module
6 |
7 |
8 |
9 | class BaseAutoEncoder(Module):
10 | def __init__(self, input, hidden, zipped):
11 | super().__init__()
12 |
13 | self.encoder = nn.Sequential(
14 | nn.Linear(input, hidden),
15 | nn.ReLU(),
16 | nn.Linear(hidden, zipped),
17 | )
18 |
19 | self.decoder = nn.Sequential(
20 | nn.Linear(zipped, hidden),
21 | nn.ReLU(),
22 | nn.Linear(hidden, input),
23 | )
24 |
25 | self.loss = nn.MSELoss()
26 |
27 |
28 | def encode(self, x):
29 | return self.encoder(x)
30 |
31 | def decode(self, x):
32 | return self.decoder(x)
33 |
34 | def forward(self, x):
35 | z = self.encode(x)
36 | return self.decode(z)
37 |
38 | def fit_step(self, x):
39 | return self.loss(self(x), x)
40 |
41 |
42 |
43 | class VAE(Module):
44 | def __init__(self, input, hidden, zipped):
45 | super().__init__()
46 |
47 | self.hidden_layer = nn.Linear(input, hidden)
48 |
49 | self.mu_layer = nn.Linear(hidden, zipped)
50 | self.var_layer = nn.Linear(hidden, zipped)
51 |
52 | self.decoder = nn.Sequential(
53 | nn.Linear(zipped, hidden),
54 | nn.ReLU(),
55 | nn.Linear(hidden, input),
56 | )
57 |
58 | self.loss = nn.MSELoss()
59 |
60 | def encode(self, x):
61 | h = relu(self.hidden_layer(x))
62 | mu = self.mu_layer(h)
63 | var = self.var_layer(h)
64 |
65 | std = torch.exp(var / 2)
66 |         eps = torch.randn_like(std)  # standard normal noise for the reparameterization trick
67 |
68 | z = mu + eps * std
69 | return z, mu, var
70 |
71 | def decode(self, x):
72 | return self.decoder(x)
73 |
74 | def forward(self, x):
75 | z, mu, var = self.encode(x)
76 | x_hat = self.decode(z)
77 | return x_hat, mu, var
78 |
79 | def fit_step(self, x):
80 | x_hat, mu, var = self(x)
81 |         recon = self.loss(x_hat, x)
82 |         # KL divergence to the unit Gaussian prior, with `var` holding the log-variance
83 |         kld = -0.5 * torch.sum(1 + var - torch.pow(mu, 2) - torch.exp(var))
84 |         loss = recon + kld
85 | return loss
86 |
--------------------------------------------------------------------------------
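
For reference, `fit_step` above is the usual negative ELBO with `var` holding the log-variance: the KL term against a unit Gaussian prior has the closed form

    D_{KL}\left(\mathcal{N}(\mu, \sigma^2) \,\|\, \mathcal{N}(0, 1)\right) = -\tfrac{1}{2} \sum \left(1 + \log\sigma^2 - \mu^2 - \sigma^2\right)

which is exactly `-0.5 * torch.sum(1 + var - mu**2 - exp(var))` once `var = log(sigma^2)`, and is also why `encode` computes `std = exp(var / 2)`.
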
/toad/nn/zoo/autoencoder_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import pytest
4 | import numpy as np
5 | from torch.utils.data import TensorDataset, DataLoader
6 |
7 | from .autoencoder import BaseAutoEncoder, VAE
8 |
9 | # skip testing with python 3.9 on linux
10 | if sys.version_info >= (3, 9) and sys.platform.startswith('linux'):
11 | pytest.skip("failed with python 3.9 on linux, need fix!", allow_module_level = True)
12 |
13 |
14 | X = torch.Tensor(np.random.rand(20000, 784))
15 |
16 | loader = DataLoader(
17 | X,
18 | batch_size = 128,
19 | shuffle = True,
20 | )
21 |
22 | def test_ae():
23 | ae = BaseAutoEncoder(784, 200, 10)
24 | ae.fit(loader, epoch = 1)
25 |
26 | def test_vae():
27 | vae = VAE(784, 200, 10)
28 | vae.fit(loader, epoch = 1)
29 |
--------------------------------------------------------------------------------
/toad/plot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from sklearn.metrics import roc_curve
5 |
6 | from .stats import IV, feature_bin_stats
7 | from .metrics import AUC
8 | from .tadpole import tadpole
9 | from .tadpole.utils import HEATMAP_CMAP, MAX_STYLE, add_annotate, add_text, reset_ylim
10 | from .utils import unpack_tuple, generate_str
11 |
12 | def badrate_plot(frame, x = None, target = 'target', by = None,
13 | freq = None, format = None, return_counts = False,
14 | return_proportion = False, return_frame = False):
15 | """plot for badrate
16 |
17 | Args:
18 | frame (DataFrame)
19 | x (str): column in frame that will be used as x axis
20 | target (str): target column in frame
21 | by (str): column in frame that will be calculated badrate by it
22 | freq (str): offset aliases string by pandas
23 | http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
24 | format (str): format string for time
25 |         return_counts (bool): whether to also return the counts plot
26 |         return_proportion (bool): whether to also return the proportion plot
27 |         return_frame (bool): whether to also return the grouping detail frame
28 | Returns:
29 | Axes: badrate plot
30 | Axes: counts plot
31 | Axes: proportion plot
32 | Dataframe: grouping detail data
33 | """
34 | frame = frame.copy()
35 | markers = True
36 |
37 | if not isinstance(target, str):
38 | temp_name = generate_str()
39 | frame[temp_name] = target
40 | target = temp_name
41 |
42 | grouper = x
43 | if freq is not None:
44 | frame.loc[:, x] = pd.to_datetime(frame[x], format = format)
45 | grouper = pd.Grouper(key = x, freq = freq)
46 |
47 | if by is not None:
48 | grouper = [by, grouper]
49 |
50 | styles_count = frame[by].nunique()
51 | if styles_count > MAX_STYLE:
52 | markers = ['o'] * styles_count
53 |
54 | group = frame.groupby(grouper)
55 | table = group[target].agg(['sum', 'count']).reset_index()
56 | table['badrate'] = table['sum'] / table['count']
57 |
58 | # set number dtype to object
59 | if np.issubdtype(table[x].dtype, np.number):
60 | table[x] = table[x].astype(str)
61 |
62 |
63 | rate_plot = tadpole.lineplot(
64 | x = x,
65 | y = 'badrate',
66 | hue = by,
67 | style = by,
68 | data = table,
69 | legend = 'full',
70 | markers = markers,
71 | dashes = False,
72 | )
73 |
74 | # set y axis start with 0
75 | rate_plot.set_ylim(0, None)
76 |
77 | res = (rate_plot,)
78 |
79 | if return_counts:
80 | count_plot = tadpole.barplot(
81 | x = x,
82 | y = 'count',
83 | hue = by,
84 | data = table,
85 | )
86 | res += (count_plot,)
87 |
88 |
89 | if return_proportion:
90 | table['prop'] = 0
91 | for v in table[x].unique():
92 | mask = (table[x] == v)
93 | table.loc[mask, 'prop'] = table[mask]['count'] / table[mask]['count'].sum()
94 |
95 | prop_plot = tadpole.barplot(
96 | x = x,
97 | y = 'prop',
98 | hue = by,
99 | data = table,
100 | )
101 | res += (prop_plot,)
102 |
103 |
104 | if return_frame:
105 | res += (table,)
106 |
107 | return unpack_tuple(res)
108 |
109 |
110 | def corr_plot(frame, figure_size = (20, 15), ax = None):
111 | """plot for correlation
112 |
113 | Args:
114 | frame (DataFrame): frame to draw plot
115 | Returns:
116 | Axes
117 | """
118 | corr = frame.corr()
119 |
120 | mask = np.zeros_like(corr, dtype = bool)
121 | mask[np.triu_indices_from(mask)] = True
122 |
123 | map_plot = tadpole.heatmap(
124 | corr,
125 | mask = mask,
126 | cmap = HEATMAP_CMAP,
127 | vmax = 1,
128 | vmin = -1,
129 | center = 0,
130 | square = True,
131 | cbar_kws = {"shrink": .5},
132 | linewidths = .5,
133 | annot = True,
134 | fmt = '.2f',
135 | figure_size = figure_size,
136 | ax = ax,
137 | )
138 |
139 | return map_plot
140 |
141 |
142 | def proportion_plot(x = None, keys = None, ax = None):
143 | """plot for comparing proportion in different dataset
144 |
145 | Args:
146 | x (Series|list): series or list of series data for plot
147 | keys (str|list): keys for each data
148 |
149 | Returns:
150 | Axes
151 | """
152 | if not isinstance(x, list):
153 | x = [x]
154 |
155 | if keys is None:
156 | keys = [
157 | x[ix].name
158 | if hasattr(x[ix], 'name') and x[ix].name is not None
159 | else ix
160 | for ix in range(len(x))
161 | ]
162 | elif isinstance(keys, str):
163 | keys = [keys]
164 |
165 | x = map(pd.Series, x)
166 | data = pd.concat(x, keys = keys, names = ['keys']).reset_index()
167 | data = data.rename(columns = {data.columns[2]: 'value'})
168 |
169 | prop_data = data.groupby('keys')['value'].value_counts(
170 | normalize = True,
171 | dropna = False,
172 | ).rename('proportion').reset_index()
173 |
174 | prop_plot = tadpole.barplot(
175 | x = 'value',
176 | y = 'proportion',
177 | hue = 'keys',
178 | data = prop_data,
179 | ax = ax,
180 | )
181 |
182 | return prop_plot
183 |
184 |
185 | def roc_plot(score, target, compare = None, figsize = (14, 10), ax = None):
186 | """plot for roc
187 |
188 | Args:
189 | score (array-like): predicted score
190 | target (array-like): true target
191 | compare (array-like): another score for comparing with score
192 |
193 | Returns:
194 | Axes
195 | """
196 | auc, fpr, tpr, thresholds = AUC(score, target, return_curve = True)
197 |
198 | if ax is None:
199 | fig, ax = plt.subplots(1, 1, figsize = figsize)
200 |
201 | ax.plot(fpr, tpr, label = 'ROC curve (area = %0.5f)' % auc)
202 | ax.fill_between(fpr, tpr, alpha = 0.3)
203 | if compare is not None:
204 |         c_auc, c_fpr, c_tpr, _ = AUC(compare, target, return_curve = True)
205 |         ax.plot(c_fpr, c_tpr, label = 'ROC compare (area = %0.5f)' % c_auc)
206 | ax.fill_between(c_fpr, c_tpr, alpha = 0.3)
207 |
208 | ax.plot([0, 1], [0, 1], color = 'red', linestyle = '--')
209 | plt.legend(loc = "lower right")
210 |
211 | return ax
212 |
213 | def ks_plot(score, target, figsize = (14, 10), ax = None):
214 | """plot for ks
215 |
216 | Args:
217 | score (array-like): predicted score
218 | target (array-like): true target
219 |         figsize (tuple): size of the figure (width, height)
220 |
221 | Returns:
222 | Axes
223 | """
224 | fpr, tpr, thresholds = roc_curve(target, score)
225 |
226 | if ax is None:
227 | fig, ax = plt.subplots(1, 1, figsize = figsize)
228 |
229 | ax.plot(thresholds[1 : ], tpr[1 : ], label = 'tpr')
230 | ax.plot(thresholds[1 : ], fpr[1 : ], label = 'fpr')
231 | ax.plot(thresholds[1 : ], (tpr - fpr)[1 : ], label = 'ks')
232 |
233 | ax.invert_xaxis()
234 | ax.legend()
235 |
236 | ks_value = max(tpr - fpr)
237 | x = np.argwhere(abs(fpr - tpr) == ks_value)[0, 0]
238 | thred_value = thresholds[x]
239 | ax.axvline(thred_value, color = 'r', linestyle = '--')
240 | plt.title(f'ks:{ks_value:.5f} threshold:{thred_value:.5f}')
241 |
242 | return ax
243 |
244 | def bin_plot(frame, x = None, target = 'target', iv = True, annotate_format = ".2f",
245 | return_frame = False, figsize = (12, 6), ax = None):
246 | """plot for bins
247 |
248 | Args:
249 | frame (DataFrame)
250 | x (str): column in frame that will be used as x axis
251 | target (str): target column in frame
252 | iv (bool): if need to show iv in plot
253 | annotate_format (str): format str for axis annotation of chart
254 | return_frame (bool): if need return bin frame
255 | figsize (tuple): size of the figure (width, height)
256 |
257 | Returns:
258 |         Axes, and a DataFrame (good, bad, badrate, prop, y_prop, n_prop, woe, iv) when `return_frame` is True
259 | """
260 | frame = frame.copy()
261 |
262 | if not isinstance(target, str):
263 | temp_name = generate_str()
264 | frame[temp_name] = target
265 | target = temp_name
266 |
267 | table = feature_bin_stats(frame, x, target)
268 |
269 | if ax is None:
270 | fig, ax = plt.subplots(figsize=figsize)
271 |
272 | ax = tadpole.barplot(
273 | x = x,
274 | y = 'prop',
275 | data = table,
276 | color = '#82C6E2',
277 | ax = ax,
278 | )
279 |
280 | ax = add_annotate(ax, format = annotate_format)
281 |
282 | badrate_ax = ax.twinx()
283 | badrate_ax.grid(False)
284 |
285 | badrate_ax = tadpole.lineplot(
286 | x = x,
287 | y = 'badrate',
288 | data = table,
289 | color = '#D65F5F',
290 | ax = badrate_ax,
291 | )
292 |
293 | badrate_ax.set_ylim([0, None])
294 | badrate_ax = add_annotate(badrate_ax, format = annotate_format)
295 |
296 | if iv:
297 | ax = reset_ylim(ax)
298 | ax = add_text(ax, 'IV: {:.5f}'.format(table['iv'].sum()))
299 |
300 | res = (ax,)
301 |
302 | if return_frame:
303 | res += (table,)
304 |
305 | return unpack_tuple(res)
306 |
--------------------------------------------------------------------------------
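
`bin_plot` draws one bar per unique value of `x`, so it is usually fed pre-binned data. A sketch using toad's `Combiner` (the column names are invented, and `labels = True` is assumed to produce readable bin labels as in the toad tutorial):

    import numpy as np
    import pandas as pd
    from toad.transform import Combiner
    from toad.plot import bin_plot

    df = pd.DataFrame({
        'score': np.random.rand(500),
        'target': np.random.randint(2, size = 500),
    })

    c = Combiner().fit(df, 'target', method = 'chi', n_bins = 5)
    binned = c.transform(df, labels = True)

    bin_plot(binned, x = 'score', target = 'target')
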
/toad/plot_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .plot import (
6 | badrate_plot,
7 | corr_plot,
8 | proportion_plot,
9 | roc_plot,
10 | bin_plot,
11 | )
12 |
13 | np.random.seed(1)
14 |
15 | LENGTH = 500
16 |
17 | A = np.random.rand(LENGTH)
18 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan
19 |
20 | B = np.random.randint(100, size = LENGTH)
21 | C = A + np.random.normal(0, 0.2, LENGTH)
22 | D = A + np.random.normal(0, 0.1, LENGTH)
23 |
24 | E = np.random.rand(LENGTH)
25 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan
26 |
27 | F = B + np.random.normal(0, 10, LENGTH)
28 |
29 | target = np.random.randint(2, size = LENGTH)
30 |
31 | frame = pd.DataFrame({
32 | 'A': A,
33 | 'B': B,
34 | 'C': C,
35 | 'D': D,
36 | 'E': E,
37 | 'F': F,
38 | })
39 |
40 | frame['target'] = target
41 |
42 |
43 | def test_badrate_plot():
44 | g = badrate_plot(
45 | frame,
46 | x = 'A',
47 | target = 'target',
48 | return_counts = True,
49 | return_proportion = True,
50 | )
51 |
52 | def test_badrate_plot_y_axis():
53 | g = badrate_plot(
54 | frame,
55 | x = 'A',
56 | target = 'target',
57 | )
58 | bottom, _ = g.get_ylim()
59 | assert bottom == 0
60 |
61 | def test_corr_plot():
62 | g = corr_plot(frame)
63 |
64 |
65 | def test_proportion_plot():
66 | g = proportion_plot(x = frame['target'])
67 |
68 |
69 | def test_roc_plot():
70 | g = roc_plot(frame['B'], frame['target'])
71 |
72 |
73 | def test_bin_plot():
74 | g = bin_plot(frame, x = 'B', target = 'target')
75 |
76 |
77 | def test_bin_plot_return_frame():
78 | g, df = bin_plot(frame, x = 'B', target = 'target', return_frame = True)
79 | assert df.shape == (100, 10)
80 |
--------------------------------------------------------------------------------
/toad/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .process import Processing, Mask, F
2 | from .partition import Partition, TimePartition
--------------------------------------------------------------------------------
/toad/preprocessing/partition.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | class Partition:
6 | def partition(self, data):
7 | """partition data
8 |
9 | Args:
10 | data (DataFrame): dataframe
11 |
12 |         Yields:
13 |             ndarray[bool]: mask of the current partition
14 |             str: suffix string of the current partition
15 | """
16 | yield np.ones(len(data)).astype(bool), ''
17 |
18 |
19 |
20 | class TimePartition(Partition):
21 | """partition data by time delta
22 |
23 | Args:
24 | base (str): column name of base time
25 | filter (str): column name of target time to be compared
26 |         times (list): list of time deltas, e.g. '30d'; 'all' selects every row
27 |
28 | Example:
29 |
30 | >>> TimePartition('apply_time', 'query_time', ['30d', '90d', 'all'])
31 |
32 | """
33 | def __init__(self, base, filter, times):
34 | self.base = base
35 | self.filter = filter
36 | self.times = times
37 |
38 |
39 | def partition(self, data):
40 | base = pd.to_datetime(data[self.base])
41 | filter = pd.to_datetime(data[self.filter])
42 |
43 | for t in self.times:
44 | if t != 'all':
45 | delta = pd.Timedelta(t)
46 | mask = filter > (base - delta)
47 | else:
48 | mask = np.ones(len(filter)).astype(bool)
49 |
50 | yield mask, '_' + t
51 |
52 |
53 | class ValuePartition(Partition):
54 | """partition data by column values
55 |
56 | Args:
57 | column (str): column name which will be used as partition
58 |
59 | Example:
60 |
61 | >>> ValuePartition('status')
62 |
63 | """
64 | def __init__(self, column):
65 | self.column = column
66 |
67 |
68 | def partition(self, data):
69 | data = data[self.column]
70 | unique = data.unique()
71 |
72 | for u in unique:
73 | if pd.isna(u):
74 | mask = data.isna()
75 | else:
76 | mask = (data == u)
77 |
78 | yield mask, '_' + str(u)
--------------------------------------------------------------------------------
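
Any object with the same generator contract as `Partition.partition` plugs into `Processing.partitionby`. A sketch of a custom partition (entirely illustrative, not part of toad):

    import numpy as np
    from toad.preprocessing.partition import Partition

    class ThresholdPartition(Partition):
        """split rows by whether a column exceeds a threshold"""
        def __init__(self, column, threshold):
            self.column = column
            self.threshold = threshold

        def partition(self, data):
            # yield (mask, suffix) pairs, mirroring TimePartition/ValuePartition
            mask = (data[self.column] > self.threshold).values
            yield mask, '_high'
            yield ~mask, '_low'
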
/toad/preprocessing/partition_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | from .partition import TimePartition, ValuePartition
7 |
8 |
9 | np.random.seed(1)
10 |
11 | ab = np.array(list('ABCDEFG'))
12 |
13 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500)
14 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500)
15 | A = ab[np.random.choice(7, 500)]
16 | B = np.random.randint(10, size = 500).astype(float)
17 | B[np.random.choice(500, 10)] = np.nan
18 |
19 |
20 | df = pd.DataFrame({
21 | 'history': history,
22 | 'open_time': open_time,
23 | 'A': A,
24 | 'B': B,
25 | })
26 |
27 |
28 | def test_timepartition():
29 | tp = TimePartition('open_time', 'history', ['90d', '180d'])
30 | mask, suffix = next(tp.partition(df))
31 | assert mask.sum() == 93
32 |
33 |
34 | def test_timepartition_all():
35 | tp = TimePartition('open_time', 'history', ['all'])
36 | mask, suffix = next(tp.partition(df))
37 | assert mask.sum() == 500
38 |
39 | def test_valuepartition():
40 | vp = ValuePartition('A')
41 | mask, suffix = next(vp.partition(df))
42 | assert mask.sum() == 67
43 |
44 | def test_valuepartition_with_na():
45 | vp = ValuePartition('B')
46 | s = 0
47 | for mask, suffix in vp.partition(df):
48 | s += mask.sum()
49 |
50 | assert s == 500
--------------------------------------------------------------------------------
/toad/preprocessing/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | _ALL_SYMBOL_ = '__all_symbol__'
5 |
6 | class Processing:
7 | """
8 |
9 | Examples:
10 |
11 | >>> (Processing(data)
12 | ... .groupby('id')
13 | ... .partitionby(TimePartition(
14 | ... 'base_time',
15 | ... 'filter_time',
16 | ... ['30d', '60d', '180d', '365d', 'all']
17 | ... ))
18 | ... .apply({'A': ['max', 'min', 'mean']})
19 | ... .apply({'B': ['max', 'min', 'mean']})
20 | ... .apply({'C': 'nunique'})
21 | ... .apply({'D': {
22 | ... 'f': len,
23 | ... 'name': 'normal_count',
24 | ... 'mask': Mask('D').isin(['normal']),
25 | ... }})
26 | ... .apply({'id': 'count'})
27 | ... .exec()
28 | ... )
29 | """
30 | def __init__(self, data):
31 | self.data = data
32 | self.funcs = {}
33 | self.partitions = None
34 |
35 | def groupby(self, name):
36 | """group data by name
37 |
38 | Args:
39 | name (str): column name in data
40 | """
41 | self._groupby = name
42 | return self
43 |
44 | def apply(self, f):
45 | """apply functions to data
46 |
47 | Args:
48 |             f (dict|function): a config dict whose keys are column names and
49 |                 whose values are functions; each function receives the column
50 |                 series as its argument. If `f` is a function, it receives the
51 |                 whole dataframe as its argument.
52 |
53 | """
54 | if not isinstance(f, dict):
55 | f = {
56 | _ALL_SYMBOL_: f
57 | }
58 |
59 | for k, v in f.items():
60 | self.append_func(k, v)
61 |
62 | return self
63 |
64 |
65 | def append_func(self, col, func):
66 | if not isinstance(func, (list, tuple)):
67 | func = [func]
68 |
69 | if col not in self.funcs:
70 | self.funcs[col] = []
71 |
72 | for f in func:
73 | self.funcs[col].append(self._convert_func(f))
74 |
75 |
76 | def _convert_func(self, f):
77 | if isinstance(f, F):
78 | return f
79 |
80 | if not isinstance(f, dict):
81 | f = {'f': f}
82 |
83 | return F(**f)
84 |
85 |
86 | def partitionby(self, p):
87 | """partition data to multiple pieces, processing will process to all the pieces
88 |
89 | Args:
90 | p (Partition)
91 | """
92 | self.partitions = p
93 | return self
94 |
95 | def exec(self):
96 | if self.partitions is None:
97 | return self.process(self.data)
98 |
99 | res = None
100 | for mask, suffix in self.partitions.partition(self.data):
101 | data = self.process(self.data[mask])
102 | data = data.add_suffix(suffix)
103 |
104 | if res is None:
105 | res = data
106 | continue
107 |
108 | res = res.join(data, how = 'outer')
109 |
110 | return res
111 |
112 |
113 |
114 | def process(self, data):
115 | group = data.groupby(self._groupby)
116 |
117 | res = []
118 | for col, l in self.funcs.items():
119 | for f in l:
120 | g = group
121 |
122 | if f.need_filter:
123 | g = f.filter(data).groupby(self._groupby)
124 |
125 | if f.is_buildin:
126 | r = getattr(g[col], f.name)()
127 | r.name = f.name
128 | else:
129 | if col == _ALL_SYMBOL_:
130 | col = None
131 |
132 | r = g.apply(f, col = col)
133 |
134 | if isinstance(r, pd.Series):
135 | r = pd.DataFrame(r)
136 |
137 |                 res.append(r.add_prefix(col + '_') if col is not None else r)  # whole-frame functions get no column prefix
138 |
139 | return pd.concat(res, axis=1)
140 |
141 |
142 |
143 | class Mask:
144 | """a placeholder to select dataframe
145 | """
146 | def __init__(self, column = None):
147 | self.column = column
148 | self.operators = []
149 |
150 | def push(self, op, value):
151 | self.operators.append({
152 | 'op': op,
153 | 'value': value,
154 | })
155 |
156 | def replay(self, data):
157 | base = data
158 | if self.column is not None:
159 | base = data[self.column]
160 |
161 | for item in self.operators:
162 | v = item['value']
163 |
164 | if isinstance(v, Mask):
165 | v = v.replay(data)
166 |
167 | f = getattr(base, item['op'])
168 |
169 | if v is None:
170 | base = f()
171 | continue
172 |
173 | base = f(v)
174 |
175 | return base
176 |
177 | def __eq__(self, other):
178 | self.push('__eq__', other)
179 | return self
180 |
181 | def __lt__(self, other):
182 | self.push('__lt__', other)
183 | return self
184 |
185 | def __gt__(self, other):
186 | self.push('__gt__', other)
187 | return self
188 |
189 | def __le__(self, other):
190 | self.push('__le__', other)
191 | return self
192 |
193 | def __ge__(self, other):
194 | self.push('__ge__', other)
195 | return self
196 |
197 | def __invert__(self):
198 | self.push('__invert__', None)
199 | return self
200 |
201 | def __and__(self, other):
202 | self.push('__and__', other)
203 | return self
204 |
205 | def __or__(self, other):
206 | self.push('__or__', other)
207 | return self
208 |
209 | def __xor__(self, other):
210 | self.push('__xor__', other)
211 | return self
212 |
213 | def isin(self, other):
214 | self.push('isin', other)
215 | return self
216 |
217 | def isna(self):
218 | self.push('isna', None)
219 | return self
220 |
221 |
222 |
223 | class F:
224 | """function class for processing
225 | """
226 | def __init__(self, f, name = None, mask = None):
227 | self.f = f
228 |
229 | if name is None:
230 | if self.is_buildin:
231 | name = f
232 | else:
233 | name = f.__name__
234 |
235 | self.__name__ = name
236 |
237 | self.mask = mask
238 |
239 | @property
240 | def name(self):
241 | return self.__name__
242 |
243 | @property
244 | def is_buildin(self):
245 | return isinstance(self.f, str)
246 |
247 | @property
248 | def need_filter(self):
249 | return self.mask is not None
250 |
251 | def __call__(self, data, *args, col = None, **kwargs):
252 | if col in data:
253 | data = data[col]
254 |
255 | r = self.f(data, *args, **kwargs)
256 |
257 | if not isinstance(r, dict):
258 | r = {
259 | self.name: r
260 | }
261 |
262 | return pd.Series(r)
263 |
264 |
265 | def filter(self, data):
266 | if self.mask is None:
267 | return data
268 |
269 | mask = self.mask
270 | if isinstance(self.mask, Mask):
271 | mask = self.mask.replay(data)
272 |
273 | return data[mask]
274 |
275 |
--------------------------------------------------------------------------------
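
Since every operator just records itself onto the left-hand mask, chained masks replay their operations in order, recursing into nested masks. A sketch of combining two conditions (the frame is made up):

    import pandas as pd
    from toad.preprocessing.process import Mask

    df = pd.DataFrame({'A': [0, 2, 5, 7], 'B': ['x', 'y', 'x', 'y']})

    # records `> 1` on the 'A' mask, then `&` with a mask replaying B == 'x'
    m = (Mask('A') > 1) & (Mask('B') == 'x')

    assert m.replay(df).tolist() == [False, False, True, False]
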
/toad/preprocessing/process_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | from .process import Processing, Mask, F
7 |
8 |
9 | np.random.seed(1)
10 |
11 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500)
12 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500)
13 | A = np.random.randint(10, size = 500)
14 | B = np.random.rand(500)
15 | B[np.random.choice(500, 10)] = np.nan
16 |
17 |
18 | df = pd.DataFrame({
19 | 'history': history,
20 | 'open_time': open_time,
21 | 'A': A,
22 | 'B': B,
23 | })
24 |
25 |
26 | def test_mask():
27 | m = Mask('A') > 3
28 | assert m.replay(df).sum() == (A > 3).sum()
29 |
30 |
31 | def test_mask_without_name():
32 | m = Mask() > 3
33 | assert m.replay(A).sum() == (A > 3).sum()
34 |
35 | def test_mask_isin():
36 | m = Mask('A').isin([1,2,3])
37 | assert m.replay(df).sum() == df['A'].isin([1,2,3]).sum()
38 |
39 | def test_mask_isna():
40 | m = Mask('A').isna()
41 | assert m.replay(df).sum() == df['A'].isna().sum()
42 |
43 | def test_f():
44 | assert F(len)(A)[0] == 500
45 |
46 | def test_processing():
47 | res = (
48 | Processing(df)
49 | .groupby('open_time')
50 | .apply({'A': ['min', 'mean']})
51 | .apply({'B': [
52 | {
53 | 'f': 'size',
54 | 'mask': Mask('A') > 1,
55 | },
56 | {
57 | 'f': len,
58 | },
59 | ]})
60 | .exec()
61 | )
62 |
63 | assert res.size == 120 and res.loc['2020-02-29', 'B_size'] == 23
64 |
65 |
66 | def test_processing_with_partition():
67 | from .partition import ValuePartition
68 | res = (
69 | Processing(df)
70 | .groupby('open_time')
71 | .partitionby(ValuePartition('A'))
72 | .apply({'B': ['mean', 'size']})
73 | .exec()
74 | )
75 |
76 | assert res.size == 600 and res.loc['2020-02-29', 'B_size_1'] == 2
--------------------------------------------------------------------------------
/toad/scorecard_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.linear_model import LogisticRegression
5 |
6 | from .scorecard import ScoreCard, WOETransformer, Combiner
7 |
8 | np.random.seed(1)
9 |
10 | # Create a testing dataframe and a scorecard model.
11 |
12 | ab = np.array(list('ABCDEFG'))
13 | feature = np.random.randint(10, size = 500)
14 | target = np.random.randint(2, size = 500)
15 | str_feat = ab[np.random.choice(7, 500)]
16 |
17 | df = pd.DataFrame({
18 | 'A': feature,
19 | 'B': str_feat,
20 | 'C': ab[np.random.choice(2, 500)],
21 | 'D': np.ones(500),
22 | })
23 |
24 | card_config = {
25 | 'A': {
26 | '[-inf ~ 3)': 100,
27 | '[3 ~ 5)': 200,
28 | '[5 ~ 8)': 300,
29 | '[8 ~ inf)': 400,
30 | 'nan': 500,
31 | },
32 | 'B': {
33 | ','.join(list('ABCD')): 200,
34 | ','.join(list('EF')): 400,
35 | 'else': 500,
36 | },
37 | 'C': {
38 | 'A': 200,
39 | 'B': 100,
40 | },
41 | }
42 |
43 | combiner = Combiner()
44 | bins = combiner.fit_transform(df, target, n_bins = 5)
45 | woe_transer = WOETransformer()
46 | woe = woe_transer.fit_transform(bins, target)
47 |
48 | # create a score card
49 | card = ScoreCard(
50 | combiner = combiner,
51 | transer = woe_transer,
52 | )
53 | card.fit(woe, target)
54 |
55 |
56 | FUZZ_THRESHOLD = 1e-6
57 | TEST_SCORE = pytest.approx(453.5702462572068, FUZZ_THRESHOLD)
58 | TEST_PROBA = pytest.approx(0.4673322872985267, FUZZ_THRESHOLD)
59 |
60 |
61 | def test_representation():
62 | repr(card)
63 |
64 |
65 | def test_load():
66 | card = ScoreCard().load(card_config)
67 | score = card.predict(df)
68 | assert score[200] == 600
69 |
70 |
71 | def test_load_after_init_combiner():
72 | card = ScoreCard(
73 | combiner = combiner,
74 | transer = woe_transer,
75 | )
76 | card.load(card_config)
77 | score = card.predict(df)
78 | assert score[200] == 600
79 |
80 |
81 | def test_proba_to_score():
82 | model = LogisticRegression()
83 | model.fit(woe, target)
84 |
85 | proba = model.predict_proba(woe)[:, 1]
86 | score = card.proba_to_score(proba)
87 | assert score[404] == TEST_SCORE
88 |
89 |
90 | def test_score_to_prob():
91 | score = card.predict(df)
92 | proba = card.score_to_proba(score)
93 | assert proba[404] == TEST_PROBA
94 |
95 |
96 | def test_predict():
97 | score = card.predict(df)
98 | assert score[404] == TEST_SCORE
99 |
100 |
101 | def test_predict_proba():
102 | proba = card.predict_proba(df)
103 | assert proba[404, 1] == TEST_PROBA
104 |
105 |
106 | def test_card_feature_effect():
107 | """
108 | verify the `base effect of each feature` is consistent with assumption
109 | FEATURE_EFFECT is manually calculated with following logic:
110 | FEATURE_EFFECT = np.median(card.woe_to_score(df),axis = 0)
111 | """
112 | FEATURE_EFFECT = pytest.approx(np.array([142.26368948220417, 152.82747912111066, 148.82665746001695, 0.]), FUZZ_THRESHOLD)
113 | assert card.base_effect.values == FEATURE_EFFECT
114 |
115 |
116 | def test_predict_sub_score():
117 | score, sub = card.predict(df, return_sub=True)
118 | assert sub.loc[250, 'B'] == pytest.approx(162.09822360428146, FUZZ_THRESHOLD)
119 |
120 |
121 | def test_woe_to_score():
122 | score = card.woe_to_score(woe)
123 | score = np.sum(score, axis=1)
124 | assert score[404] == TEST_SCORE
125 |
126 |
127 | def test_bin_to_score():
128 | score = card.bin_to_score(bins)
129 | assert score[404] == TEST_SCORE
130 |
131 |
132 | def test_export_map():
133 | card_map = card.export()
134 | assert card_map['B']['D'] == 159.26
135 |
136 |
137 | def test_card_map():
138 | config = card.export()
139 | card_from_map = ScoreCard().load(config)
140 | score = card_from_map.predict(df)
141 | assert score[404] == 453.57
142 |
143 |
144 | def test_card_map_with_else():
145 | card_from_map = ScoreCard().load(card_config)
146 | score = card_from_map.predict(df)
147 | assert score[80] == 1000
148 |
149 |
150 | def test_generate_testing_frame():
151 | card = ScoreCard().load(card_config)
152 | frame = card.testing_frame()
153 | assert frame.loc[4, 'B'] == 'E'
154 |
155 |
156 | def test_export_frame():
157 | card = ScoreCard().load(card_config)
158 | frame = card.export(to_frame=True)
159 | rows = frame[(frame['name'] == 'B') & (frame['value'] == 'else')].reset_index()
160 | assert rows.loc[0, 'score'] == 500
161 |
162 |
163 | def test_card_combiner_number_not_match():
164 | c = combiner.export()
165 | c['A'] = [0, 3, 6, 8]
166 | com = Combiner().load(c)
167 | bins = com.transform(df)
168 | woe_transer = WOETransformer()
169 | woe = woe_transer.fit_transform(bins, target)
170 |
171 | card = ScoreCard(
172 | combiner=com,
173 | transer=woe_transer,
174 | )
175 |
176 | with pytest.raises(Exception) as e:
177 | # will raise an exception when fitting a card
178 | card.fit(woe, target)
179 |
180 | assert '\'A\' is not matched' in str(e.value)
181 |
182 |
183 | def test_card_combiner_str_not_match():
184 | c = combiner.export()
185 | c['C'] = [['A'], ['B'], ['C']]
186 | com = Combiner().load(c)
187 | bins = com.transform(df)
188 | woe_transer = WOETransformer()
189 | woe = woe_transer.fit_transform(bins, target)
190 |
191 | card = ScoreCard(
192 | combiner=com,
193 | transer=woe_transer,
194 | )
195 |
196 | with pytest.raises(Exception) as e:
197 | # will raise an exception when fitting a card
198 | card.fit(woe, target)
199 |
200 | assert '\'C\' is not matched' in str(e.value)
201 |
202 |
203 | def test_card_with_less_X():
204 | x = woe.drop(columns='A')
205 | card = ScoreCard(
206 | combiner=combiner,
207 | transer=woe_transer,
208 | )
209 |
210 | card.fit(x, target)
211 | assert card.predict(df)[200] == pytest.approx(457.5903160102142, FUZZ_THRESHOLD)
212 |
213 |
214 | def test_card_predict_with_unknown_feature():
215 | np.random.seed(9)
216 | unknown_df = df.copy()
217 | unknown_df.loc[200, 'C'] = 'U'
218 | assert card.predict(unknown_df)[200] == pytest.approx(456.41288777297257, FUZZ_THRESHOLD)
219 |
220 |
221 | def test_card_predict_with_unknown_feature_default_max():
222 | np.random.seed(9)
223 | unknown_df = df.copy()
224 | unknown_df.loc[200, 'C'] = 'U'
225 | score, sub = card.predict(unknown_df, default = 'max', return_sub = True)
226 |
227 | assert sub.loc[200, 'C'] == card['C']['scores'].max()
228 | assert score[200] == pytest.approx(462.2871531373114, FUZZ_THRESHOLD)
229 |
230 |
231 | def test_card_predict_with_unknown_feature_default_with_value():
232 | np.random.seed(9)
233 | unknown_df = df.copy()
234 | unknown_df.loc[200, 'C'] = 'U'
235 | score, sub = card.predict(unknown_df, default = 42, return_sub = True)
236 |
237 | assert sub.loc[200, 'C'] == 42
238 | assert score[200] == pytest.approx(355.46049567729443, FUZZ_THRESHOLD)
239 |
240 |
241 | def test_get_reason_vector():
242 | """
243 | verify the score reason of df is consistent with assumption
244 | DF_REASON is manually calculated with following logic:
245 | if score is lower than base_odds, select top k feature with lowest subscores where their corresponding subscores are lower than the base effect of features.
246 | if score is higher than base_odds, select top k feature with highest subscores where their corresponding subscores are higher than the base effect of features.
247 |
248 | e.g. xx.iloc[404]
249 | sub_scores: 151 159 143 0
250 | base_effect: 142 153 149 0
251 | diff_effect: +9 +6 -6 0
252 |
253 | total_score: 453(151+159+143+0) > base_odds(35)
254 | which is larger than base, hence, we try to find top `keep` features who contributed most to positivity
255 | find_largest_top_3: A(+9) B(+6) D(+0)
256 | """
257 | reason = card.get_reason(df)
258 | assert reason.iloc[404]['top1'].tolist() == ['C', pytest.approx(142.9523920956781, FUZZ_THRESHOLD), 'B']
259 |
260 |
261 | @pytest.mark.timeout(0.007)
262 | def test_predict_dict():
263 | """ a test for scalar inference time cost """
264 | proba = card.predict(df.iloc[404].to_dict())
265 | assert proba == TEST_SCORE
266 |
267 |
--------------------------------------------------------------------------------
/toad/selection_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .selection import drop_empty, drop_var, drop_corr, drop_iv, drop_vif, select, stepwise
6 |
7 | np.random.seed(1)
8 |
9 | LENGTH = 500
10 |
11 | A = np.random.rand(LENGTH)
12 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan
13 |
14 | B = np.random.randint(100, size = LENGTH)
15 | C = A + np.random.normal(0, 0.2, LENGTH)
16 | D = A + np.random.normal(0, 0.1, LENGTH)
17 |
18 | E = np.random.rand(LENGTH)
19 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan
20 |
21 | F = B + np.random.normal(0, 10, LENGTH)
22 |
23 | target = np.random.randint(2, size = LENGTH)
24 |
25 | frame = pd.DataFrame({
26 | 'A': A,
27 | 'B': B,
28 | 'C': C,
29 | 'D': D,
30 | 'E': E,
31 | 'F': F,
32 | })
33 |
34 | frame['target'] = target
35 |
36 |
37 | def test_drop_empty():
38 | df = drop_empty(frame, threshold = 0.8)
39 | assert 'E' not in df
40 |
41 | def test_drop_var():
42 | df = drop_var(frame, threshold = 0.1)
43 | assert 'A' not in df
44 |
45 | def test_drop_var_exclude():
46 | df = drop_var(frame, threshold = 0.1, exclude = 'A')
47 | assert 'A' in df
48 |
49 | def test_drop_corr():
50 | df = drop_corr(frame, target = 'target')
51 | assert set(['D', 'E', 'F', 'target']) == set(df.columns.tolist())
52 |
53 | def test_drop_corr_with_string():
54 | ab = np.array(list('ABCDEFG'))
55 | str_feat = pd.Series(ab[np.random.choice(7, 500)])
56 |
57 | df = drop_corr(pd.concat((frame, str_feat.rename('str_f')), axis = 1), target = 'target')
58 | assert set(['D', 'E', 'F', 'target', 'str_f']) == set(df.columns.tolist())
59 |
60 | def test_drop_iv():
61 | df = drop_iv(frame, target = 'target', threshold = 0.25)
62 | assert 'B' not in df
63 |
64 | def test_select():
65 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7)
66 | assert ['D', 'F', 'target'] == df.columns.tolist()
67 |
68 | def test_select_exclude():
69 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7, exclude = ['A'])
70 | assert ['A', 'D', 'F', 'target'] == df.columns.tolist()
71 |
72 | def test_stepwise():
73 | df = stepwise(frame.fillna(-1), target = 'target')
74 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist()
75 |
76 | def test_stepwise_backward():
77 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'backward')
78 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist()
79 |
80 | def test_stepwise_forward():
81 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'forward')
82 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist()
83 |
84 | def test_stepwise_exclude():
85 | df = stepwise(frame.fillna(-1), target = 'target', exclude = 'B')
86 | assert ['B', 'C', 'E', 'F', 'target'] == df.columns.tolist()
87 |
88 | def test_stepwise_return_drop():
89 | df, drop_list = stepwise(frame.fillna(-1), target = 'target', return_drop = True)
90 | assert ['B', 'A', 'D'] == drop_list
91 |
92 | def test_stepwise_lr():
93 | df = stepwise(frame.fillna(-1), target = 'target', estimator = 'lr', direction = 'forward')
94 | assert ['C', 'target'] == df.columns.tolist()
95 |
96 | def test_stepwise_ks():
97 | df = stepwise(frame.fillna(-1), target = 'target', criterion = 'ks', direction = 'forward')
98 | assert ['A', 'C', 'target'] == df.columns.tolist()
99 |
100 | def test_stepwise_zero():
101 | df = pd.DataFrame({
102 | 'X': np.zeros(500),
103 | 'Z': np.random.rand(500),
104 | 'Y': np.random.randint(2, size = 500),
105 | })
106 | df = stepwise(df, target = 'Y')
107 | assert set(['Z', 'Y']) == set(df.columns.tolist())
108 |
109 | def test_stepwise_forward_when_best_is_first():
110 | df = frame[['E', 'F', 'B', 'A', 'D', 'C', 'target']]
111 | df = stepwise(df.fillna(-1), target = 'target', direction = 'forward')
112 | assert ['E', 'F', 'C', 'target'] == df.columns.tolist()
113 |
114 | def test_drop_vif():
115 | df = drop_vif(frame.fillna(-1), exclude = 'target')
116 | assert ['C', 'F', 'target'] == df.columns.tolist()
117 |
--------------------------------------------------------------------------------
/toad/stats_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .stats import IV, WOE, gini, gini_cond, entropy_cond, quality, _IV, VIF
6 |
7 |
8 | np.random.seed(1)
9 |
10 | feature = np.random.rand(500)
11 | target = np.random.randint(2, size = 500)
12 | A = np.random.randint(100, size = 500)
13 | B = np.random.randint(100, size = 500)
14 | mask = np.random.randint(8, size = 500)
15 |
16 | df = pd.DataFrame({
17 | 'feature': feature,
18 | 'target': target,
19 | 'A': A,
20 | 'B': B,
21 | })
22 |
23 |
24 | def test_woe():
25 | value = WOE(0.2, 0.3)
26 | assert value == pytest.approx(-0.4054651081081643)
27 |
28 | def test_iv_priv():
29 | value, _ = _IV(df['feature'], df['target'])
30 | assert value == pytest.approx(0.010385942643745403)
31 |
32 | def test_iv():
33 | value = IV(df['feature'], df['target'], n_bins = 10, method = 'dt')
34 | assert value == pytest.approx(0.2735917707743619)
35 |
36 | def test_iv_return_sub():
37 | _, sub = IV(mask, df['target'], return_sub = True, n_bins = 10, method = 'dt')
38 | assert len(sub) == 8
39 | assert sub[4] == pytest.approx(0.006449386778057019)
40 |
41 | def test_iv_frame():
42 | res = IV(df, 'target', n_bins = 10, method = 'chi')
43 | assert res.loc[0, 'A'] == pytest.approx(0.226363832867123)
44 |
45 | def test_gini():
46 | value = gini(df['target'])
47 | assert value == 0.499352
48 |
49 | def test_gini_cond():
50 | value = gini_cond(df['feature'], df['target'])
51 | assert value == pytest.approx(0.4970162601626016)
52 |
53 | def test_entropy_cond():
54 | value = entropy_cond(df['feature'], df['target'])
55 | assert value == pytest.approx(0.6924990371522171)
56 |
57 | def test_quality():
58 | result = quality(df, 'target')
59 | assert result.loc['feature', 'iv'] == 0.2735917707743619
60 | assert result.loc['A', 'gini'] == 0.49284164671885444
61 | assert result.loc['B', 'entropy'] == pytest.approx(0.6924956879070063, 5e-5)
62 | assert result.loc['feature', 'unique'] == 500
63 |
64 | def test_quality_iv_only():
65 | result = quality(df, 'target', iv_only = True)
66 | assert np.isnan(result.loc['feature', 'gini'])
67 |
68 | def test_quality_with_merge():
69 | result = quality(df, 'target', n_bins = 5, method = 'chi')
70 | assert result.loc['feature', 'iv'] == 0.13367825777558
71 |
72 | def test_quality_object_type_array_with_nan():
73 | feature = np.array([np.nan, 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype = 'O')[mask]
74 |
75 | df = pd.DataFrame({
76 | 'feature': feature,
77 | 'target': target,
78 | })
79 | result = quality(df)
80 | assert result.loc['feature', 'iv'] == 0.016379338180530334
81 |
82 | def test_vif():
83 | vif = VIF(df)
84 | assert vif['A'] == 2.969336442640111
85 |
--------------------------------------------------------------------------------
/toad/tadpole/__init__.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 |
3 | sns.set_palette('muted')
4 |
5 | from .base import Tadpole
6 | from .utils import tadpole_axes
7 |
8 |
9 | tadpole = Tadpole()
--------------------------------------------------------------------------------
/toad/tadpole/base.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | from .utils import (
3 | get_axes,
4 | tadpole_axes,
5 | FIG_SIZE,
6 | )
7 |
8 | class Tadpole:
9 | def __getattr__(self, name):
10 | t = getattr(sns, name)
11 | if callable(t):
12 | return self.wrapsns(t)
13 |
14 | return t
15 |
16 | def wrapsns(self, f):
17 | @tadpole_axes
18 | def wrapper(*args, figure_size = FIG_SIZE, **kwargs):
19 | kw = kwargs.copy()
20 | if 'ax' not in kw:
21 | kw['ax'] = get_axes(size = figure_size)
22 |
23 | try:
24 | return f(*args, **kw)
25 |             except Exception:  # some seaborn functions don't accept `ax`, retry without it
26 | return f(*args, **kwargs)
27 |
28 | return wrapper
29 |
--------------------------------------------------------------------------------
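
`Tadpole` is a thin proxy over seaborn: attribute lookups fall through to `sns`, and callables come back wrapped so they get a default-sized figure and the font/legend fixes applied by `tadpole_axes`. A sketch (the data is made up):

    import pandas as pd
    from toad.tadpole import tadpole

    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 4, 9]})

    # proxies seaborn.lineplot, then returns axes with fonts/labels fixed
    ax = tadpole.lineplot(x = 'x', y = 'y', data = df)
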
/toad/tadpole/fonts/NotoSansCJKsc-Regular.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/tadpole/fonts/NotoSansCJKsc-Regular.otf
--------------------------------------------------------------------------------
/toad/tadpole/func.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/toad/tadpole/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import seaborn as sns
3 | from functools import wraps
4 | import matplotlib.pyplot as plt
5 | from matplotlib.axes import Axes
6 | from matplotlib.font_manager import FontProperties
7 |
8 | sns.set_palette('muted')
9 |
10 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__))
11 | FONT_FILE = 'NotoSansCJKsc-Regular.otf'
12 | FONTS_PATH = os.path.join(CURRENT_PATH, 'fonts', FONT_FILE)
13 | myfont = FontProperties(fname = os.path.abspath(FONTS_PATH))
14 | sns.set(font = myfont.get_family())
15 |
16 | HEATMAP_CMAP = sns.diverging_palette(240, 10, as_cmap = True)
17 | MAX_STYLE = 6
18 | FIG_SIZE = (12, 6)
19 |
20 | def get_axes(size = FIG_SIZE):
21 | _, ax = plt.subplots(figsize = size)
22 | return ax
23 |
24 | def reset_legend(axes):
25 | if axes.get_legend() is not None:
26 | axes.legend(
27 | loc='center left',
28 | bbox_to_anchor=(1, 0.5),
29 | framealpha = 0,
30 | prop = myfont,
31 | )
32 |
33 | return axes
34 |
35 | def reset_ticklabels(axes):
36 | labels = []
37 | if axes.get_xticklabels():
38 | labels += axes.get_xticklabels()
39 |
40 | if axes.get_yticklabels():
41 | labels += axes.get_yticklabels()
42 |
43 | for label in labels:
44 | label.set_fontproperties(myfont)
45 |
46 | return axes
47 |
48 | def reset_xticks(axes):
49 | for label in axes.get_xticklabels():
50 | label.set_ha('left')
51 | label.set_rotation(-25)
52 |
53 | return axes
54 |
55 |
56 | def reset_title(axes):
57 | title = axes.get_title()
58 |
59 | if title:
60 | axes.set_title(title, fontproperties = myfont)
61 |
62 | return axes
63 |
64 |
65 | def reset_xylabels(axes):
66 | y_label = axes.get_ylabel()
67 | if y_label:
68 | axes.set_ylabel(y_label, fontproperties = myfont)
69 |
70 | x_label = axes.get_xlabel()
71 | if x_label:
72 | axes.set_xlabel(x_label, fontproperties = myfont)
73 |
74 | return axes
75 |
76 |
77 | def reset_ylim(axes):
78 | # for axes and twins
79 | for ax in axes.figure.axes:
80 | if ax.bbox.bounds == axes.bbox.bounds:
81 | bottom, top = ax.get_ylim()
82 | top += (top - bottom) * 0.1
83 | ax.set_ylim(bottom, top)
84 |
85 | return axes
86 |
87 |
88 | def fix_axes(axes):
89 | if not isinstance(axes, Axes):
90 | return axes
91 |
92 | functions = [reset_title, reset_xylabels, reset_ticklabels, reset_legend, reset_xticks]
93 |
94 | for func in functions:
95 | func(axes)
96 | return axes
97 |
98 | def tadpole_axes(fn):
99 | @wraps(fn)
100 | def func(*args, **kwargs):
101 | res = fn(*args, **kwargs)
102 |
103 | if not isinstance(res, tuple):
104 | return fix_axes(res)
105 |
106 | r = tuple()
107 | for i in res:
108 | r += (fix_axes(i),)
109 |
110 | return r
111 |
112 | return func
113 |
114 |
115 |
116 | def annotate(ax, x, y, space = 5, format = ".2f"):
117 | """
118 | """
119 | va = 'bottom'
120 |
121 | if y < 0:
122 | space *= -1
123 | va = 'top'
124 |
125 | ax.annotate(
126 | ("{:"+ format +"}").format(y),
127 | (x, y),
128 | xytext = (0, space),
129 | textcoords = "offset points",
130 | ha = 'center',
131 | va = va,
132 | )
133 |
134 |
135 |
136 | def add_bar_annotate(ax, **kwargs):
137 | """
138 | """
139 | for rect in ax.patches:
140 | y_value = rect.get_height()
141 | x_value = rect.get_x() + rect.get_width() / 2
142 |
143 | annotate(ax, x_value, y_value, **kwargs)
144 |
145 | return ax
146 |
147 |
148 | def add_line_annotate(ax, **kwargs):
149 | """
150 | """
151 | for line in ax.lines:
152 | points = line.get_xydata()
153 |
154 | for point in points:
155 | annotate(ax, point[0], point[1], **kwargs)
156 |
157 | return ax
158 |
159 |
160 | def add_annotate(ax, **kwargs):
161 | if len(ax.lines) > 0:
162 | add_line_annotate(ax, **kwargs)
163 |
164 | if len(ax.patches) > 0:
165 | add_bar_annotate(ax, **kwargs)
166 |
167 | return ax
168 |
169 |
170 | def add_text(ax, text, loc = 'top left', offset = (0.01, 0.04)):
171 | x_min, x_max = ax.get_xlim()
172 | y_min, y_max = ax.get_ylim()
173 |
174 | x_offset = (x_max - x_min) * offset[0]
175 | y_offset = (y_max - y_min) * offset[1]
176 |
177 | if loc == 'top left':
178 | loc = (x_min + x_offset, y_max - y_offset)
179 | elif loc == 'top right':
180 | loc = (x_max - x_offset, y_max - y_offset)
181 | elif loc == 'bottom left':
182 | loc = (x_min + x_offset, y_min + y_offset)
183 | elif loc == 'bottom right':
184 | loc = (x_max - x_offset, y_min + y_offset)
185 |
186 | ax.text(*loc, text, fontsize = 'x-large')
187 |
188 | return ax
189 |
--------------------------------------------------------------------------------
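A minimal usage sketch for the plotting helpers above (illustrative, not part of the repo): `add_annotate` labels every bar and line point on an Axes, and `add_text` drops a caption inside the plot area. It assumes matplotlib is installed; the sample data is invented.

```python
import matplotlib.pyplot as plt

_, ax = plt.subplots()
ax.bar(['A', 'B', 'C'], [1.0, 2.5, 1.8])

add_annotate(ax, format = '.1f')          # writes each bar's height above it
add_text(ax, 'demo', loc = 'top right')   # caption inside the axes
plt.show()
```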
/toad/transform_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | import pyximport
6 |
7 | pyximport.install(setup_args={"include_dirs": np.get_include()})
8 |
9 | from .transform import WOETransformer, Combiner, GBDTTransformer
10 |
11 | np.random.seed(1)
12 |
13 | ab = np.array(list('ABCDEFG'))
14 | feature = np.random.randint(10, size = 500)
15 | target = np.random.randint(2, size = 500)
16 | str_feat = ab[np.random.choice(7, 500)]
17 | uni_feat = np.ones(500)
18 | empty_feat = feature.astype(float)
19 | empty_feat[np.random.choice(500, 50, replace = False)] = np.nan
20 |
21 | df = pd.DataFrame({
22 | 'A': feature,
23 | 'B': str_feat,
24 | 'C': uni_feat,
25 | 'D': empty_feat,
26 | 'target': target,
27 | })
28 |
29 |
30 |
31 | def test_duplicated_keys():
32 | dup_df = df.rename(columns = {"C": "A"})
33 | with pytest.raises(Exception, match=r"X has duplicate keys `.*`"):
34 | WOETransformer().fit_transform(dup_df, target)
35 |
36 | def test_woe_transformer():
37 | f = WOETransformer().fit_transform(feature, target)
38 | assert f[451] == pytest.approx(-0.17061154127869285)
39 |
40 | def test_woe_transformer_with_str():
41 | f = WOETransformer().fit_transform(str_feat, target)
42 | assert f[451] == pytest.approx(-0.2198594761130199)
43 |
44 | def test_woe_transformer_with_unknown_group():
45 | transer = WOETransformer().fit(str_feat, target)
46 | res = transer.transform(['Z'], default = 'min')
47 | assert res[0] == pytest.approx(-0.2198594761130199)
48 |
49 | def test_woe_transformer_frame():
50 | res = WOETransformer().fit_transform(df, target)
51 | assert res.iloc[451, 1] == pytest.approx(-0.2198594761130199)
52 |
53 | def test_woe_transformer_dict():
54 | transer = WOETransformer().fit(df, 'target')
55 | res = transer.transform({
56 | "A": 6,
57 | "B": "C",
58 | "C": 1,
59 | "D": 2,
60 | })
61 | assert res['B'].item() == pytest.approx(-0.09149433112609942)
62 |
63 | def test_woe_transformer_select_dtypes():
64 | res = WOETransformer().fit_transform(df, target, select_dtypes = 'object')
65 | assert res.loc[451, 'A'] == 3
66 |
67 | def test_woe_transformer_exclude():
68 | res = WOETransformer().fit_transform(df, target, exclude = 'A')
69 | assert res.loc[451, 'A'] == 3
70 |
71 | def test_woe_transformer_export_single():
72 | transer = WOETransformer().fit(feature, target)
73 | t = transer.export()
74 | assert t[transer._default_name][5] == pytest.approx(0.3938235330926786)
75 |
76 | def test_woe_transformer_export():
77 | transer = WOETransformer().fit(df, target)
78 | t = transer.export()
79 | assert t['C'][1] == 0
80 |
81 | def test_woe_transformer_load():
82 | rules = {
83 | 'A': {
84 | 1: 0.1,
85 | 2: 0.2,
86 | 3: 0.3,
87 | }
88 | }
89 |
90 | transer = WOETransformer().load(rules)
91 | assert transer._rules['A']['woe'][1] == 0.2
92 |
93 |
94 | def test_combiner():
95 | f = Combiner().fit_transform(feature, target, method = 'chi')
96 | assert f[451] == 3
97 |
98 | def test_combiner_with_str():
99 | f = Combiner().fit_transform(str_feat, target, method = 'chi')
100 | assert f[451] == 0
101 |
102 | def test_combiner_unique_feature():
103 | f = Combiner().fit_transform(uni_feat, target, method = 'chi')
104 | assert f[451] == 0
105 |
106 | def test_combiner_frame():
107 | res = Combiner().fit_transform(df, target)
108 | assert res.iloc[404, 1] == 2
109 |
110 | def test_combiner_select_dtypes():
111 | res = Combiner().fit_transform(df, target, select_dtypes = 'number')
112 | assert res.loc[451, 'B'] == 'G'
113 |
114 | def test_combiner_exclude():
115 | res = Combiner().fit_transform(df, target, exclude = 'B')
116 | assert res.loc[451, 'B'] == 'G'
117 |
118 | def test_combiner_labels():
119 | combiner = Combiner().fit(df, target)
120 | res = combiner.transform(df, labels = True)
121 | assert res.loc[451, 'A'] == '03.[3 ~ 4)'
122 |
123 | def test_combiner_single_feature():
124 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 5)
125 | res = combiner.transform(df['A'])
126 | assert res[451] == 1
127 |
128 | def test_combiner_export():
129 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4)
130 | bins = combiner.export()
131 | assert isinstance(bins['B'][0], list)
132 |
133 | def test_combiner_update():
134 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4)
135 | combiner.update({'A': [1,2,3,4,5,6]})
136 | bins = combiner.export()
137 | assert len(bins['A']) == 6
138 |
139 | def test_combiner_step():
140 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 4)
141 | bins = combiner.export()
142 | assert bins['A'][1] == 4.5
143 |
144 | def test_combiner_target_in_frame():
145 | combiner = Combiner().fit(df, 'target', n_bins = 4)
146 | bins = combiner.export()
147 | assert bins['A'][1] == 6
148 |
149 | def test_combiner_target_in_frame_kwargs():
150 | combiner = Combiner().fit(df, y = 'target', n_bins = 4)
151 | bins = combiner.export()
152 | assert bins['A'][1] == 6
153 |
154 | def test_combiner_empty_separate():
155 | combiner = Combiner()
156 | bins = combiner.fit_transform(df, 'target', n_bins = 4, empty_separate = True)
157 | mask = pd.isna(df['D'])
158 | assert (bins['D'][~mask] != 4).all()
159 |
160 | def test_combiner_labels_with_empty():
161 | combiner = Combiner().fit(df, 'target', n_bins = 4, empty_separate = True)
162 | res = combiner.transform(df, labels = True)
163 | assert res.loc[2, 'D'] == '04.nan'
164 |
165 | def test_gbdt_transformer():
166 | np.random.seed(1)
167 |
168 | df = pd.DataFrame({
169 | 'A': np.random.rand(500),
170 | 'B': np.random.randint(10, size = 500),
171 | })
172 | f = GBDTTransformer().fit_transform(df, target, n_estimators = 10, max_depth = 2)
173 | assert f.shape == (500, 40)
174 |
--------------------------------------------------------------------------------
/toad/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .func import *
2 | from .decorator import *
3 | from .progress import Progress
4 |
--------------------------------------------------------------------------------
/toad/utils/decorator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from time import time
4 | from .func import save_json, read_json
5 | from functools import wraps, WRAPPER_ASSIGNMENTS
6 |
7 |
8 |
9 | class Decorator:
10 | """base decorater class
11 | """
12 | _cls = None
13 | is_class = False
14 |
15 | def __init__(self, *args, is_class = False, **kwargs):
16 | self.is_class = is_class
17 | self.args = []
18 | self.kwargs = {}
19 |
20 | if len(args) == 1 and callable(args[0]):
21 | self.fn = args[0]
22 | else:
23 | self.setup(*args, **kwargs)
24 |
25 |
26 | @property
27 | def fn(self):
28 | if hasattr(self, '__wrapped__'):
29 | return self.__wrapped__
30 |
31 | return None
32 |
33 | @fn.setter
34 | def fn(self, func):
35 | if hasattr(self, 'setup_func'):
36 | func = self.setup_func(func)
37 |
38 | self.__wrapped__ = func
39 |
40 | def __call__(self, *args, **kwargs):
41 | if self.fn is None:
42 | self.fn = args[0]
43 | return self
44 |
45 | if self.is_class:
46 | self._cls = args[0]
47 | args = args[1:]
48 |
49 | return self.wrapper(*args, **kwargs)
50 |
51 |
52 | def __get__(self, instance, type = None):
53 | self.is_class = True
54 | self._cls = instance
55 |
56 | @wraps(self.__wrapped__)
57 | def func(*args, **kwargs):
58 | return self.__call__(instance, *args, **kwargs)
59 |
60 | return func
61 |
62 |
63 | def __getattribute__(self, name):
64 | if name in WRAPPER_ASSIGNMENTS:
65 | return getattr(self.__wrapped__, name)
66 |
67 | return object.__getattribute__(self, name)
68 |
69 |
70 | def setup(self, *args, **kwargs):
71 | self.args = args
72 | self.kwargs = kwargs
73 |
74 | for key in kwargs:
75 | setattr(self, key, kwargs[key])
76 |
77 |
78 | def call(self, *args, **kwargs):
79 | if self._cls is not None:
80 | args = (self._cls, *args)
81 |
82 | return self.fn(*args, **kwargs)
83 |
84 | def wrapper(self, *args, **kwargs):
85 | return self.call(*args, **kwargs)
86 |
87 |
88 | class frame_exclude(Decorator):
89 | """decorator for exclude columns
90 | """
91 |
92 | def wrapper(self, X, *args, exclude = None, **kwargs):
93 | if exclude is not None and isinstance(X, pd.DataFrame):
94 | X = X.drop(columns = exclude)
95 |
96 | return self.call(X, *args, **kwargs)
97 |
98 |
99 | class select_dtypes(Decorator):
100 | """ decorator for select frame by dtypes
101 | """
102 |
103 | def wrapper(self, X, *args, select_dtypes = None, **kwargs):
104 | if select_dtypes is not None and isinstance(X, pd.DataFrame):
105 | X = X.select_dtypes(include = select_dtypes)
106 |
107 | return self.call(X, *args, **kwargs)
108 |
109 |
110 | class save_to_json(Decorator):
111 | """support save result to json file
112 | """
113 | def wrapper(self, *args, to_json = None, **kwargs):
114 | res = self.call(*args, **kwargs)
115 |
116 | if to_json is not None:
117 | save_json(res, to_json)
118 |
119 | return res
120 |
121 |
122 | class load_from_json(Decorator):
123 | """support load data from json file
124 | """
125 | require_first = False
126 |
127 | def wrapper(self, *args, from_json = None, **kwargs):
128 | if from_json is not None:
129 | obj = read_json(from_json)
130 | args = (obj, *args)
131 |
132 | elif self.require_first and len(args) > 0 and isinstance(args[0], str):
133 | obj = read_json(args[0])
134 | args = (obj, *args[1:])
135 |
136 | return self.call(*args, **kwargs)
137 |
138 |
139 | class support_dataframe(Decorator):
140 | """decorator for supporting dataframe
141 | """
142 | require_target = True
143 | target = 'target'
144 |
145 | def wrapper(self, frame, *args, **kwargs):
146 | if not isinstance(frame, pd.DataFrame):
147 | return self.call(frame, *args, **kwargs)
148 |
149 | frame = frame.copy()
150 |         if self.require_target and len(args) > 0 and isinstance(args[0], str):
151 | target = frame.pop(args[0])
152 | args = (target,) + args[1:]
153 | elif self.target in kwargs and isinstance(kwargs[self.target], str):
154 | kwargs[self.target] = frame.pop(kwargs[self.target])
155 |
156 | res = dict()
157 | for col in frame:
158 | r = self.call(frame[col], *args, **kwargs)
159 |
160 | if not isinstance(r, np.ndarray):
161 | r = [r]
162 |
163 | res[col] = r
164 | return pd.DataFrame(res)
165 |
166 |
167 | class proxy_docstring(Decorator):
168 | method_name = None
169 |
170 | def __get__(self, *args):
171 | func = super().__get__(*args)
172 |
173 | if self.method_name is not None and hasattr(self._cls, self.method_name):
174 | setattr(func, '__doc__', getattr(self._cls, self.method_name).__doc__)
175 |
176 | return func
177 |
178 |
179 | class support_numpy(Decorator):
180 | """decorator for supporting numpy array to use torch function
181 | """
182 | def wrapper(self, *args, **kwargs):
183 | import torch
184 |
185 | has_numpy = False
186 | l_args = []
187 | for a in args:
188 | if not isinstance(a, torch.Tensor):
189 | a = torch.tensor(a)
190 | has_numpy = True
191 |
192 | l_args.append(a)
193 |
194 | res = self.call(*l_args, **kwargs)
195 |
196 |         # convert the result back to a numpy array only when the arguments included one
197 | if has_numpy and isinstance(res, torch.Tensor):
198 | res = res.numpy()
199 |
200 | return res
201 |
202 |
203 | class xgb_loss(Decorator):
204 | """decorator for converting function to xgb supported loss function
205 |
206 | Args:
207 | loss_func (callable): loss function
208 | **kwargs: other arguments for loss function except `pred` and `label`
209 |
210 | Examples:
211 |
212 | >>> @xgb_loss(**kwargs)
213 | >>> def loss_func(pred, label, **kwargs):
214 | >>> ...
215 | >>> return loss
216 | >>>
217 | >>> # or use `xgb_loss` directly
218 | >>> xgb_func = xgb_loss(**kwargs)(loss_func)
219 | >>>
220 | >>> # use in xgb
221 | >>> model = xgb.XGBClassifier(objective = xgb_func)
222 | """
223 | def wrapper(self, pred, label):
224 | from .func import derivative
225 |
226 | def partial_func(x):
227 | return self.call(x, label, **self.kwargs)
228 |
229 | grad = derivative(partial_func, pred, n=1, dx=1e-6)
230 | hess = derivative(partial_func, pred, n=2, dx=1e-6)
231 |
232 | return grad, hess
233 |
234 |
235 | class performance(Decorator):
236 | """decorator for analysis code performance
237 |
238 | Args:
239 | loop (int): loop times, default `1`
240 |
241 | Examples:
242 | >>> @performance(loop = 100)
243 | >>> def func():
244 | >>> ... # code
245 | >>> return res
246 | >>>
247 | >>> func()
248 | >>>
249 | >>> # or use `performance` in `with` statement
250 | >>> with performance():
251 | >>> ... # code
252 | """
253 | loop = 1
254 |
255 | def wrapper(self, *args, **kwargs):
256 | costs = []
257 | for _ in range(self.loop):
258 | start = time()
259 | res = self.call(*args, **kwargs)
260 | end = time()
261 | costs.append(end - start)
262 |
263 | self.analysis(costs)
264 | return res
265 |
266 |
267 | def analysis(self, costs):
268 | import numpy as np
269 |
270 | print('total cost: {:.5f}s'.format(np.sum(costs)))
271 | print("-"*40)
272 | data = {
273 | "Mean": np.mean(costs),
274 | "Min": np.min(costs),
275 | "Max": np.max(costs),
276 | "90%": np.percentile(costs, 90),
277 | "95%": np.percentile(costs, 95),
278 | "99%": np.percentile(costs, 99),
279 | }
280 | HEADER = "{:>8}"*len(data)
281 | BODY = "{:>7.3f}s"*len(data)
282 | print(HEADER.format(*data.keys()))
283 | print(BODY.format(*data.values()))
284 |
285 |
286 | def __enter__(self):
287 | self.start = time()
288 | return self
289 |
290 | def __exit__(self, exc_type, exc_value, traceback):
291 | self.end = time()
292 | self.analysis([self.end - self.start])
293 |
--------------------------------------------------------------------------------
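A minimal sketch of extending the `Decorator` base class above (illustrative, not part of the repo): keyword arguments passed to the constructor become attributes via `setup`, and overriding `wrapper` controls how the wrapped function is invoked. `scale_result` and `factor` are hypothetical names.

```python
class scale_result(Decorator):
    """hypothetical decorator that multiplies a numeric result by `factor`"""
    factor = 1

    def wrapper(self, *args, **kwargs):
        # `call` forwards to the wrapped function
        return self.call(*args, **kwargs) * self.factor

@scale_result(factor = 10)
def plus(a, b):
    return a + b

assert plus(1, 2) == 30
```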
/toad/utils/decorator_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import pandas as pd
4 |
5 | from .decorator import (
6 | Decorator,
7 | frame_exclude,
8 | xgb_loss,
9 | performance,
10 | )
11 |
12 | np.random.seed(1)
13 |
14 |
15 | def func():
16 | "This is a doc for method"
17 | pass
18 |
19 |
20 | def test_decorator_doc():
21 | f = frame_exclude(func)
22 |
23 | assert f.__doc__ == 'This is a doc for method'
24 |
25 |
26 | def test_decorator_init_func():
27 | class a(Decorator):
28 | def setup_func(self, func):
29 | return sum
30 |
31 | f = a(func)
32 |
33 | assert f([10, 20]) == 30
34 |
35 |
36 | def test_decorator_inherit():
37 | class a(Decorator):
38 | bias = 0
39 | def wrapper(self, *args, a = 0, **kwargs):
40 | return self.call(a + self.bias)
41 |
42 | class b(a):
43 | def wrapper(self, *args, b = 0, **kwargs):
44 | a = super().wrapper(*args, **kwargs)
45 | b = self.call(b)
46 | return a + b
47 |
48 | f = b(bias = 2)(lambda x: x+1)
49 | assert f(a = 1, b = 2) == 7
50 |
51 |
52 | def test_xgb_loss():
53 | def loss(x, y):
54 | return np.abs(x - y).sum()
55 |
56 | xgb_l = xgb_loss(loss)
57 | grad, hess = xgb_l(np.arange(3), np.arange(3, 6))
58 |
59 | assert grad == pytest.approx(-3.0)
60 | assert hess == pytest.approx(0.0)
61 |
62 |
63 | def test_performance():
64 | @performance(loop = 10)
65 | def func(x):
66 | from time import sleep
67 | sleep(0.01)
68 | return x**x
69 |
70 | assert func(2) == 4
71 |
72 |
73 | def test_performance_with_clause():
74 | def func(x):
75 | from time import sleep
76 | sleep(0.01)
77 | return x**x
78 |
79 | with performance():
80 | res = func(2)
81 |
82 | assert res == 4
83 |
--------------------------------------------------------------------------------
/toad/utils/func_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pytest
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from .func import (
7 | np_unique,
8 | fillna,
9 | clip,
10 | diff_time_frame,
11 | bin_to_number,
12 | generate_target,
13 | generate_str,
14 | get_dummies,
15 | feature_splits,
16 | )
17 |
18 | np.random.seed(1)
19 | feature = np.random.rand(500)
20 | target = np.random.randint(2, size = 500)
21 |
22 |
23 |
24 | def test_fillna():
25 | res = fillna(np.array([1, 2, 3, np.nan, 4, 5]))
26 | assert res[3] == -1
27 |
28 |
29 | def test_np_unique():
30 | res = np_unique(np.array([np.nan, np.nan, np.nan]))
31 | assert len(res) == 1
32 |
33 |
34 | def test_clip():
35 | res1 = clip(feature, quantile = (.05, .95))
36 | res2 = clip(feature, quantile = 0.05)
37 | assert np.testing.assert_array_equal(res1, res2) is None
38 |
39 |
40 | def test_feature_splits():
41 | value = feature_splits(feature, target)
42 | assert len(value) == 243
43 |
44 |
45 | @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
46 | def test_diff_time_frame():
47 | time_data = [
48 | {
49 | 'base': '2018-01',
50 | 'time1': '2018-04',
51 | 'time2': '2018-04-02',
52 | },
53 | {
54 | 'base': '2018-01',
55 | 'time1': '2018-05',
56 | 'time2': '2018-04-05',
57 | },
58 | {
59 | 'base': '2018-02',
60 | 'time1': '2018-04',
61 | 'time2': '2018-04-10',
62 | },
63 | ]
64 |
65 | frame = pd.DataFrame(time_data)
66 | res = diff_time_frame(frame['base'], frame[['time1', 'time2']])
67 | assert res.iloc[0, 1] == 91
68 |
69 |
70 | def test_bin_to_number():
71 | s = pd.Series([
72 | '1',
73 | '1-100',
74 | '-',
75 | '100-200',
76 | np.nan,
77 | '200-300',
78 | '300',
79 | '100-200',
80 | '>500',
81 | ])
82 |
83 | res = s.apply(bin_to_number())
84 | assert res[3] == 150
85 |
86 | def test_bin_to_number_for_frame():
87 | df = pd.DataFrame([
88 | {
89 | 'area_1': '100-200',
90 | 'area_2': '150~200',
91 | },
92 | {
93 | 'area_1': '300-400',
94 | 'area_2': '200~250',
95 | },
96 | {
97 | 'area_1': '200-300',
98 | 'area_2': '450~500',
99 | },
100 | {
101 | 'area_1': '100-200',
102 | 'area_2': '250~300',
103 | },
104 | ])
105 |
106 | res = df.applymap(bin_to_number())
107 | assert res.loc[1, 'area_2'] == 225
108 |
109 | def test_generate_target():
110 | t = generate_target(len(feature), rate = 0.3, weight = feature)
111 | rate = t.sum() / len(t)
112 | assert rate == 0.3
113 |
114 | @pytest.fixture
115 | def test_generate_str():
116 | s = generate_str(size = 8)
117 | assert s == 'EPL5MTQK'
118 |
119 | def test_get_dummies_binary():
120 | ab = np.array(list('ABCDEFG'))
121 | df = pd.DataFrame({
122 | 'binary': ab[np.random.choice(2, 500)],
123 | 'multiple': ab[np.random.choice(5, 500)],
124 | })
125 | data = get_dummies(df, binary_drop = True)
126 |
127 | assert 'binary_A' not in data.columns
128 |
--------------------------------------------------------------------------------
/toad/utils/mixin.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 | from copy import deepcopy
4 | from .decorator import save_to_json, load_from_json
5 |
6 |
7 | DEFAULT_NAME = '_feature_default_name_'
8 |
9 |
10 | class RulesMixin:
11 | _rules = {}
12 |
13 | def _parse_rule(self, rule):
14 | return rule
15 |
16 | def _format_rule(self, rule):
17 | return rule
18 |
19 | def default_rule(self):
20 | if len(self._rules) == 1:
21 | # return the only rule as default
22 | return next(iter(self._rules.values()))
23 |
24 | if self._default_name not in self._rules:
25 |             raise Exception('cannot get default rule')
26 |
27 | return self._rules[self._default_name]
28 |
29 | @property
30 | def _default_name(self):
31 | return DEFAULT_NAME
32 |
33 | @property
34 | def rules(self):
35 | return self._rules
36 |
37 | @rules.setter
38 | def rules(self, value):
39 | self._rules = value
40 |
41 |
42 | @load_from_json(is_class = True, require_first = True)
43 | def load(self, rules, update = False, **kwargs):
44 | """load rules from dict or json file
45 |
46 | Args:
47 | rules (dict): dictionary of rules
48 | from_json (str|IOBase): json file of rules
49 |             update (bool): whether to update existing rules instead of replacing them
50 | """
51 | rules = deepcopy(rules)
52 |
53 | if not isinstance(rules, dict):
54 | rules = {
55 | self._default_name: rules,
56 | }
57 |
58 | for key in rules:
59 | rules[key] = self._parse_rule(rules[key], **kwargs)
60 |
61 | if update:
62 | self._rules.update(rules)
63 | else:
64 | self._rules = rules
65 |
66 | if hasattr(self, 'after_load'):
67 | self.after_load(rules)
68 |
69 | return self
70 |
71 | @save_to_json(is_class = True)
72 | def export(self, **kwargs):
73 | """export rules to dict or a json file
74 |
75 | Args:
76 | to_json (str|IOBase): json file to save rules
77 |
78 | Returns:
79 | dict: dictionary of rules
80 | """
81 | res = {}
82 | for key in self._rules:
83 | res[key] = self._format_rule(self._rules[key], **kwargs)
84 |
85 | if hasattr(self, 'after_export'):
86 | res = self.after_export(res, **kwargs)
87 |
88 | return res
89 |
90 | def update(self, *args, **kwargs):
91 | """update rules
92 |
93 | Args:
94 | rules (dict): dictionary of rules
95 | from_json (str|IOBase): json file of rules
96 | """
97 | return self.load(*args, update = True, **kwargs)
98 |
99 |
100 | def __len__(self):
101 | return len(self._rules.keys())
102 |
103 | def __contains__(self, key):
104 | return key in self._rules
105 |
106 | def __getitem__(self, key):
107 | return self._rules[key]
108 |
109 | def __setitem__(self, key, value):
110 | self._rules[key] = value
111 |
112 | def __iter__(self):
113 | return iter(self._rules)
114 |
115 |
116 |
117 |
118 | RE_NUM = r'-?\d+(\.\d+)?'
119 | RE_SEP = r'[~-]'
120 | RE_BEGIN = r'(-inf|{num})'.format(num = RE_NUM)
121 | RE_END = r'(inf|{num})'.format(num = RE_NUM)
122 | RE_RANGE = r'\[{begin}\s*{sep}\s*{end}\)'.format(
123 | begin = RE_BEGIN,
124 | end = RE_END,
125 | sep = RE_SEP,
126 | )
127 |
128 |
129 |
130 |
131 |
132 | class BinsMixin:
133 | EMPTY_BIN = -1
134 | ELSE_GROUP = 'else'
135 | NUMBER_EXP = re.compile(RE_RANGE)
136 |
137 | @classmethod
138 |     def parse_bins(cls, bins):
139 |         """parse labeled bins to array
140 |         """
141 |         if cls._is_numeric(bins):
142 |             return cls._numeric_parser(bins)
143 |
144 | l = list()
145 |
146 | for item in bins:
147 |             if item == cls.ELSE_GROUP:
148 | l.append(item)
149 | else:
150 | l.append(item.split(','))
151 |
152 | return np.array(l, dtype = object)
153 |
154 |
155 | @classmethod
156 |     def format_bins(cls, bins, index = False, ellipsis = None):
157 | """format bins to label
158 |
159 | Args:
160 | bins (ndarray): bins to format
161 | index (bool): if need index prefix
162 |             ellipsis (int): max label length before truncation with '..', `None` to disable truncation
163 |
164 | Returns:
165 | ndarray: array of labels
166 | """
167 | l = list()
168 |
169 | if np.issubdtype(bins.dtype, np.number):
170 | has_empty = len(bins) > 0 and np.isnan(bins[-1])
171 |
172 | if has_empty:
173 | bins = bins[:-1]
174 |
175 | sp_l = [-np.inf] + bins.tolist() + [np.inf]
176 | for i in range(len(sp_l) - 1):
177 | l.append('['+str(sp_l[i])+' ~ '+str(sp_l[i+1])+')')
178 |
179 | if has_empty:
180 | l.append('nan')
181 | else:
182 | for keys in bins:
183 |             if isinstance(keys, str) and keys == cls.ELSE_GROUP:
184 | l.append(keys)
185 | else:
186 | label = ','.join(keys)
187 | if ellipsis is not None:
188 | label = label[:ellipsis] + '..' if len(label) > ellipsis else label
189 | l.append(label)
190 |
191 | if index:
192 | l = ["{:02}.{}".format(ix, lab) for ix, lab in enumerate(l)]
193 |
194 | return np.array(l)
195 |
196 |
197 | @classmethod
198 |     def _is_numeric(cls, bins):
199 |         m = cls.NUMBER_EXP.match(bins[0])
200 |
201 | return m is not None
202 |
203 | @classmethod
204 |     def _numeric_parser(cls, bins):
205 | l = list()
206 |
207 | for item in bins:
208 |
209 | if item == 'nan':
210 | l.append(np.nan)
211 | continue
212 |
213 |         m = cls.NUMBER_EXP.match(item)
214 | split = m.group(3)
215 |
216 | if split == 'inf':
217 | # split = np.inf
218 | continue
219 |
220 | split = float(split)
221 |
222 | l.append(split)
223 |
224 | return np.array(l)
225 |
--------------------------------------------------------------------------------
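A short sketch of the `BinsMixin` label helpers above (illustrative): `format_bins` renders numeric split points as interval labels, and `parse_bins` recovers the split points from those labels.

```python
import numpy as np

labels = BinsMixin.format_bins(np.array([2, 4, 6]))
# -> ['[-inf ~ 2)', '[2 ~ 4)', '[4 ~ 6)', '[6 ~ inf)']

splits = BinsMixin.parse_bins(labels)
# -> array([2., 4., 6.])  (the trailing 'inf' boundary is dropped)
```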
/toad/utils/mixin_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from .mixin import RulesMixin, BinsMixin
4 |
5 | np.random.seed(1)
6 |
7 | class RulesObject(RulesMixin):
8 | def _parse_rule(self, rule):
9 | return {
10 | 'rule': rule
11 | }
12 |
13 |
14 | def _format_rule(self, rule):
15 | return 'rule -> %s' % rule['rule']
16 |
17 |
18 | rules = {'A': 'rule_A'}
19 |
20 | def test_rule_parse():
21 | r = RulesObject().load(rules)
22 | assert r.rules['A']['rule'] == 'rule_A'
23 |
24 | def test_rule_format():
25 | r = RulesObject().load(rules)
26 | assert r.export()['A'] == 'rule -> rule_A'
27 |
28 | def test_save_update():
29 | r = RulesObject().load(rules)
30 | r.update({'A': 'update_A'})
31 | assert r.rules['A']['rule'] == 'update_A'
32 |
33 | def test_format_bins():
34 | obj = BinsMixin()
35 | formated = obj.format_bins(np.array([2,4,6]))
36 | expect = ['[-inf ~ 2)', '[2 ~ 4)', '[4 ~ 6)', '[6 ~ inf)']
37 | assert all([a == b for a, b in zip(formated, expect)])
38 |
39 | def test_format_bins_with_index():
40 | obj = BinsMixin()
41 | formated = obj.format_bins(np.array([2,4,6]), index = True)
42 | assert '01.[2 ~ 4)' in formated
43 |
44 | def test_format_bins_with_ellipsis():
45 | obj = BinsMixin()
46 | formated = obj.format_bins(np.array([['A', 'B', 'C'], ['D', 'E']], dtype = object), ellipsis = 3)
47 | assert formated[0] == 'A,B..' and formated[1] == 'D,E'
48 |
--------------------------------------------------------------------------------
/toad/utils/pickletracer.py:
--------------------------------------------------------------------------------
1 | import cloudpickle
2 | from pickle import Unpickler
3 | from cloudpickle import CloudPickler
4 |
5 | _global_tracer = None
6 |
7 | def get_current_tracer():
8 | global _global_tracer
9 | # if _global_tracer is None:
10 | # raise ValueError("tracer is not initialized")
11 | return _global_tracer
12 |
13 |
14 | class Unpickler(Unpickler):
15 | """trace object dependences during unpickle"""
16 | def find_class(self, module, name):
17 | tracer = get_current_tracer()
18 | tracer.add(module)
19 | return super().find_class(module, name)
20 |
21 |
22 | class Pickler(CloudPickler):
23 | """trace object dependences during pickle"""
24 | def __init__(self, *args, **kwargs):
25 | super().__init__(*args, **kwargs)
26 |
27 | import types
28 | self._reduce_module = CloudPickler.dispatch_table[types.ModuleType]
29 | self.dispatch_table[types.ModuleType] = self.reduce_module
30 |
31 |
32 | def reduce_module(self, obj):
33 | tracer = get_current_tracer()
34 | tracer.add(obj.__name__)
35 | return self._reduce_module(obj)
36 |
37 |
38 | def __setattr__(self, name, value):
39 | if name == 'persistent_id':
40 | # fix torch module
41 | def wrapper_func(obj):
42 | from torch.nn import Module
43 | if isinstance(obj, Module):
44 | return None
45 |
46 | return value(obj)
47 |
48 | return super().__setattr__(name, wrapper_func)
49 |
50 | return super().__setattr__(name, value)
51 |
52 |
53 | class Tracer:
54 | def __init__(self):
55 | import re
56 |
57 | self._modules = set()
58 | self._ignore_modules = {"builtins"}
59 | self._temp_dispatch_table = {}
60 |
61 | # match python site packages path
62 | self._regex = re.compile(r".*python[\d\.]+\/site-packages/[\w-]+")
63 |
64 | def add(self, module):
65 | root = module.split(".")[0]
66 |
67 | if root in self._ignore_modules:
68 | return
69 |
70 | self._modules.add(root)
71 |
72 | def trace(self, obj):
73 | """trace `obj` by picke and unpicke
74 | """
75 | import io
76 | dummy = io.BytesIO()
77 |
78 | with self:
79 | Pickler(dummy).dump(obj)
80 | dummy.seek(0)
81 | Unpickler(dummy).load()
82 |
83 | return self.get_deps()
84 |
85 |
86 | def get_deps(self):
87 | import sys
88 |
89 | deps = {
90 | "pip": [],
91 | "files": [],
92 | }
93 |
94 | for name in self._modules:
95 | if name not in sys.modules:
96 | # TODO: should raise error
97 | continue
98 |
99 | module = sys.modules[name]
100 | # package module
101 | if self._regex.match(module.__spec__.origin):
102 |                 # TODO: split pip and conda pkg
103 | deps["pip"].append(module)
104 | continue
105 |
106 | # local file module
107 | deps["files"].append(module)
108 |
109 | return deps
110 |
111 |
112 | def __enter__(self):
113 | global _global_tracer
114 | if _global_tracer is not None:
115 |             raise ValueError("a tracer already exists")
116 |
117 |         # save the CloudPickler global dispatch table
118 | self._temp_dispatch_table = CloudPickler.dispatch_table.copy()
119 | # setup the global tracer
120 | _global_tracer = self
121 | return self
122 |
123 | def __exit__(self, exc_type, exc_val, exc_tb):
124 | global _global_tracer
125 |
126 |         # restore CloudPickler's dispatch table
127 | CloudPickler.dispatch_table = self._temp_dispatch_table
128 | # clean the global tracer
129 | _global_tracer = None
130 |
131 |
132 |
133 |
134 | def dump(obj, file, *args, **kwargs):
135 | return Pickler(file).dump(obj)
136 |
137 |
138 | def load(file, *args, **kwargs):
139 | return Unpickler(file).load()
140 |
141 |
--------------------------------------------------------------------------------
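A minimal sketch of tracing dependencies with the `Tracer` above (illustrative): `trace` pickles and unpickles the object through the custom `Pickler`/`Unpickler`, recording each module it touches. The exact module list depends on the environment.

```python
import numpy as np

def transform(x):
    return np.log1p(x)

deps = Tracer().trace(transform)
print(sorted(m.__name__ for m in deps['pip']))   # e.g. ['cloudpickle', 'numpy']
```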
/toad/utils/pickletracer_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pytest
3 | from .pickletracer import Tracer, get_current_tracer
4 |
5 |
6 | def test_tracer_with_clause():
7 | assert get_current_tracer() is None
8 | with Tracer() as t:
9 | assert get_current_tracer() == t
10 |
11 | assert get_current_tracer() is None
12 |
13 |
14 | @pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows")
15 | def test_trace_pyfunc():
16 | import pandas as pd
17 | import numpy as np
18 | from sklearn.linear_model import LinearRegression
19 | X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
20 | # y = 1 * x_0 + 2 * x_1 + 3
21 | y = np.dot(X, np.array([1, 2])) + 3
22 | reg = LinearRegression().fit(X, y)
23 | reg.score(X, y)
24 |
25 | def func(data):
26 | # data = dfunc(data)
27 | df = pd.DataFrame(data)
28 | return df
29 |
30 | class Model:
31 | def __init__(self, model, pref):
32 | self.model = model
33 | self.pref = pref
34 |
35 | def predict(self, data):
36 | data = self.pref(data)
37 | return self.model.predict(data)
38 |
39 |
40 | m = Model(reg, func)
41 |
42 | deps = Tracer().trace(m)
43 |
44 | assert set([m.__name__ for m in deps['pip']]) == set(['numpy', 'pandas', 'cloudpickle', 'sklearn'])
45 |
46 |
47 | def test_default_cloudpickle():
48 | import pandas as pd
49 |
50 | def func(data):
51 | # data = dfunc(data)
52 | df = pd.DataFrame(data)
53 | return df
54 |
55 | deps = Tracer().trace(func)
56 |
57 | import io
58 | import cloudpickle
59 |
60 | dummy = io.BytesIO()
61 | # this should be correct after trace object
62 | # test for restore cloudpickle global dispatch table
63 | cloudpickle.dump(func, dummy)
64 |
--------------------------------------------------------------------------------
/toad/utils/progress/__init__.py:
--------------------------------------------------------------------------------
1 | from .progress import Progress
2 |
--------------------------------------------------------------------------------
/toad/utils/progress/pandas.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from .progress import Progress
4 |
5 |
6 | class ProgressAccessor:
7 | def __init__(self, obj):
8 | self.obj = obj
9 |
10 | def apply(self, func, *args, **kwargs):
11 | if isinstance(self.obj, pd.Series):
12 | l = len(self.obj)
13 | else:
14 | # dataframe
15 | axis = kwargs.get("axis", 0)
16 | if axis == 'index':
17 | axis = 0
18 | elif axis == 'columns':
19 | axis = 1
20 |
21 | l = self.obj.size // self.obj.shape[axis]
22 |
23 | p = iter(Progress(range(l)))
24 |
25 | def wrapper(*args, **kwargs):
26 | next(p)
27 | return func(*args, **kwargs)
28 |
29 | res = self.obj.apply(wrapper, *args, **kwargs)
30 | p.end()
31 | return res
32 |
33 |
34 | class pandas_enable:
35 | def __init__(self):
36 | pd.api.extensions.register_dataframe_accessor("progress")(ProgressAccessor)
37 | pd.api.extensions.register_series_accessor("progress")(ProgressAccessor)
38 |
39 | def __enter__(self):
40 | return self
41 |
42 | def __exit__(self, exce_type, exce_value, exce_trace):
43 | pandas_disable()
44 |
45 |
46 | def pandas_disable():
47 | if hasattr(pd.DataFrame, 'progress'):
48 | delattr(pd.DataFrame, 'progress')
49 |
50 | if hasattr(pd.Series, 'progress'):
51 | delattr(pd.Series, 'progress')
52 |
--------------------------------------------------------------------------------
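A usage sketch for the accessor above (illustrative): inside a `pandas_enable()` block, `.progress.apply` behaves like the regular `apply` while rendering a progress bar, as the tests below also demonstrate.

```python
import pandas as pd

df = pd.DataFrame({'x': range(1000)})

with pandas_enable():
    res = df.progress.apply(lambda row: row['x'] + 1, axis = 1)
```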
/toad/utils/progress/pandas_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from .pandas import pandas_enable, pandas_disable
4 |
5 |
6 |
7 | def test_pandas_with():
8 | assert hasattr(pd.DataFrame, 'progress') == False
9 | assert hasattr(pd.Series, 'progress') == False
10 | with pandas_enable():
11 | assert hasattr(pd.DataFrame, 'progress') == True
12 | assert hasattr(pd.Series, 'progress') == True
13 | assert hasattr(pd.DataFrame, 'progress') == False
14 | assert hasattr(pd.Series, 'progress') == False
15 |
16 | def test_pandas_disable():
17 | assert hasattr(pd.DataFrame, 'progress') == False
18 | assert hasattr(pd.Series, 'progress') == False
19 | pandas_enable()
20 | assert hasattr(pd.DataFrame, 'progress') == True
21 | assert hasattr(pd.Series, 'progress') == True
22 | pandas_disable()
23 | assert hasattr(pd.DataFrame, 'progress') == False
24 | assert hasattr(pd.Series, 'progress') == False
25 |
26 | def test_dataframe_apply():
27 | df = pd.DataFrame({
28 | "A": np.random.rand(1000),
29 | "B": np.random.randint(10, size = (1000,))
30 | })
31 |
32 | with pandas_enable():
33 | res = df.progress.apply(lambda x: x + 1)
34 |
35 | def test_dataframe_apply_axis():
36 | df = pd.DataFrame({
37 | "A": np.random.rand(1000),
38 | "B": np.random.randint(10, size = (1000,))
39 | })
40 |
41 | with pandas_enable():
42 | res = df.progress.apply(lambda x: x + 1, axis = 1)
43 |
44 |
45 | def test_series_apply():
46 | series = pd.Series(np.random.rand(2000))
47 |
48 | with pandas_enable():
49 | res = series.progress.apply(lambda x: x + 1)
50 |
51 |
--------------------------------------------------------------------------------
/toad/utils/progress/progress.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from time import time
3 |
4 | class Progress:
5 | """
6 | """
7 | def __init__(self, iterable, size = None, interval = 0.1):
8 | """
9 | Args:
10 |             iterable (iterable): iterable object to wrap
11 |             size (int): max size of iterable; inferred when the iterable has `__len__`
12 |             interval (float): minimum seconds between bar refreshes, default is `0.1`
13 |
14 |         Attrs:
15 |             BAR_LENGTH (int): bar length, default is `32`
16 |             SYMBOL_DONE (str): symbol indicating completion
17 | SYMBOL_REST (str): symbol indicating remaining
18 | prefix (str): string template before progress bar
19 | suffix (str): string template after progress bar
20 | template (str): string template for rendering, `{prefix} {bar} {suffix}`
21 | """
22 | self.iterable = iterable
23 | self.interval = interval
24 |
25 | self.batch = 1
26 | self.size = size
27 | if hasattr(iterable, '__len__'):
28 | self.size = len(iterable)
29 |
30 | # is pytorch dataloader
31 | if hasattr(iterable, 'batch_size'):
32 | self.batch = getattr(iterable, 'batch_size')
33 | self.size = len(iterable.dataset)
34 |
35 |
36 | self.reset()
37 |
38 |
39 | self.BAR_LENGTH = 32
40 |
41 | self.SYMBOL_DONE = '█'
42 | self.SYMBOL_REST = '.'
43 | self.prefix = ""
44 | self.suffix = ""
45 |
46 | if self.size is None:
47 | self.template = "{prefix} {done} iters {time:.2f}s {tps}it/s {suffix}"
48 | else:
49 | self.template = "{prefix} {percent:3.0f}%|{bar}| [{done}/{size}] {time:.2f}s {suffix}"
50 |
51 |
52 | def __len__(self):
53 | return self.size
54 |
55 |
56 | def __iter__(self):
57 | self.reset()
58 | self.iterator = iter(self.iterable)
59 | return self
60 |
61 |
62 | def __next__(self):
63 | try:
64 | return self.next()
65 | except StopIteration as e:
66 | self.end()
67 | raise e
68 |
69 |
70 | def reset(self):
71 | # reset index
72 | self.idx = 0
73 |
74 | # reset time
75 | self.time = None
76 | self.start_time = time()
77 | self._last_time = self.start_time
78 | self.iterator = iter(self.iterable)
79 |
80 |
81 | def next(self):
82 | item = next(self.iterator)
83 | self.update()
84 | return item
85 |
86 |
87 | def update(self, idx = None, force = False):
88 | # update idx
89 | if idx is None:
90 | idx = self.idx + 1
91 |
92 | self.idx = idx
93 |
94 | curr_time = time()
95 | self.time = curr_time - self.start_time
96 |
97 | # skip update if delta is too small
98 | if not force and curr_time - self._last_time < self.interval:
99 | return
100 |
101 | self._last_time = curr_time
102 |
103 | # update bar
104 | self.flush()
105 |
106 |
107 | def end(self):
108 | """progress end
109 | """
110 | self.update(idx = self.idx, force = True)
111 | self.print('\n')
112 |
113 |
114 | def flush(self):
115 | if self.size is None:
116 | done = self.idx * self.batch
117 | percent = 0
118 | bar = None
119 | else:
120 | done = min(self.idx * self.batch, self.size)
121 | percent = done / self.size
122 |
123 | bar = (self.SYMBOL_DONE * int(percent * self.BAR_LENGTH)).ljust(self.BAR_LENGTH, self.SYMBOL_REST)
124 |
125 | self.print('\r' + self.template.format(
126 | percent = percent * 100,
127 | bar = bar,
128 | done = done,
129 | size = self.size,
130 | time = self.time,
131 | tps = done / max(self.time, 1),
132 | prefix = self.prefix,
133 | suffix = self.suffix,
134 | ))
135 |
136 |
137 | def print(self, text):
138 | sys.stdout.write(text)
139 | sys.stdout.flush()
140 |
141 |
142 |
143 |
144 |
145 |
146 |
--------------------------------------------------------------------------------
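A minimal usage sketch for `Progress` (illustrative): wrap any iterable and loop as usual; `prefix` and `suffix` feed the render template.

```python
from time import sleep

p = Progress(range(50))
p.prefix = 'loading'

for _ in p:
    sleep(0.01)   # the work being tracked
```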
/toad/utils/progress/progress_test.py:
--------------------------------------------------------------------------------
1 | from time import sleep, time
2 | from .progress import Progress
3 |
4 |
5 | class TestIterator:
6 | def __init__(self, size):
7 | self._size = size
8 |
9 | def __iter__(self):
10 | for i in range(self._size):
11 | yield i
12 |
13 |
14 | def test_progress():
15 | p = Progress(range(100))
16 | for i in p:
17 | sleep(0.01)
18 | assert p.idx == 100
19 |
20 | def test_progress_size():
21 | p = Progress(range(9527))
22 | assert p.size == 9527
23 |
24 | def test_iterator():
25 | ti = TestIterator(100)
26 | p = Progress(ti)
27 | for i in p:
28 | sleep(0.01)
29 | assert p.idx == 100
30 |
31 |
32 | def test_multi_loop():
33 | p = Progress(range(100))
34 | for i in p:
35 | sleep(0.01)
36 | assert p.idx == 100
37 |
38 | for i in p:
39 | sleep(0.01)
40 | assert p.idx == 100
41 |
42 | def test_speed():
43 | p = Progress(range(1000))
44 | for i in p:
45 | sleep(0.001)
46 | assert p.idx == 1000
47 |
--------------------------------------------------------------------------------
/toad/version.py:
--------------------------------------------------------------------------------
1 | __version_info__ = (0, 1, 5, 'final', 0)
2 |
3 | def get_version(version):
4 | main = '.'.join(str(x) for x in version[:3])
5 |
6 | if version[3] == 'final':
7 | return main
8 |
9 | symbol = {
10 | 'alpha': 'a',
11 | 'beta': 'b',
12 | 'rc': 'rc',
13 | }
14 |
15 | return main + symbol[version[3]] + str(version[4])
16 |
17 | __version__ = get_version(__version_info__)
18 |
--------------------------------------------------------------------------------
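A quick sketch of how `get_version` renders version tuples (illustrative): `final` releases drop the suffix, while pre-releases are mapped through the symbol table.

```python
assert get_version((0, 1, 5, 'final', 0)) == '0.1.5'
assert get_version((0, 2, 0, 'beta', 1)) == '0.2.0b1'
assert get_version((1, 0, 0, 'rc', 2)) == '1.0.0rc2'
```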